diff --git "a/marques/outputs/checkpoint-1000/trainer_state.json" "b/marques/outputs/checkpoint-1000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/marques/outputs/checkpoint-1000/trainer_state.json"
@@ -0,0 +1,7034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.000351009252516144,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 3.5100925251614403e-07,
+      "grad_norm": 0.53782719373703,
+      "learning_rate": 0.0,
+      "loss": 0.5835,
+      "step": 1
+    },
+    {
+      "epoch": 7.020185050322881e-07,
+      "grad_norm": 0.6201626062393188,
+      "learning_rate": 4e-05,
+      "loss": 0.5242,
+      "step": 2
+    },
+    {
+      "epoch": 1.053027757548432e-06,
+      "grad_norm": 0.7571901082992554,
+      "learning_rate": 8e-05,
+      "loss": 0.5642,
+      "step": 3
+    },
+    {
+      "epoch": 1.4040370100645761e-06,
+      "grad_norm": 0.5588695406913757,
+      "learning_rate": 0.00012,
+      "loss": 0.4859,
+      "step": 4
+    },
+    {
+      "epoch": 1.75504626258072e-06,
+      "grad_norm": 0.7208331227302551,
+      "learning_rate": 0.00016,
+      "loss": 0.4645,
+      "step": 5
+    },
+    {
+      "epoch": 2.106055515096864e-06,
+      "grad_norm": 0.8169743418693542,
+      "learning_rate": 0.0002,
+      "loss": 0.3702,
+      "step": 6
+    },
+    {
+      "epoch": 2.4570647676130083e-06,
+      "grad_norm": 2.051530599594116,
+      "learning_rate": 0.00019993322203672788,
+      "loss": 0.4856,
+      "step": 7
+    },
+    {
+      "epoch": 2.8080740201291522e-06,
+      "grad_norm": 1.2310550212860107,
+      "learning_rate": 0.00019986644407345576,
+      "loss": 0.5192,
+      "step": 8
+    },
+    {
+      "epoch": 3.1590832726452962e-06,
+      "grad_norm": 1.612046241760254,
+      "learning_rate": 0.00019979966611018366,
+      "loss": 0.4719,
+      "step": 9
+    },
+    {
+      "epoch": 3.51009252516144e-06,
+      "grad_norm": 1.4484680891036987,
+      "learning_rate": 0.00019973288814691153,
+      "loss": 0.4416,
+      "step": 10
+    },
+    {
+      "epoch": 3.861101777677584e-06,
+      "grad_norm": 1.4529719352722168,
+      "learning_rate": 0.0001996661101836394,
+      "loss": 0.6275,
+      "step": 11
+    },
+    {
+      "epoch": 4.212111030193728e-06,
+      "grad_norm": 1.3963671922683716,
+      "learning_rate": 0.00019959933222036728,
+      "loss": 0.5874,
+      "step": 12
+    },
+    {
+      "epoch": 4.563120282709872e-06,
+      "grad_norm": 1.4744153022766113,
+      "learning_rate": 0.00019953255425709515,
+      "loss": 0.6422,
+      "step": 13
+    },
+    {
+      "epoch": 4.9141295352260165e-06,
+      "grad_norm": 0.8640050888061523,
+      "learning_rate": 0.00019946577629382305,
+      "loss": 0.5064,
+      "step": 14
+    },
+    {
+      "epoch": 5.26513878774216e-06,
+      "grad_norm": 0.7137419581413269,
+      "learning_rate": 0.00019939899833055092,
+      "loss": 0.5218,
+      "step": 15
+    },
+    {
+      "epoch": 5.6161480402583045e-06,
+      "grad_norm": 0.7769026756286621,
+      "learning_rate": 0.00019933222036727882,
+      "loss": 0.5377,
+      "step": 16
+    },
+    {
+      "epoch": 5.967157292774448e-06,
+      "grad_norm": 0.7558479905128479,
+      "learning_rate": 0.0001992654424040067,
+      "loss": 0.5054,
+      "step": 17
+    },
+    {
+      "epoch": 6.3181665452905924e-06,
+      "grad_norm": 0.8237054347991943,
+      "learning_rate": 0.00019919866444073457,
+      "loss": 0.5094,
+      "step": 18
+    },
+    {
+      "epoch": 6.669175797806736e-06,
+      "grad_norm": 1.0375059843063354,
+      "learning_rate": 0.00019913188647746244,
+      "loss": 0.5751,
+      "step": 19
+    },
+    {
+      "epoch": 7.02018505032288e-06,
+      "grad_norm": 1.075869083404541,
+      "learning_rate": 0.00019906510851419034,
+      "loss": 0.594,
+      "step": 20
+    },
+    {
+      "epoch": 7.371194302839024e-06,
+      "grad_norm": 0.8041358590126038,
+      "learning_rate": 0.00019899833055091822,
+      "loss": 0.553,
+      "step": 21
+    },
+    {
+      "epoch": 7.722203555355168e-06,
+      "grad_norm": 0.9264736771583557,
+      "learning_rate": 0.0001989315525876461,
+      "loss": 0.5555,
+      "step": 22
+    },
+    {
+      "epoch": 8.073212807871313e-06,
+      "grad_norm": 1.0074031352996826,
+      "learning_rate": 0.00019886477462437396,
+      "loss": 0.5353,
+      "step": 23
+    },
+    {
+      "epoch": 8.424222060387455e-06,
+      "grad_norm": 0.8725020885467529,
+      "learning_rate": 0.00019879799666110183,
+      "loss": 0.5557,
+      "step": 24
+    },
+    {
+      "epoch": 8.7752313129036e-06,
+      "grad_norm": 0.8867582678794861,
+      "learning_rate": 0.00019873121869782974,
+      "loss": 0.5992,
+      "step": 25
+    },
+    {
+      "epoch": 9.126240565419744e-06,
+      "grad_norm": 0.9235608577728271,
+      "learning_rate": 0.0001986644407345576,
+      "loss": 0.516,
+      "step": 26
+    },
+    {
+      "epoch": 9.477249817935889e-06,
+      "grad_norm": 0.8653218150138855,
+      "learning_rate": 0.00019859766277128548,
+      "loss": 0.5249,
+      "step": 27
+    },
+    {
+      "epoch": 9.828259070452033e-06,
+      "grad_norm": 0.7479026913642883,
+      "learning_rate": 0.00019853088480801335,
+      "loss": 0.5037,
+      "step": 28
+    },
+    {
+      "epoch": 1.0179268322968176e-05,
+      "grad_norm": 0.9531452655792236,
+      "learning_rate": 0.00019846410684474123,
+      "loss": 0.5896,
+      "step": 29
+    },
+    {
+      "epoch": 1.053027757548432e-05,
+      "grad_norm": 1.1012492179870605,
+      "learning_rate": 0.00019839732888146913,
+      "loss": 0.5139,
+      "step": 30
+    },
+    {
+      "epoch": 1.0881286828000465e-05,
+      "grad_norm": 1.0198887586593628,
+      "learning_rate": 0.000198330550918197,
+      "loss": 0.5587,
+      "step": 31
+    },
+    {
+      "epoch": 1.1232296080516609e-05,
+      "grad_norm": 0.8081266283988953,
+      "learning_rate": 0.00019826377295492487,
+      "loss": 0.4762,
+      "step": 32
+    },
+    {
+      "epoch": 1.1583305333032752e-05,
+      "grad_norm": 1.1965891122817993,
+      "learning_rate": 0.00019819699499165277,
+      "loss": 0.5719,
+      "step": 33
+    },
+    {
+      "epoch": 1.1934314585548896e-05,
+      "grad_norm": 1.214903473854065,
+      "learning_rate": 0.00019813021702838065,
+      "loss": 0.5756,
+      "step": 34
+    },
+    {
+      "epoch": 1.228532383806504e-05,
+      "grad_norm": 0.8360006213188171,
+      "learning_rate": 0.00019806343906510852,
+      "loss": 0.5688,
+      "step": 35
+    },
+    {
+      "epoch": 1.2636333090581185e-05,
+      "grad_norm": 0.8328489065170288,
+      "learning_rate": 0.00019799666110183642,
+      "loss": 0.6418,
+      "step": 36
+    },
+    {
+      "epoch": 1.298734234309733e-05,
+      "grad_norm": 1.1427714824676514,
+      "learning_rate": 0.0001979298831385643,
+      "loss": 0.6531,
+      "step": 37
+    },
+    {
+      "epoch": 1.3338351595613472e-05,
+      "grad_norm": 1.0145376920700073,
+      "learning_rate": 0.00019786310517529217,
+      "loss": 0.6473,
+      "step": 38
+    },
+    {
+      "epoch": 1.3689360848129616e-05,
+      "grad_norm": 0.8427861928939819,
+      "learning_rate": 0.00019779632721202004,
+      "loss": 0.5882,
+      "step": 39
+    },
+    {
+      "epoch": 1.404037010064576e-05,
+      "grad_norm": 0.8792659044265747,
+      "learning_rate": 0.00019772954924874791,
+      "loss": 0.608,
+      "step": 40
+    },
+    {
+      "epoch": 1.4391379353161905e-05,
+      "grad_norm": 0.9338463544845581,
+      "learning_rate": 0.00019766277128547581,
+      "loss": 0.7118,
+      "step": 41
+    },
+    {
+      "epoch": 1.4742388605678048e-05,
+      "grad_norm": 0.7554420232772827,
+      "learning_rate": 0.0001975959933222037,
+      "loss": 0.5898,
+      "step": 42
+    },
+    {
+      "epoch": 1.5093397858194192e-05,
+      "grad_norm": 0.7700084447860718,
+      "learning_rate": 0.00019752921535893156,
+      "loss": 0.6466,
+      "step": 43
+    },
+    {
+      "epoch": 1.5444407110710337e-05,
+      "grad_norm": 0.8639333248138428,
+      "learning_rate": 0.00019746243739565943,
+      "loss": 0.7253,
+      "step": 44
+    },
+    {
+      "epoch": 1.579541636322648e-05,
+      "grad_norm": 0.7760612964630127,
+      "learning_rate": 0.0001973956594323873,
+      "loss": 0.7099,
+      "step": 45
+    },
+    {
+      "epoch": 1.6146425615742626e-05,
+      "grad_norm": 0.7319066524505615,
+      "learning_rate": 0.0001973288814691152,
+      "loss": 0.6664,
+      "step": 46
+    },
+    {
+      "epoch": 1.6497434868258768e-05,
+      "grad_norm": 0.7557100057601929,
+      "learning_rate": 0.00019726210350584308,
+      "loss": 0.6318,
+      "step": 47
+    },
+    {
+      "epoch": 1.684844412077491e-05,
+      "grad_norm": 0.6420389413833618,
+      "learning_rate": 0.00019719532554257095,
+      "loss": 0.6688,
+      "step": 48
+    },
+    {
+      "epoch": 1.7199453373291057e-05,
+      "grad_norm": 0.660383939743042,
+      "learning_rate": 0.00019712854757929883,
+      "loss": 0.6204,
+      "step": 49
+    },
+    {
+      "epoch": 1.75504626258072e-05,
+      "grad_norm": 0.5614909529685974,
+      "learning_rate": 0.00019706176961602673,
+      "loss": 0.664,
+      "step": 50
+    },
+    {
+      "epoch": 1.7901471878323346e-05,
+      "grad_norm": 0.502738356590271,
+      "learning_rate": 0.0001969949916527546,
+      "loss": 0.6918,
+      "step": 51
+    },
+    {
+      "epoch": 1.825248113083949e-05,
+      "grad_norm": 0.47578102350234985,
+      "learning_rate": 0.0001969282136894825,
+      "loss": 0.6747,
+      "step": 52
+    },
+    {
+      "epoch": 1.860349038335563e-05,
+      "grad_norm": 0.5528931617736816,
+      "learning_rate": 0.00019686143572621037,
+      "loss": 0.765,
+      "step": 53
+    },
+    {
+      "epoch": 1.8954499635871777e-05,
+      "grad_norm": 0.6176997423171997,
+      "learning_rate": 0.00019679465776293825,
+      "loss": 0.5959,
+      "step": 54
+    },
+    {
+      "epoch": 1.930550888838792e-05,
+      "grad_norm": 0.43425047397613525,
+      "learning_rate": 0.00019672787979966612,
+      "loss": 0.6437,
+      "step": 55
+    },
+    {
+      "epoch": 1.9656518140904066e-05,
+      "grad_norm": 0.5135884881019592,
+      "learning_rate": 0.000196661101836394,
+      "loss": 0.7019,
+      "step": 56
+    },
+    {
+      "epoch": 2.000752739342021e-05,
+      "grad_norm": 0.4628916084766388,
+      "learning_rate": 0.0001965943238731219,
+      "loss": 0.5722,
+      "step": 57
+    },
+    {
+      "epoch": 2.035853664593635e-05,
+      "grad_norm": 0.48201897740364075,
+      "learning_rate": 0.00019652754590984977,
+      "loss": 0.6288,
+      "step": 58
+    },
+    {
+      "epoch": 2.0709545898452498e-05,
+      "grad_norm": 0.5772811770439148,
+      "learning_rate": 0.00019646076794657764,
+      "loss": 0.6067,
+      "step": 59
+    },
+    {
+      "epoch": 2.106055515096864e-05,
+      "grad_norm": 0.4976802170276642,
+      "learning_rate": 0.0001963939899833055,
+      "loss": 0.4722,
+      "step": 60
+    },
+    {
+      "epoch": 2.1411564403484786e-05,
+      "grad_norm": 0.4842129051685333,
+      "learning_rate": 0.00019632721202003339,
+      "loss": 0.5876,
+      "step": 61
+    },
+    {
+      "epoch": 2.176257365600093e-05,
+      "grad_norm": 0.46149536967277527,
+      "learning_rate": 0.00019626043405676129,
+      "loss": 0.6373,
+      "step": 62
+    },
+    {
+      "epoch": 2.2113582908517072e-05,
+      "grad_norm": 0.47199445962905884,
+      "learning_rate": 0.00019619365609348916,
+      "loss": 0.5546,
+      "step": 63
+    },
+    {
+      "epoch": 2.2464592161033218e-05,
+      "grad_norm": 0.6109340190887451,
+      "learning_rate": 0.00019612687813021703,
+      "loss": 0.6069,
+      "step": 64
+    },
+    {
+      "epoch": 2.281560141354936e-05,
+      "grad_norm": 0.5529135465621948,
+      "learning_rate": 0.0001960601001669449,
+      "loss": 0.553,
+      "step": 65
+    },
+    {
+      "epoch": 2.3166610666065503e-05,
+      "grad_norm": 0.500245213508606,
+      "learning_rate": 0.00019599332220367278,
+      "loss": 0.6149,
+      "step": 66
+    },
+    {
+      "epoch": 2.351761991858165e-05,
+      "grad_norm": 0.4841914474964142,
+      "learning_rate": 0.00019592654424040068,
+      "loss": 0.6509,
+      "step": 67
+    },
+    {
+      "epoch": 2.3868629171097792e-05,
+      "grad_norm": 0.5308504104614258,
+      "learning_rate": 0.00019585976627712855,
+      "loss": 0.7017,
+      "step": 68
+    },
+    {
+      "epoch": 2.4219638423613938e-05,
+      "grad_norm": 0.5157874822616577,
+      "learning_rate": 0.00019579298831385645,
+      "loss": 0.7125,
+      "step": 69
+    },
+    {
+      "epoch": 2.457064767613008e-05,
+      "grad_norm": 0.47787800431251526,
+      "learning_rate": 0.00019572621035058433,
+      "loss": 0.5792,
+      "step": 70
+    },
+    {
+      "epoch": 2.4921656928646224e-05,
+      "grad_norm": 0.46792763471603394,
+      "learning_rate": 0.0001956594323873122,
+      "loss": 0.7,
+      "step": 71
+    },
+    {
+      "epoch": 2.527266618116237e-05,
+      "grad_norm": 0.5394675135612488,
+      "learning_rate": 0.00019559265442404007,
+      "loss": 0.5549,
+      "step": 72
+    },
+    {
+      "epoch": 2.5623675433678512e-05,
+      "grad_norm": 0.45065200328826904,
+      "learning_rate": 0.00019552587646076797,
+      "loss": 0.6663,
+      "step": 73
+    },
+    {
+      "epoch": 2.597468468619466e-05,
+      "grad_norm": 0.4026688039302826,
+      "learning_rate": 0.00019545909849749584,
+      "loss": 0.6315,
+      "step": 74
+    },
+    {
+      "epoch": 2.63256939387108e-05,
+      "grad_norm": 0.42353659868240356,
+      "learning_rate": 0.00019539232053422372,
+      "loss": 0.5419,
+      "step": 75
+    },
+    {
+      "epoch": 2.6676703191226944e-05,
+      "grad_norm": 0.45561954379081726,
+      "learning_rate": 0.0001953255425709516,
+      "loss": 0.6624,
+      "step": 76
+    },
+    {
+      "epoch": 2.702771244374309e-05,
+      "grad_norm": 0.3954075574874878,
+      "learning_rate": 0.00019525876460767946,
+      "loss": 0.5479,
+      "step": 77
+    },
+    {
+      "epoch": 2.7378721696259233e-05,
+      "grad_norm": 0.4994329512119293,
+      "learning_rate": 0.00019519198664440736,
+      "loss": 0.7224,
+      "step": 78
+    },
+    {
+      "epoch": 2.7729730948775375e-05,
+      "grad_norm": 0.41149672865867615,
+      "learning_rate": 0.00019512520868113524,
+      "loss": 0.5621,
+      "step": 79
+    },
+    {
+      "epoch": 2.808074020129152e-05,
+      "grad_norm": 0.4199008345603943,
+      "learning_rate": 0.0001950584307178631,
+      "loss": 0.7038,
+      "step": 80
+    },
+    {
+      "epoch": 2.8431749453807664e-05,
+      "grad_norm": 0.4378969371318817,
+      "learning_rate": 0.00019499165275459098,
+      "loss": 0.6654,
+      "step": 81
+    },
+    {
+      "epoch": 2.878275870632381e-05,
+      "grad_norm": 0.4653928279876709,
+      "learning_rate": 0.00019492487479131886,
+      "loss": 0.6241,
+      "step": 82
+    },
+    {
+      "epoch": 2.9133767958839953e-05,
+      "grad_norm": 0.5166454911231995,
+      "learning_rate": 0.00019485809682804673,
+      "loss": 0.5366,
+      "step": 83
+    },
+    {
+      "epoch": 2.9484777211356096e-05,
+      "grad_norm": 0.43180733919143677,
+      "learning_rate": 0.00019479131886477463,
+      "loss": 0.6178,
+      "step": 84
+    },
+    {
+      "epoch": 2.9835786463872242e-05,
+      "grad_norm": 0.44828200340270996,
+      "learning_rate": 0.0001947245409015025,
+      "loss": 0.6706,
+      "step": 85
+    },
+    {
+      "epoch": 3.0186795716388385e-05,
+      "grad_norm": 0.384175181388855,
+      "learning_rate": 0.0001946577629382304,
+      "loss": 0.5551,
+      "step": 86
+    },
+    {
+      "epoch": 3.053780496890453e-05,
+      "grad_norm": 0.4359772503376007,
+      "learning_rate": 0.00019459098497495828,
+      "loss": 0.5626,
+      "step": 87
+    },
+    {
+      "epoch": 3.0888814221420673e-05,
+      "grad_norm": 0.4177016615867615,
+      "learning_rate": 0.00019452420701168615,
+      "loss": 0.6023,
+      "step": 88
+    },
+    {
+      "epoch": 3.1239823473936816e-05,
+      "grad_norm": 0.43592438101768494,
+      "learning_rate": 0.00019445742904841405,
+      "loss": 0.682,
+      "step": 89
+    },
+    {
+      "epoch": 3.159083272645296e-05,
+      "grad_norm": 0.48027974367141724,
+      "learning_rate": 0.00019439065108514192,
+      "loss": 0.7596,
+      "step": 90
+    },
+    {
+      "epoch": 3.194184197896911e-05,
+      "grad_norm": 0.35989537835121155,
+      "learning_rate": 0.0001943238731218698,
+      "loss": 0.6018,
+      "step": 91
+    },
+    {
+      "epoch": 3.229285123148525e-05,
+      "grad_norm": 0.48477092385292053,
+      "learning_rate": 0.00019425709515859767,
+      "loss": 0.512,
+      "step": 92
+    },
+    {
+      "epoch": 3.2643860484001394e-05,
+      "grad_norm": 0.38858646154403687,
+      "learning_rate": 0.00019419031719532554,
+      "loss": 0.6371,
+      "step": 93
+    },
+    {
+      "epoch": 3.2994869736517536e-05,
+      "grad_norm": 0.5323147177696228,
+      "learning_rate": 0.00019412353923205344,
+      "loss": 0.5221,
+      "step": 94
+    },
+    {
+      "epoch": 3.334587898903368e-05,
+      "grad_norm": 0.3784274160861969,
+      "learning_rate": 0.00019405676126878132,
+      "loss": 0.6158,
+      "step": 95
+    },
+    {
+      "epoch": 3.369688824154982e-05,
+      "grad_norm": 0.4076334834098816,
+      "learning_rate": 0.0001939899833055092,
+      "loss": 0.5535,
+      "step": 96
+    },
+    {
+      "epoch": 3.404789749406597e-05,
+      "grad_norm": 0.43930479884147644,
+      "learning_rate": 0.00019392320534223706,
+      "loss": 0.6482,
+      "step": 97
+    },
+    {
+      "epoch": 3.4398906746582114e-05,
+      "grad_norm": 0.4266909658908844,
+      "learning_rate": 0.00019385642737896494,
+      "loss": 0.6,
+      "step": 98
+    },
+    {
+      "epoch": 3.474991599909826e-05,
+      "grad_norm": 0.45353513956069946,
+      "learning_rate": 0.0001937896494156928,
+      "loss": 0.6596,
+      "step": 99
+    },
+    {
+      "epoch": 3.51009252516144e-05,
+      "grad_norm": 0.3424838185310364,
+      "learning_rate": 0.0001937228714524207,
+      "loss": 0.555,
+      "step": 100
+    },
+    {
+      "epoch": 3.545193450413054e-05,
+      "grad_norm": 0.40126165747642517,
+      "learning_rate": 0.00019365609348914858,
+      "loss": 0.6921,
+      "step": 101
+    },
+    {
+      "epoch": 3.580294375664669e-05,
+      "grad_norm": 0.36572012305259705,
+      "learning_rate": 0.00019358931552587646,
+      "loss": 0.5485,
+      "step": 102
+    },
+    {
+      "epoch": 3.6153953009162834e-05,
+      "grad_norm": 0.3972407281398773,
+      "learning_rate": 0.00019352253756260436,
+      "loss": 0.5884,
+      "step": 103
+    },
+    {
+      "epoch": 3.650496226167898e-05,
+      "grad_norm": 0.3900579512119293,
+      "learning_rate": 0.00019345575959933223,
+      "loss": 0.6664,
+      "step": 104
+    },
+    {
+      "epoch": 3.685597151419512e-05,
+      "grad_norm": 0.31666621565818787,
+      "learning_rate": 0.00019338898163606013,
+      "loss": 0.5009,
+      "step": 105
+    },
+    {
+      "epoch": 3.720698076671126e-05,
+      "grad_norm": 0.5269597172737122,
+      "learning_rate": 0.000193322203672788,
+      "loss": 0.6292,
+      "step": 106
+    },
+    {
+      "epoch": 3.755799001922741e-05,
+      "grad_norm": 0.4645126163959503,
+      "learning_rate": 0.00019325542570951588,
+      "loss": 0.636,
+      "step": 107
+    },
+    {
+      "epoch": 3.7908999271743555e-05,
+      "grad_norm": 0.3900754153728485,
+      "learning_rate": 0.00019318864774624375,
+      "loss": 0.5367,
+      "step": 108
+    },
+    {
+      "epoch": 3.82600085242597e-05,
+      "grad_norm": 0.42533883452415466,
+      "learning_rate": 0.00019312186978297162,
+      "loss": 0.6862,
+      "step": 109
+    },
+    {
+      "epoch": 3.861101777677584e-05,
+      "grad_norm": 0.6809422969818115,
+      "learning_rate": 0.00019305509181969952,
+      "loss": 0.6434,
+      "step": 110
+    },
+    {
+      "epoch": 3.896202702929198e-05,
+      "grad_norm": 0.5127860307693481,
+      "learning_rate": 0.0001929883138564274,
+      "loss": 0.6266,
+      "step": 111
+    },
+    {
+      "epoch": 3.931303628180813e-05,
+      "grad_norm": 0.5254234671592712,
+      "learning_rate": 0.00019292153589315527,
+      "loss": 0.6982,
+      "step": 112
+    },
+    {
+      "epoch": 3.9664045534324275e-05,
+      "grad_norm": 0.3699031472206116,
+      "learning_rate": 0.00019285475792988314,
+      "loss": 0.6037,
+      "step": 113
+    },
+    {
+      "epoch": 4.001505478684042e-05,
+      "grad_norm": 0.3807130455970764,
+      "learning_rate": 0.00019278797996661101,
+      "loss": 0.5861,
+      "step": 114
+    },
+    {
+      "epoch": 4.036606403935656e-05,
+      "grad_norm": 0.4455645978450775,
+      "learning_rate": 0.0001927212020033389,
+      "loss": 0.5658,
+      "step": 115
+    },
+    {
+      "epoch": 4.07170732918727e-05,
+      "grad_norm": 0.3830210864543915,
+      "learning_rate": 0.0001926544240400668,
+      "loss": 0.606,
+      "step": 116
+    },
+    {
+      "epoch": 4.106808254438885e-05,
+      "grad_norm": 0.41419631242752075,
+      "learning_rate": 0.00019258764607679466,
+      "loss": 0.6095,
+      "step": 117
+    },
+    {
+      "epoch": 4.1419091796904995e-05,
+      "grad_norm": 0.3929574489593506,
+      "learning_rate": 0.00019252086811352253,
+      "loss": 0.6464,
+      "step": 118
+    },
+    {
+      "epoch": 4.177010104942114e-05,
+      "grad_norm": 0.35958629846572876,
+      "learning_rate": 0.0001924540901502504,
+      "loss": 0.5185,
+      "step": 119
+    },
+    {
+      "epoch": 4.212111030193728e-05,
+      "grad_norm": 0.3790556490421295,
+      "learning_rate": 0.0001923873121869783,
+      "loss": 0.5156,
+      "step": 120
+    },
+    {
+      "epoch": 4.2472119554453423e-05,
+      "grad_norm": 0.37452438473701477,
+      "learning_rate": 0.00019232053422370618,
+      "loss": 0.5711,
+      "step": 121
+    },
+    {
+      "epoch": 4.282312880696957e-05,
+      "grad_norm": 0.38976770639419556,
+      "learning_rate": 0.00019225375626043408,
+      "loss": 0.6075,
+      "step": 122
+    },
+    {
+      "epoch": 4.3174138059485716e-05,
+      "grad_norm": 0.4098513424396515,
+      "learning_rate": 0.00019218697829716195,
+      "loss": 0.5312,
+      "step": 123
+    },
+    {
+      "epoch": 4.352514731200186e-05,
+      "grad_norm": 0.33890047669410706,
+      "learning_rate": 0.00019212020033388983,
+      "loss": 0.4984,
+      "step": 124
+    },
+    {
+      "epoch": 4.3876156564518e-05,
+      "grad_norm": 0.49077001214027405,
+      "learning_rate": 0.0001920534223706177,
+      "loss": 0.7159,
+      "step": 125
+    },
+    {
+      "epoch": 4.4227165817034144e-05,
+      "grad_norm": 0.41653814911842346,
+      "learning_rate": 0.0001919866444073456,
+      "loss": 0.5642,
+      "step": 126
+    },
+    {
+      "epoch": 4.4578175069550286e-05,
+      "grad_norm": 0.45710283517837524,
+      "learning_rate": 0.00019191986644407347,
+      "loss": 0.6936,
+      "step": 127
+    },
+    {
+      "epoch": 4.4929184322066436e-05,
+      "grad_norm": 0.36976873874664307,
+      "learning_rate": 0.00019185308848080135,
+      "loss": 0.5407,
+      "step": 128
+    },
+    {
+      "epoch": 4.528019357458258e-05,
+      "grad_norm": 0.42852675914764404,
+      "learning_rate": 0.00019178631051752922,
+      "loss": 0.6731,
+      "step": 129
+    },
+    {
+      "epoch": 4.563120282709872e-05,
+      "grad_norm": 0.5426310300827026,
+      "learning_rate": 0.0001917195325542571,
+      "loss": 0.5775,
+      "step": 130
+    },
+    {
+      "epoch": 4.5982212079614864e-05,
+      "grad_norm": 0.38442543148994446,
+      "learning_rate": 0.00019165275459098497,
+      "loss": 0.5994,
+      "step": 131
+    },
+    {
+      "epoch": 4.633322133213101e-05,
+      "grad_norm": 0.4298035502433777,
+      "learning_rate": 0.00019158597662771287,
+      "loss": 0.5563,
+      "step": 132
+    },
+    {
+      "epoch": 4.6684230584647156e-05,
+      "grad_norm": 0.40397605299949646,
+      "learning_rate": 0.00019151919866444074,
+      "loss": 0.6924,
+      "step": 133
+    },
+    {
+      "epoch": 4.70352398371633e-05,
+      "grad_norm": 0.4338497519493103,
+      "learning_rate": 0.0001914524207011686,
+      "loss": 0.5739,
+      "step": 134
+    },
+    {
+      "epoch": 4.738624908967944e-05,
+      "grad_norm": 0.39713653922080994,
+      "learning_rate": 0.0001913856427378965,
+      "loss": 0.4529,
+      "step": 135
+    },
+    {
+      "epoch": 4.7737258342195584e-05,
+      "grad_norm": 0.31409478187561035,
+      "learning_rate": 0.0001913188647746244,
+      "loss": 0.562,
+      "step": 136
+    },
+    {
+      "epoch": 4.808826759471173e-05,
+      "grad_norm": 0.371624618768692,
+      "learning_rate": 0.00019125208681135226,
+      "loss": 0.5288,
+      "step": 137
+    },
+    {
+      "epoch": 4.8439276847227877e-05,
+      "grad_norm": 0.4600190818309784,
+      "learning_rate": 0.00019118530884808016,
+      "loss": 0.6215,
+      "step": 138
+    },
+    {
+      "epoch": 4.879028609974402e-05,
+      "grad_norm": 0.45351359248161316,
+      "learning_rate": 0.00019111853088480803,
+      "loss": 0.686,
+      "step": 139
+    },
+    {
+      "epoch": 4.914129535226016e-05,
+      "grad_norm": 0.42282962799072266,
+      "learning_rate": 0.0001910517529215359,
+      "loss": 0.5966,
+      "step": 140
+    },
+    {
+      "epoch": 4.9492304604776305e-05,
+      "grad_norm": 0.41479986906051636,
+      "learning_rate": 0.00019098497495826378,
+      "loss": 0.5948,
+      "step": 141
+    },
+    {
+      "epoch": 4.984331385729245e-05,
+      "grad_norm": 0.40453553199768066,
+      "learning_rate": 0.00019091819699499168,
+      "loss": 0.6411,
+      "step": 142
+    },
+    {
+      "epoch": 5.01943231098086e-05,
+      "grad_norm": 0.3939369320869446,
+      "learning_rate": 0.00019085141903171955,
+      "loss": 0.5513,
+      "step": 143
+    },
+    {
+      "epoch": 5.054533236232474e-05,
+      "grad_norm": 0.3700481653213501,
+      "learning_rate": 0.00019078464106844743,
+      "loss": 0.5459,
+      "step": 144
+    },
+    {
+      "epoch": 5.089634161484088e-05,
+      "grad_norm": 0.4377487897872925,
+      "learning_rate": 0.0001907178631051753,
+      "loss": 0.6076,
+      "step": 145
+    },
+    {
+      "epoch": 5.1247350867357025e-05,
+      "grad_norm": 0.37919673323631287,
+      "learning_rate": 0.00019065108514190317,
+      "loss": 0.5207,
+      "step": 146
+    },
+    {
+      "epoch": 5.159836011987317e-05,
+      "grad_norm": 0.3841630816459656,
+      "learning_rate": 0.00019058430717863107,
+      "loss": 0.614,
+      "step": 147
+    },
+    {
+      "epoch": 5.194936937238932e-05,
+      "grad_norm": 0.43541714549064636,
+      "learning_rate": 0.00019051752921535895,
+      "loss": 0.6283,
+      "step": 148
+    },
+    {
+      "epoch": 5.230037862490546e-05,
+      "grad_norm": 0.4853285253047943,
+      "learning_rate": 0.00019045075125208682,
+      "loss": 0.5807,
+      "step": 149
+    },
+    {
+      "epoch": 5.26513878774216e-05,
+      "grad_norm": 0.3572970926761627,
+      "learning_rate": 0.0001903839732888147,
+      "loss": 0.6866,
+      "step": 150
+    },
+    {
+      "epoch": 5.3002397129937745e-05,
+      "grad_norm": 0.3674347698688507,
+      "learning_rate": 0.00019031719532554257,
+      "loss": 0.5552,
+      "step": 151
+    },
+    {
+      "epoch": 5.335340638245389e-05,
+      "grad_norm": 0.37748461961746216,
+      "learning_rate": 0.00019025041736227044,
+      "loss": 0.6278,
+      "step": 152
+    },
+    {
+      "epoch": 5.370441563497003e-05,
+      "grad_norm": 0.3788503408432007,
+      "learning_rate": 0.00019018363939899834,
+      "loss": 0.622,
+      "step": 153
+    },
+    {
+      "epoch": 5.405542488748618e-05,
+      "grad_norm": 0.3736303150653839,
+      "learning_rate": 0.0001901168614357262,
+      "loss": 0.5822,
+      "step": 154
+    },
+    {
+      "epoch": 5.440643414000232e-05,
+      "grad_norm": 0.32680070400238037,
+      "learning_rate": 0.0001900500834724541,
+      "loss": 0.5715,
+      "step": 155
+    },
+    {
+      "epoch": 5.4757443392518466e-05,
+      "grad_norm": 0.34495192766189575,
+      "learning_rate": 0.00018998330550918199,
+      "loss": 0.6497,
+      "step": 156
+    },
+    {
+      "epoch": 5.510845264503461e-05,
+      "grad_norm": 0.4244193136692047,
+      "learning_rate": 0.00018991652754590986,
+      "loss": 0.5519,
+      "step": 157
+    },
+    {
+      "epoch": 5.545946189755075e-05,
+      "grad_norm": 0.4024031162261963,
+      "learning_rate": 0.00018984974958263776,
+      "loss": 0.5339,
+      "step": 158
+    },
+    {
+      "epoch": 5.58104711500669e-05,
+      "grad_norm": 0.46051299571990967,
+      "learning_rate": 0.00018978297161936563,
+      "loss": 0.5979,
+      "step": 159
+    },
+    {
+      "epoch": 5.616148040258304e-05,
+      "grad_norm": 0.49051615595817566,
+      "learning_rate": 0.0001897161936560935,
+      "loss": 0.5563,
+      "step": 160
+    },
+    {
+      "epoch": 5.6512489655099186e-05,
+      "grad_norm": 0.43045854568481445,
+      "learning_rate": 0.00018964941569282138,
+      "loss": 0.5984,
+      "step": 161
+    },
+    {
+      "epoch": 5.686349890761533e-05,
+      "grad_norm": 0.37778228521347046,
+      "learning_rate": 0.00018958263772954925,
+      "loss": 0.5955,
+      "step": 162
+    },
+    {
+      "epoch": 5.721450816013147e-05,
+      "grad_norm": 0.3736341893672943,
+      "learning_rate": 0.00018951585976627715,
+      "loss": 0.6438,
+      "step": 163
+    },
+    {
+      "epoch": 5.756551741264762e-05,
+      "grad_norm": 0.3940117061138153,
+      "learning_rate": 0.00018944908180300502,
+      "loss": 0.503,
+      "step": 164
+    },
+    {
+      "epoch": 5.7916526665163763e-05,
+      "grad_norm": 0.4193519055843353,
+      "learning_rate": 0.0001893823038397329,
+      "loss": 0.6324,
+      "step": 165
+    },
+    {
+      "epoch": 5.8267535917679906e-05,
+      "grad_norm": 0.34481996297836304,
+      "learning_rate": 0.00018931552587646077,
+      "loss": 0.5745,
+      "step": 166
+    },
+    {
+      "epoch": 5.861854517019605e-05,
+      "grad_norm": 0.38285771012306213,
+      "learning_rate": 0.00018924874791318864,
+      "loss": 0.639,
+      "step": 167
+    },
+    {
+      "epoch": 5.896955442271219e-05,
+      "grad_norm": 0.36933982372283936,
+      "learning_rate": 0.00018918196994991652,
+      "loss": 0.6681,
+      "step": 168
+    },
+    {
+      "epoch": 5.932056367522834e-05,
+      "grad_norm": 0.36970776319503784,
+      "learning_rate": 0.00018911519198664442,
+      "loss": 0.5626,
+      "step": 169
+    },
+    {
+      "epoch": 5.9671572927744484e-05,
+      "grad_norm": 0.38494783639907837,
+      "learning_rate": 0.0001890484140233723,
+      "loss": 0.6066,
+      "step": 170
+    },
+    {
+      "epoch": 6.0022582180260627e-05,
+      "grad_norm": 0.3446069061756134,
+      "learning_rate": 0.00018898163606010016,
+      "loss": 0.6354,
+      "step": 171
+    },
+    {
+      "epoch": 6.037359143277677e-05,
+      "grad_norm": 0.4466759264469147,
+      "learning_rate": 0.00018891485809682806,
+      "loss": 0.4737,
+      "step": 172
+    },
+    {
+      "epoch": 6.072460068529291e-05,
+      "grad_norm": 0.43630918860435486,
+      "learning_rate": 0.00018884808013355594,
+      "loss": 0.6839,
+      "step": 173
+    },
+    {
+      "epoch": 6.107560993780906e-05,
+      "grad_norm": 0.37083202600479126,
+      "learning_rate": 0.00018878130217028384,
+      "loss": 0.5372,
+      "step": 174
+    },
+    {
+      "epoch": 6.14266191903252e-05,
+      "grad_norm": 0.37066200375556946,
+      "learning_rate": 0.0001887145242070117,
+      "loss": 0.6653,
+      "step": 175
+    },
+    {
+      "epoch": 6.177762844284135e-05,
+      "grad_norm": 0.5191747546195984,
+      "learning_rate": 0.00018864774624373958,
+      "loss": 0.6677,
+      "step": 176
+    },
+    {
+      "epoch": 6.21286376953575e-05,
+      "grad_norm": 0.4235158860683441,
+      "learning_rate": 0.00018858096828046746,
+      "loss": 0.5971,
+      "step": 177
+    },
+    {
+      "epoch": 6.247964694787363e-05,
+      "grad_norm": 0.405074805021286,
+      "learning_rate": 0.00018851419031719533,
+      "loss": 0.5717,
+      "step": 178
+    },
+    {
+      "epoch": 6.283065620038978e-05,
+      "grad_norm": 0.45817336440086365,
+      "learning_rate": 0.00018844741235392323,
+      "loss": 0.5878,
+      "step": 179
+    },
+    {
+      "epoch": 6.318166545290592e-05,
+      "grad_norm": 0.6313037276268005,
+      "learning_rate": 0.0001883806343906511,
+      "loss": 0.62,
+      "step": 180
+    },
+    {
+      "epoch": 6.353267470542207e-05,
+      "grad_norm": 0.41896742582321167,
+      "learning_rate": 0.00018831385642737898,
+      "loss": 0.5565,
+      "step": 181
+    },
+    {
+      "epoch": 6.388368395793822e-05,
+      "grad_norm": 0.4143432676792145,
+      "learning_rate": 0.00018824707846410685,
+      "loss": 0.5552,
+      "step": 182
+    },
+    {
+      "epoch": 6.423469321045435e-05,
+      "grad_norm": 0.38745641708374023,
+      "learning_rate": 0.00018818030050083472,
+      "loss": 0.5949,
+      "step": 183
+    },
+    {
+      "epoch": 6.45857024629705e-05,
+      "grad_norm": 0.7472612261772156,
+      "learning_rate": 0.0001881135225375626,
+      "loss": 0.6708,
+      "step": 184
+    },
+    {
+      "epoch": 6.493671171548664e-05,
+      "grad_norm": 0.4416198432445526,
+      "learning_rate": 0.0001880467445742905,
+      "loss": 0.6069,
+      "step": 185
+    },
+    {
+      "epoch": 6.528772096800279e-05,
+      "grad_norm": 0.4312993884086609,
+      "learning_rate": 0.00018797996661101837,
+      "loss": 0.5778,
+      "step": 186
+    },
+    {
+      "epoch": 6.563873022051894e-05,
+      "grad_norm": 0.4524860978126526,
+      "learning_rate": 0.00018791318864774624,
+      "loss": 0.5091,
+      "step": 187
+    },
+    {
+      "epoch": 6.598973947303507e-05,
+      "grad_norm": 0.4320828914642334,
+      "learning_rate": 0.00018784641068447412,
+      "loss": 0.6557,
+      "step": 188
+    },
+    {
+      "epoch": 6.634074872555122e-05,
+      "grad_norm": 0.6967452168464661,
+      "learning_rate": 0.00018777963272120202,
+      "loss": 0.612,
+      "step": 189
+    },
+    {
+      "epoch": 6.669175797806736e-05,
+      "grad_norm": 0.4389924705028534,
+      "learning_rate": 0.0001877128547579299,
+      "loss": 0.6271,
+      "step": 190
+    },
+    {
+      "epoch": 6.704276723058351e-05,
+      "grad_norm": 0.3693922162055969,
+      "learning_rate": 0.0001876460767946578,
+      "loss": 0.6715,
+      "step": 191
+    },
+    {
+      "epoch": 6.739377648309964e-05,
+      "grad_norm": 0.32230404019355774,
+      "learning_rate": 0.00018757929883138566,
+      "loss": 0.6344,
+      "step": 192
+    },
+    {
+      "epoch": 6.774478573561579e-05,
+      "grad_norm": 0.4440002143383026,
+      "learning_rate": 0.00018751252086811354,
+      "loss": 0.6671,
+      "step": 193
+    },
+    {
+      "epoch": 6.809579498813194e-05,
+      "grad_norm": 0.5676587820053101,
+      "learning_rate": 0.0001874457429048414,
+      "loss": 0.6818,
+      "step": 194
+    },
+    {
+      "epoch": 6.844680424064808e-05,
+      "grad_norm": 0.36207348108291626,
+      "learning_rate": 0.0001873789649415693,
+      "loss": 0.5029,
+      "step": 195
+    },
+    {
+      "epoch": 6.879781349316423e-05,
+      "grad_norm": 0.35714131593704224,
+      "learning_rate": 0.00018731218697829718,
+      "loss": 0.6127,
+      "step": 196
+    },
+    {
+      "epoch": 6.914882274568036e-05,
+      "grad_norm": 0.4285273551940918,
+      "learning_rate": 0.00018724540901502506,
+      "loss": 0.6355,
+      "step": 197
+    },
+    {
+      "epoch": 6.949983199819651e-05,
+      "grad_norm": 0.42585939168930054,
+      "learning_rate": 0.00018717863105175293,
+      "loss": 0.6302,
+      "step": 198
+    },
+    {
+      "epoch": 6.985084125071266e-05,
+      "grad_norm": 0.524303138256073,
+      "learning_rate": 0.0001871118530884808,
+      "loss": 0.6683,
+      "step": 199
+    },
+    {
+      "epoch": 7.02018505032288e-05,
+      "grad_norm": 0.39635923504829407,
+      "learning_rate": 0.00018704507512520868,
+      "loss": 0.6694,
+      "step": 200
+    },
+    {
+      "epoch": 7.055285975574495e-05,
+      "grad_norm": 0.39712437987327576,
+      "learning_rate": 0.00018697829716193658,
+      "loss": 0.5794,
+      "step": 201
+    },
+    {
+      "epoch": 7.090386900826108e-05,
+      "grad_norm": 0.4115397334098816,
+      "learning_rate": 0.00018691151919866445,
+      "loss": 0.5579,
+      "step": 202
+    },
+    {
+      "epoch": 7.125487826077723e-05,
+      "grad_norm": 0.4776385724544525,
+      "learning_rate": 0.00018684474123539232,
+      "loss": 0.5589,
+      "step": 203
+    },
+    {
+      "epoch": 7.160588751329338e-05,
+      "grad_norm": 0.35574638843536377,
+      "learning_rate": 0.0001867779632721202,
+      "loss": 0.5311,
+      "step": 204
+    },
+    {
+      "epoch": 7.195689676580952e-05,
+      "grad_norm": 0.44872432947158813,
+      "learning_rate": 0.00018671118530884807,
+      "loss": 0.635,
+      "step": 205
+    },
+    {
+      "epoch": 7.230790601832567e-05,
+      "grad_norm": 0.3511079251766205,
+      "learning_rate": 0.00018664440734557597,
+      "loss": 0.5317,
+      "step": 206
+    },
+    {
+      "epoch": 7.26589152708418e-05,
+      "grad_norm": 0.39862194657325745,
+      "learning_rate": 0.00018657762938230384,
+      "loss": 0.6653,
+      "step": 207
+    },
+    {
+      "epoch": 7.300992452335795e-05,
+      "grad_norm": 0.4046575725078583,
+      "learning_rate": 0.00018651085141903174,
+      "loss": 0.6065,
+      "step": 208
+    },
+    {
+      "epoch": 7.33609337758741e-05,
+      "grad_norm": 0.4231868088245392,
+      "learning_rate": 0.00018644407345575962,
+      "loss": 0.7078,
+      "step": 209
+    },
+    {
+      "epoch": 7.371194302839024e-05,
+      "grad_norm": 0.364700049161911,
+      "learning_rate": 0.0001863772954924875,
+      "loss": 0.6309,
+      "step": 210
+    },
+    {
+      "epoch": 7.406295228090639e-05,
+      "grad_norm": 0.5385531187057495,
+      "learning_rate": 0.0001863105175292154,
+      "loss": 0.4233,
+      "step": 211
+    },
+    {
+      "epoch": 7.441396153342252e-05,
+      "grad_norm": 0.39415115118026733,
+      "learning_rate": 0.00018624373956594326,
+      "loss": 0.5928,
+      "step": 212
+    },
+    {
+      "epoch": 7.476497078593867e-05,
+      "grad_norm": 0.6021363735198975,
+      "learning_rate": 0.00018617696160267113,
+      "loss": 0.6611,
+      "step": 213
+    },
+    {
+      "epoch": 7.511598003845482e-05,
+      "grad_norm": 0.3709903061389923,
+      "learning_rate": 0.000186110183639399,
+      "loss": 0.6136,
+      "step": 214
+    },
+    {
+      "epoch": 7.546698929097096e-05,
+      "grad_norm": 0.36710435152053833,
+      "learning_rate": 0.00018604340567612688,
+      "loss": 0.5267,
+      "step": 215
+    },
+    {
+      "epoch": 7.581799854348711e-05,
+      "grad_norm": 0.4379352033138275,
+      "learning_rate": 0.00018597662771285475,
+      "loss": 0.6429,
+      "step": 216
+    },
+    {
+      "epoch": 7.616900779600325e-05,
+      "grad_norm": 0.3408482074737549,
+      "learning_rate": 0.00018590984974958265,
+      "loss": 0.5379,
+      "step": 217
+    },
+    {
+      "epoch": 7.65200170485194e-05,
+      "grad_norm": 0.4487043023109436,
+      "learning_rate": 0.00018584307178631053,
+      "loss": 0.6582,
+      "step": 218
+    },
+    {
+      "epoch": 7.687102630103554e-05,
+      "grad_norm": 0.42003679275512695,
+      "learning_rate": 0.0001857762938230384,
+      "loss": 0.5712,
+      "step": 219
+    },
+    {
+      "epoch": 7.722203555355168e-05,
+      "grad_norm": 0.4698665738105774,
+      "learning_rate": 0.00018570951585976627,
+      "loss": 0.5715,
+      "step": 220
+    },
+    {
+      "epoch": 7.757304480606783e-05,
+      "grad_norm": 0.3777780830860138,
+      "learning_rate": 0.00018564273789649415,
+      "loss": 0.4667,
+      "step": 221
+    },
+    {
+      "epoch": 7.792405405858397e-05,
+      "grad_norm": 0.36794212460517883,
+      "learning_rate": 0.00018557595993322205,
+      "loss": 0.5382,
+      "step": 222
+    },
+    {
+      "epoch": 7.827506331110012e-05,
+      "grad_norm": 0.4582989513874054,
+      "learning_rate": 0.00018550918196994992,
+      "loss": 0.6437,
+      "step": 223
+    },
+    {
+      "epoch": 7.862607256361626e-05,
+      "grad_norm": 0.4065852761268616,
+      "learning_rate": 0.0001854424040066778,
+      "loss": 0.6928,
+      "step": 224
+    },
+    {
+      "epoch": 7.89770818161324e-05,
+      "grad_norm": 0.3857649564743042,
+      "learning_rate": 0.0001853756260434057,
+      "loss": 0.5405,
+      "step": 225
+    },
+    {
+      "epoch": 7.932809106864855e-05,
+      "grad_norm": 0.40056589245796204,
+      "learning_rate": 0.00018530884808013357,
+      "loss": 0.6425,
+      "step": 226
+    },
+    {
+      "epoch": 7.967910032116469e-05,
+      "grad_norm": 0.43137016892433167,
+      "learning_rate": 0.00018524207011686147,
+      "loss": 0.5001,
+      "step": 227
+    },
+    {
+      "epoch": 8.003010957368084e-05,
+      "grad_norm": 0.3723987340927124,
+      "learning_rate": 0.00018517529215358934,
+      "loss": 0.5118,
+      "step": 228
+    },
+    {
+      "epoch": 8.038111882619698e-05,
+      "grad_norm": 0.34196361899375916,
+      "learning_rate": 0.00018510851419031721,
+      "loss": 0.5468,
+      "step": 229
+    },
+    {
+      "epoch": 8.073212807871312e-05,
+      "grad_norm": 0.4319117069244385,
+      "learning_rate": 0.0001850417362270451,
+      "loss": 0.5703,
+      "step": 230
+    },
+    {
+      "epoch": 8.108313733122927e-05,
+      "grad_norm": 0.4467247724533081,
+      "learning_rate": 0.00018497495826377296,
+      "loss": 0.6536,
+      "step": 231
+    },
+    {
+      "epoch": 8.14341465837454e-05,
+      "grad_norm": 0.3569909632205963,
+      "learning_rate": 0.00018490818030050083,
+      "loss": 0.5335,
+      "step": 232
+    },
+    {
+      "epoch": 8.178515583626156e-05,
+      "grad_norm": 0.33486437797546387,
+      "learning_rate": 0.00018484140233722873,
+      "loss": 0.6803,
+      "step": 233
+    },
+    {
+      "epoch": 8.21361650887777e-05,
+      "grad_norm": 0.3783140480518341,
+      "learning_rate": 0.0001847746243739566,
+      "loss": 0.6361,
+      "step": 234
+    },
+    {
+      "epoch": 8.248717434129384e-05,
+      "grad_norm": 0.4844662547111511,
+      "learning_rate": 0.00018470784641068448,
+      "loss": 0.5322,
+      "step": 235
+    },
+    {
+      "epoch": 8.283818359380999e-05,
+      "grad_norm": 0.508406400680542,
+      "learning_rate": 0.00018464106844741235,
+      "loss": 0.6676,
+      "step": 236
+    },
+    {
+      "epoch": 8.318919284632613e-05,
+      "grad_norm": 0.3710225820541382,
+      "learning_rate": 0.00018457429048414023,
+      "loss": 0.6656,
+      "step": 237
+    },
+    {
+      "epoch": 8.354020209884228e-05,
+      "grad_norm": 0.3757292628288269,
+      "learning_rate": 0.00018450751252086813,
+      "loss": 0.6095,
+      "step": 238
+    },
+    {
+      "epoch": 8.389121135135843e-05,
+      "grad_norm": 0.40651261806488037,
+      "learning_rate": 0.000184440734557596,
+      "loss": 0.6626,
+      "step": 239
+    },
+    {
+      "epoch": 8.424222060387456e-05,
+      "grad_norm": 0.40700778365135193,
+      "learning_rate": 0.00018437395659432387,
+      "loss": 0.5328,
+      "step": 240
+    },
+    {
+      "epoch": 8.459322985639071e-05,
+      "grad_norm": 0.5067440867424011,
+      "learning_rate": 0.00018430717863105175,
+      "loss": 0.4811,
+      "step": 241
+    },
+    {
+      "epoch": 8.494423910890685e-05,
+      "grad_norm": 0.3934602737426758,
+      "learning_rate": 0.00018424040066777965,
+      "loss": 0.5691,
+      "step": 242
+    },
+    {
+      "epoch": 8.5295248361423e-05,
+      "grad_norm": 0.3360019624233246,
+      "learning_rate": 0.00018417362270450752,
+      "loss": 0.5542,
+      "step": 243
+    },
+    {
+      "epoch": 8.564625761393915e-05,
+      "grad_norm": 0.4023631513118744,
+      "learning_rate": 0.00018410684474123542,
+      "loss": 0.5192,
+      "step": 244
+    },
+    {
+      "epoch": 8.599726686645528e-05,
+      "grad_norm": 0.41704171895980835,
+      "learning_rate": 0.0001840400667779633,
+      "loss": 0.5018,
+      "step": 245
+    },
+    {
+      "epoch": 8.634827611897143e-05,
+      "grad_norm": 0.361977756023407,
+      "learning_rate": 0.00018397328881469117,
+      "loss": 0.6193,
+      "step": 246
+    },
+    {
+      "epoch": 8.669928537148757e-05,
+      "grad_norm": 0.37774717807769775,
+      "learning_rate": 0.00018390651085141904,
+      "loss": 0.5552,
+      "step": 247
+    },
+    {
+      "epoch": 8.705029462400372e-05,
+      "grad_norm": 0.3408471941947937,
+      "learning_rate": 0.0001838397328881469,
+      "loss": 0.5876,
+      "step": 248
+    },
+    {
+      "epoch": 8.740130387651985e-05,
+      "grad_norm": 0.3892226815223694,
+      "learning_rate": 0.0001837729549248748,
+      "loss": 0.4227,
+      "step": 249
+    },
+    {
+      "epoch": 8.7752313129036e-05,
+      "grad_norm": 0.5315036177635193,
+      "learning_rate": 0.00018370617696160269,
+      "loss": 0.5826,
+      "step": 250
+    },
+    {
+      "epoch": 8.810332238155215e-05,
+      "grad_norm": 0.35433024168014526,
+      "learning_rate": 0.00018363939899833056,
+      "loss": 0.5992,
+      "step": 251
+    },
+    {
+      "epoch": 8.845433163406829e-05,
+      "grad_norm": 0.34777382016181946,
+      "learning_rate": 0.00018357262103505843,
+      "loss": 0.4973,
+      "step": 252
+    },
+    {
+      "epoch": 8.880534088658444e-05,
+      "grad_norm": 0.3936387002468109,
+      "learning_rate": 0.0001835058430717863,
+      "loss": 0.6254,
+      "step": 253
+    },
+    {
+      "epoch": 8.915635013910057e-05,
+      "grad_norm": 0.4009217917919159,
+      "learning_rate": 0.0001834390651085142,
+      "loss": 0.4843,
+      "step": 254
+    },
+    {
+      "epoch": 8.950735939161672e-05,
+      "grad_norm": 0.4863683879375458,
+      "learning_rate": 0.00018337228714524208,
+      "loss": 0.5204,
+      "step": 255
+    },
+    {
+      "epoch": 8.985836864413287e-05,
+      "grad_norm": 0.6100988984107971,
+      "learning_rate": 0.00018330550918196995,
+      "loss": 0.7296,
+      "step": 256
+    },
+    {
+      "epoch": 9.020937789664901e-05,
+      "grad_norm": 0.40949374437332153,
+      "learning_rate": 0.00018323873121869782,
+      "loss": 0.5707,
+      "step": 257
+    },
+    {
+      "epoch": 9.056038714916516e-05,
+      "grad_norm": 0.47316402196884155,
+      "learning_rate": 0.0001831719532554257,
+      "loss": 0.6655,
+      "step": 258
+    },
+    {
+      "epoch": 9.091139640168129e-05,
+      "grad_norm": 0.4053696393966675,
+      "learning_rate": 0.0001831051752921536,
+      "loss": 0.5822,
+      "step": 259
+    },
+    {
+      "epoch": 9.126240565419744e-05,
+      "grad_norm": 0.4582972228527069,
+      "learning_rate": 0.00018303839732888147,
+      "loss": 0.5475,
+      "step": 260
+    },
+    {
+      "epoch": 9.161341490671359e-05,
+      "grad_norm": 0.38666802644729614,
+      "learning_rate": 0.00018297161936560937,
+      "loss": 0.4744,
+      "step": 261
+    },
+    {
+      "epoch": 9.196442415922973e-05,
+      "grad_norm": 0.31954991817474365,
+      "learning_rate": 0.00018290484140233724,
+      "loss": 0.6337,
+      "step": 262
+    },
+    {
+      "epoch": 9.231543341174588e-05,
+      "grad_norm": 0.3590424358844757,
+      "learning_rate": 0.00018283806343906512,
+      "loss": 0.5683,
+      "step": 263
+    },
+    {
+      "epoch": 9.266644266426201e-05,
+      "grad_norm": 0.4042195975780487,
+      "learning_rate": 0.000182771285475793,
+      "loss": 0.6142,
+      "step": 264
+    },
+    {
+      "epoch": 9.301745191677816e-05,
+      "grad_norm": 0.3474234342575073,
+      "learning_rate": 0.0001827045075125209,
+      "loss": 0.6035,
+      "step": 265
+    },
+    {
+      "epoch": 9.336846116929431e-05,
+      "grad_norm": 0.337091326713562,
+      "learning_rate": 0.00018263772954924876,
+      "loss": 0.6107,
+      "step": 266
+    },
+    {
+      "epoch": 9.371947042181045e-05,
+      "grad_norm": 0.3313732445240021,
+      "learning_rate": 0.00018257095158597664,
+      "loss": 0.6491,
+      "step": 267
+    },
+    {
+      "epoch": 9.40704796743266e-05,
+      "grad_norm": 0.3931679129600525,
+      "learning_rate": 0.0001825041736227045,
+      "loss": 0.5492,
+      "step": 268
+    },
+    {
+      "epoch": 9.442148892684273e-05,
+      "grad_norm": 0.5848420262336731,
+      "learning_rate": 0.00018243739565943238,
+      "loss": 0.7091,
+      "step": 269
+    },
+    {
+      "epoch": 9.477249817935888e-05,
+      "grad_norm": 0.4851846992969513,
+      "learning_rate": 0.00018237061769616028,
+      "loss": 0.5856,
+      "step": 270
+    },
+    {
+      "epoch": 9.512350743187503e-05,
+      "grad_norm": 0.3434993326663971,
+      "learning_rate": 0.00018230383973288816,
+      "loss": 0.5085,
+      "step": 271
+    },
+    {
+      "epoch": 9.547451668439117e-05,
+      "grad_norm": 0.2978988587856293,
+      "learning_rate": 0.00018223706176961603,
+      "loss": 0.481,
+      "step": 272
+    },
+    {
+      "epoch": 9.582552593690732e-05,
+      "grad_norm": 0.34215858578681946,
+      "learning_rate": 0.0001821702838063439,
+      "loss": 0.5723,
+      "step": 273
+    },
+    {
+      "epoch": 9.617653518942345e-05,
+      "grad_norm": 0.43445509672164917,
+      "learning_rate": 0.00018210350584307178,
+      "loss": 0.5691,
+      "step": 274
+    },
+    {
+      "epoch": 9.65275444419396e-05,
+      "grad_norm": 0.36094945669174194,
+      "learning_rate": 0.00018203672787979968,
+      "loss": 0.5543,
+      "step": 275
+    },
+    {
+      "epoch": 9.687855369445575e-05,
+      "grad_norm": 0.386106014251709,
+      "learning_rate": 0.00018196994991652755,
+      "loss": 0.5561,
+      "step": 276
+    },
+    {
+      "epoch": 9.722956294697189e-05,
+      "grad_norm": 0.36676689982414246,
+      "learning_rate": 0.00018190317195325542,
+      "loss": 0.5479,
+      "step": 277
+    },
+    {
+      "epoch": 9.758057219948804e-05,
+      "grad_norm": 0.37988394498825073,
+      "learning_rate": 0.00018183639398998332,
+      "loss": 0.5772,
+      "step": 278
+    },
+    {
+      "epoch": 9.793158145200417e-05,
+      "grad_norm": 0.4024789035320282,
+      "learning_rate": 0.0001817696160267112,
+      "loss": 0.6065,
+      "step": 279
+    },
+    {
+      "epoch": 9.828259070452032e-05,
+      "grad_norm": 0.3697255551815033,
+      "learning_rate": 0.0001817028380634391,
+      "loss": 0.5021,
+      "step": 280
+    },
+    {
+      "epoch": 9.863359995703647e-05,
+      "grad_norm": 0.43579426407814026,
+      "learning_rate": 0.00018163606010016697,
+      "loss": 0.555,
+      "step": 281
+    },
+    {
+      "epoch": 9.898460920955261e-05,
+      "grad_norm": 0.4760832190513611,
+      "learning_rate": 0.00018156928213689484,
+      "loss": 0.6438,
+      "step": 282
+    },
+    {
+      "epoch": 9.933561846206876e-05,
+      "grad_norm": 0.45258408784866333,
+      "learning_rate": 0.00018150250417362272,
+      "loss": 0.4717,
+      "step": 283
+    },
+    {
+      "epoch": 9.96866277145849e-05,
+      "grad_norm": 0.428108274936676,
+      "learning_rate": 0.0001814357262103506,
+      "loss": 0.6029,
+      "step": 284
+    },
+    {
+      "epoch": 0.00010003763696710104,
+      "grad_norm": 0.3999852240085602,
+      "learning_rate": 0.00018136894824707846,
+      "loss": 0.4524,
+      "step": 285
+    },
+    {
+      "epoch": 0.0001003886462196172,
+      "grad_norm": 0.44319403171539307,
+      "learning_rate": 0.00018130217028380636,
+      "loss": 0.6619,
+      "step": 286
+    },
+    {
+      "epoch": 0.00010073965547213333,
+      "grad_norm": 0.43008357286453247,
+      "learning_rate": 0.00018123539232053424,
+      "loss": 0.6105,
+      "step": 287
+    },
+    {
+      "epoch": 0.00010109066472464948,
+      "grad_norm": 0.38037821650505066,
+      "learning_rate": 0.0001811686143572621,
+      "loss": 0.6649,
+      "step": 288
+    },
+    {
+      "epoch": 0.00010144167397716562,
+      "grad_norm": 0.3713517487049103,
+      "learning_rate": 0.00018110183639398998,
+      "loss": 0.6381,
+      "step": 289
+    },
+    {
+      "epoch": 0.00010179268322968176,
+      "grad_norm": 0.3437170386314392,
+      "learning_rate": 0.00018103505843071786,
+      "loss": 0.4563,
+      "step": 290
+    },
+    {
+      "epoch": 0.00010214369248219791,
+      "grad_norm": 0.3661468029022217,
+      "learning_rate": 0.00018096828046744576,
+      "loss": 0.606,
+      "step": 291
+    },
+    {
+      "epoch": 0.00010249470173471405,
+      "grad_norm": 0.36346200108528137,
+      "learning_rate": 0.00018090150250417363,
+      "loss": 0.5895,
+      "step": 292
+    },
+    {
+      "epoch": 0.0001028457109872302,
+      "grad_norm": 0.31052225828170776,
+      "learning_rate": 0.0001808347245409015,
+      "loss": 0.4409,
+      "step": 293
+    },
+    {
+      "epoch": 0.00010319672023974634,
+      "grad_norm": 0.37012970447540283,
+      "learning_rate": 0.00018076794657762938,
+      "loss": 0.505,
+      "step": 294
+    },
+    {
+      "epoch": 0.00010354772949226248,
+      "grad_norm": 0.3958667814731598,
+      "learning_rate": 0.00018070116861435728,
+      "loss": 0.5371,
+      "step": 295
+    },
+    {
+      "epoch": 0.00010389873874477863,
+      "grad_norm": 0.4892179071903229,
+      "learning_rate": 0.00018063439065108515,
+      "loss": 0.6737,
+      "step": 296
+    },
+    {
+      "epoch": 0.00010424974799729477,
+      "grad_norm": 0.41874751448631287,
+      "learning_rate": 0.00018056761268781305,
+      "loss": 0.651,
+      "step": 297
+    },
+    {
+      "epoch": 0.00010460075724981092,
+      "grad_norm": 0.4167911410331726,
+      "learning_rate": 0.00018050083472454092,
+      "loss": 0.5531,
+      "step": 298
+    },
+    {
+      "epoch": 0.00010495176650232706,
+      "grad_norm": 0.3758225440979004,
+      "learning_rate": 0.0001804340567612688,
+      "loss": 0.6285,
+      "step": 299
+    },
+    {
+      "epoch": 0.0001053027757548432,
+      "grad_norm": 0.3688598573207855,
+      "learning_rate": 0.00018036727879799667,
+      "loss": 0.5219,
+      "step": 300
+    },
+    {
+      "epoch": 0.00010565378500735934,
+      "grad_norm": 0.3501751124858856,
+      "learning_rate": 0.00018030050083472454,
+      "loss": 0.6351,
+      "step": 301
+    },
+    {
+      "epoch": 0.00010600479425987549,
+      "grad_norm": 0.42876511812210083,
+      "learning_rate": 0.00018023372287145244,
+      "loss": 0.544,
+      "step": 302
+    },
+    {
+      "epoch": 0.00010635580351239164,
+      "grad_norm": 0.47046172618865967,
+      "learning_rate": 0.00018016694490818031,
+      "loss": 0.6304,
+      "step": 303
+    },
+    {
+      "epoch": 0.00010670681276490778,
+      "grad_norm": 0.402271032333374,
+      "learning_rate": 0.0001801001669449082,
+      "loss": 0.5039,
+      "step": 304
+    },
+    {
+      "epoch": 0.00010705782201742393,
+      "grad_norm": 0.41232413053512573,
+      "learning_rate": 0.00018003338898163606,
+      "loss": 0.5892,
+      "step": 305
+    },
+    {
+      "epoch": 0.00010740883126994006,
+      "grad_norm": 0.3628154993057251,
+      "learning_rate": 0.00017996661101836393,
+      "loss": 0.5737,
+      "step": 306
+    },
+    {
+      "epoch": 0.00010775984052245621,
+      "grad_norm": 0.4291020631790161,
+      "learning_rate": 0.00017989983305509183,
+      "loss": 0.6597,
+      "step": 307
+    },
+    {
+      "epoch": 0.00010811084977497236,
+      "grad_norm": 0.33218181133270264,
+      "learning_rate": 0.0001798330550918197,
+      "loss": 0.5726,
+      "step": 308
+    },
+    {
+      "epoch": 0.0001084618590274885,
+      "grad_norm": 0.3439387381076813,
+      "learning_rate": 0.00017976627712854758,
+      "loss": 0.5615,
+      "step": 309
+    },
+    {
+      "epoch": 0.00010881286828000465,
+      "grad_norm": 0.3523644208908081,
+      "learning_rate": 0.00017969949916527545,
+      "loss": 0.4968,
+      "step": 310
+    },
+    {
+      "epoch": 0.00010916387753252078,
+      "grad_norm": 0.4045630991458893,
+      "learning_rate": 0.00017963272120200333,
+      "loss": 0.6425,
+      "step": 311
+    },
+    {
+      "epoch": 0.00010951488678503693,
+      "grad_norm": 0.3726767599582672,
+      "learning_rate": 0.00017956594323873123,
+      "loss": 0.6575,
+      "step": 312
+    },
+    {
+      "epoch": 0.00010986589603755308,
+      "grad_norm": 0.32131972908973694,
+      "learning_rate": 0.0001794991652754591,
+      "loss": 0.5146,
+      "step": 313
+    },
+    {
+      "epoch": 0.00011021690529006922,
+      "grad_norm": 0.5013764500617981,
+      "learning_rate": 0.000179432387312187,
+      "loss": 0.53,
+      "step": 314
+    },
+    {
+      "epoch": 0.00011056791454258537,
+      "grad_norm": 0.36830246448516846,
+      "learning_rate": 0.00017936560934891487,
+      "loss": 0.6291,
+      "step": 315
+    },
+    {
+      "epoch": 0.0001109189237951015,
+      "grad_norm": 0.3587378263473511,
+      "learning_rate": 0.00017929883138564275,
+      "loss": 0.4954,
+      "step": 316
+    },
+    {
+      "epoch": 0.00011126993304761765,
+      "grad_norm": 0.3480195105075836,
+      "learning_rate": 0.00017923205342237062,
+      "loss": 0.606,
+      "step": 317
+    },
+    {
+      "epoch": 0.0001116209423001338,
+      "grad_norm": 0.38415858149528503,
+      "learning_rate": 0.00017916527545909852,
+      "loss": 0.7281,
+      "step": 318
+    },
+    {
+      "epoch": 0.00011197195155264994,
+      "grad_norm": 0.35853826999664307,
+      "learning_rate": 0.0001790984974958264,
+      "loss": 0.5851,
+      "step": 319
+    },
+    {
+      "epoch": 0.00011232296080516609,
+      "grad_norm": 0.42092210054397583,
+      "learning_rate": 0.00017903171953255427,
+      "loss": 0.5324,
+      "step": 320
+    },
+    {
+      "epoch": 0.00011267397005768222,
+      "grad_norm": 0.34538987278938293,
+      "learning_rate": 0.00017896494156928214,
+      "loss": 0.6387,
+      "step": 321
+    },
+    {
+      "epoch": 0.00011302497931019837,
+      "grad_norm": 0.38299745321273804,
+      "learning_rate": 0.00017889816360601,
+      "loss": 0.6013,
+      "step": 322
+    },
+    {
+      "epoch": 0.00011337598856271452,
+      "grad_norm": 0.32100436091423035,
+      "learning_rate": 0.0001788313856427379,
+      "loss": 0.4627,
+      "step": 323
+    },
+    {
+      "epoch": 0.00011372699781523066,
+      "grad_norm": 0.3458426594734192,
+      "learning_rate": 0.0001787646076794658,
+      "loss": 0.5865,
+      "step": 324
+    },
+    {
+      "epoch": 0.0001140780070677468,
+      "grad_norm": 0.33228665590286255,
+      "learning_rate": 0.00017869782971619366,
+      "loss": 0.4611,
+      "step": 325
+    },
+    {
+      "epoch": 0.00011442901632026294,
+      "grad_norm": 0.38747021555900574,
+      "learning_rate": 0.00017863105175292153,
+      "loss": 0.5777,
+      "step": 326
+    },
+    {
+      "epoch": 0.00011478002557277909,
+      "grad_norm": 0.3888608515262604,
+      "learning_rate": 0.0001785642737896494,
+      "loss": 0.5664,
+      "step": 327
+    },
+    {
+      "epoch": 0.00011513103482529524,
+      "grad_norm": 0.4084737002849579,
+      "learning_rate": 0.0001784974958263773,
+      "loss": 0.5939,
+      "step": 328
+    },
+    {
+      "epoch": 0.00011548204407781138,
+      "grad_norm": 0.4964492917060852,
+      "learning_rate": 0.00017843071786310518,
+      "loss": 0.6256,
+      "step": 329
+    },
+    {
+      "epoch": 0.00011583305333032753,
+      "grad_norm": 0.37329745292663574,
+      "learning_rate": 0.00017836393989983305,
+      "loss": 0.5388,
+      "step": 330
+    },
+    {
+      "epoch": 0.00011618406258284366,
+      "grad_norm": 0.37680140137672424,
+      "learning_rate": 0.00017829716193656095,
+      "loss": 0.6203,
+      "step": 331
+    },
+    {
+      "epoch": 0.00011653507183535981,
+      "grad_norm": 0.4162957966327667,
+      "learning_rate": 0.00017823038397328883,
+      "loss": 0.6478,
+      "step": 332
+    },
+    {
+      "epoch": 0.00011688608108787596,
+      "grad_norm": 0.3473896086215973,
+      "learning_rate": 0.0001781636060100167,
+      "loss": 0.589,
+      "step": 333
+    },
+    {
+      "epoch": 0.0001172370903403921,
+      "grad_norm": 0.4039511978626251,
+      "learning_rate": 0.0001780968280467446,
+      "loss": 0.5681,
+      "step": 334
+    },
+    {
+      "epoch": 0.00011758809959290825,
+      "grad_norm": 0.3135715425014496,
+      "learning_rate": 0.00017803005008347247,
+      "loss": 0.5069,
+      "step": 335
+    },
+    {
+      "epoch": 0.00011793910884542438,
+      "grad_norm": 0.4296559989452362,
+      "learning_rate": 0.00017796327212020035,
+      "loss": 0.5413,
+      "step": 336
+    },
+    {
+      "epoch": 0.00011829011809794053,
+      "grad_norm": 0.4197536110877991,
+      "learning_rate": 0.00017789649415692822,
+      "loss": 0.694,
+      "step": 337
+    },
+    {
+      "epoch": 0.00011864112735045668,
+      "grad_norm": 0.3633468449115753,
+      "learning_rate": 0.0001778297161936561,
+      "loss": 0.5475,
+      "step": 338
+    },
+    {
+      "epoch": 0.00011899213660297282,
+      "grad_norm": 0.2867147922515869,
+      "learning_rate": 0.000177762938230384,
+      "loss": 0.485,
+      "step": 339
+    },
+    {
+      "epoch": 0.00011934314585548897,
+      "grad_norm": 0.3445490300655365,
+      "learning_rate": 0.00017769616026711187,
+      "loss": 0.6304,
+      "step": 340
+    },
+    {
+      "epoch": 0.0001196941551080051,
+      "grad_norm": 0.31692221760749817,
+      "learning_rate": 0.00017762938230383974,
+      "loss": 0.5804,
+      "step": 341
+    },
+    {
+      "epoch": 0.00012004516436052125,
+      "grad_norm": 0.31391167640686035,
+      "learning_rate": 0.0001775626043405676,
+      "loss": 0.5945,
+      "step": 342
+    },
+    {
+      "epoch": 0.0001203961736130374,
+      "grad_norm": 0.3484472632408142,
+      "learning_rate": 0.00017749582637729548,
+      "loss": 0.6577,
+      "step": 343
+    },
+    {
+      "epoch": 0.00012074718286555354,
+      "grad_norm": 0.37430596351623535,
+      "learning_rate": 0.00017742904841402339,
+      "loss": 0.6854,
+      "step": 344
+    },
+    {
+      "epoch": 0.00012109819211806969,
+      "grad_norm": 0.34305211901664734,
+      "learning_rate": 0.00017736227045075126,
+      "loss": 0.5123,
+      "step": 345
+    },
+    {
+      "epoch": 0.00012144920137058582,
+      "grad_norm": 0.3398534059524536,
+      "learning_rate": 0.00017729549248747913,
+      "loss": 0.5602,
+      "step": 346
+    },
+    {
+      "epoch": 0.00012180021062310197,
+      "grad_norm": 0.4278014600276947,
+      "learning_rate": 0.000177228714524207,
+      "loss": 0.5152,
+      "step": 347
+    },
+    {
+      "epoch": 0.00012215121987561812,
+      "grad_norm": 0.4011085629463196,
+      "learning_rate": 0.0001771619365609349,
+      "loss": 0.6217,
+      "step": 348
+    },
+    {
+      "epoch": 0.00012250222912813427,
+      "grad_norm": 0.3425695598125458,
+      "learning_rate": 0.00017709515859766278,
+      "loss": 0.5037,
+      "step": 349
+    },
+    {
+      "epoch": 0.0001228532383806504,
+      "grad_norm": 0.34036242961883545,
+      "learning_rate": 0.00017702838063439068,
+      "loss": 0.649,
+      "step": 350
+    },
+    {
+      "epoch": 0.00012320424763316654,
+      "grad_norm": 0.5631874203681946,
+      "learning_rate": 0.00017696160267111855,
+      "loss": 0.5656,
+      "step": 351
+    },
+    {
+      "epoch": 0.0001235552568856827,
+      "grad_norm": 0.4195176661014557,
+      "learning_rate": 0.00017689482470784642,
+      "loss": 0.6899,
+      "step": 352
+    },
+    {
+      "epoch": 0.00012390626613819884,
+      "grad_norm": 0.41814154386520386,
+      "learning_rate": 0.0001768280467445743,
+      "loss": 0.551,
+      "step": 353
+    },
+    {
+      "epoch": 0.000124257275390715,
+      "grad_norm": 0.3374340534210205,
+      "learning_rate": 0.00017676126878130217,
+      "loss": 0.7022,
+      "step": 354
+    },
+    {
+      "epoch": 0.00012460828464323112,
+      "grad_norm": 0.41464921832084656,
+      "learning_rate": 0.00017669449081803007,
+      "loss": 0.5301,
+      "step": 355
+    },
+    {
+      "epoch": 0.00012495929389574726,
+      "grad_norm": 0.4443178176879883,
+      "learning_rate": 0.00017662771285475794,
+      "loss": 0.5487,
+      "step": 356
+    },
+    {
+      "epoch": 0.00012531030314826341,
+      "grad_norm": 0.3389272093772888,
+      "learning_rate": 0.00017656093489148582,
+      "loss": 0.581,
+      "step": 357
+    },
+    {
+      "epoch": 0.00012566131240077956,
+      "grad_norm": 0.29650986194610596,
+      "learning_rate": 0.0001764941569282137,
+      "loss": 0.5801,
+      "step": 358
+    },
+    {
+      "epoch": 0.0001260123216532957,
+      "grad_norm": 0.40271905064582825,
+      "learning_rate": 0.00017642737896494156,
+      "loss": 0.6738,
+      "step": 359
+    },
+    {
+      "epoch": 0.00012636333090581184,
+      "grad_norm": 0.352225661277771,
+      "learning_rate": 0.00017636060100166946,
+      "loss": 0.5727,
+      "step": 360
+    },
+    {
+      "epoch": 0.00012671434015832798,
+      "grad_norm": 0.3469563126564026,
+      "learning_rate": 0.00017629382303839734,
+      "loss": 0.5188,
+      "step": 361
+    },
+    {
+      "epoch": 0.00012706534941084413,
+      "grad_norm": 0.30644670128822327,
+      "learning_rate": 0.0001762270450751252,
+      "loss": 0.497,
+      "step": 362
+    },
+    {
+      "epoch": 0.00012741635866336028,
+      "grad_norm": 0.3472917377948761,
+      "learning_rate": 0.00017616026711185308,
+      "loss": 0.6363,
+      "step": 363
+    },
+    {
+      "epoch": 0.00012776736791587643,
+      "grad_norm": 0.37184756994247437,
+      "learning_rate": 0.00017609348914858096,
+      "loss": 0.5223,
+      "step": 364
+    },
+    {
+      "epoch": 0.00012811837716839256,
+      "grad_norm": 0.3247138559818268,
+      "learning_rate": 0.00017602671118530886,
+      "loss": 0.5457,
+      "step": 365
+    },
+    {
+      "epoch": 0.0001284693864209087,
+      "grad_norm": 0.5236158967018127,
+      "learning_rate": 0.00017595993322203673,
+      "loss": 0.615,
+      "step": 366
+    },
+    {
+      "epoch": 0.00012882039567342485,
+      "grad_norm": 0.33708465099334717,
+      "learning_rate": 0.00017589315525876463,
+      "loss": 0.6163,
+      "step": 367
+    },
+    {
+      "epoch": 0.000129171404925941,
+      "grad_norm": 0.33848705887794495,
+      "learning_rate": 0.0001758263772954925,
+      "loss": 0.4229,
+      "step": 368
+    },
+    {
+      "epoch": 0.00012952241417845715,
+      "grad_norm": 0.5827682018280029,
+      "learning_rate": 0.00017575959933222038,
+      "loss": 0.5668,
+      "step": 369
+    },
+    {
+      "epoch": 0.00012987342343097328,
+      "grad_norm": 0.36217448115348816,
+      "learning_rate": 0.00017569282136894825,
+      "loss": 0.4983,
+      "step": 370
+    },
+    {
+      "epoch": 0.00013022443268348943,
+      "grad_norm": 0.329414963722229,
+      "learning_rate": 0.00017562604340567615,
+      "loss": 0.4281,
+      "step": 371
+    },
+    {
+      "epoch": 0.00013057544193600557,
+      "grad_norm": 0.36746612191200256,
+      "learning_rate": 0.00017555926544240402,
+      "loss": 0.6629,
+      "step": 372
+    },
+    {
+      "epoch": 0.00013092645118852172,
+      "grad_norm": 0.3954717516899109,
+      "learning_rate": 0.0001754924874791319,
+      "loss": 0.5784,
+      "step": 373
+    },
+    {
+      "epoch": 0.00013127746044103787,
+      "grad_norm": 0.41279932856559753,
+      "learning_rate": 0.00017542570951585977,
+      "loss": 0.5994,
+      "step": 374
+    },
+    {
+      "epoch": 0.000131628469693554,
+      "grad_norm": 0.3019951581954956,
+      "learning_rate": 0.00017535893155258764,
+      "loss": 0.5584,
+      "step": 375
+    },
+    {
+      "epoch": 0.00013197947894607015,
+      "grad_norm": 0.3079768121242523,
+      "learning_rate": 0.00017529215358931554,
+      "loss": 0.5904,
+      "step": 376
+    },
+    {
+      "epoch": 0.0001323304881985863,
+      "grad_norm": 0.5678027272224426,
+      "learning_rate": 0.00017522537562604342,
+      "loss": 0.6441,
+      "step": 377
+    },
+    {
+      "epoch": 0.00013268149745110244,
+      "grad_norm": 0.38624581694602966,
+      "learning_rate": 0.0001751585976627713,
+      "loss": 0.5582,
+      "step": 378
+    },
+    {
+      "epoch": 0.0001330325067036186,
+      "grad_norm": 0.4368002712726593,
+      "learning_rate": 0.00017509181969949916,
+      "loss": 0.686,
+      "step": 379
+    },
+    {
+      "epoch": 0.00013338351595613472,
+      "grad_norm": 0.3409269154071808,
+      "learning_rate": 0.00017502504173622704,
+      "loss": 0.582,
+      "step": 380
+    },
+    {
+      "epoch": 0.00013373452520865087,
+      "grad_norm": 0.3772698938846588,
+      "learning_rate": 0.0001749582637729549,
+      "loss": 0.5314,
+      "step": 381
+    },
+    {
+      "epoch": 0.00013408553446116702,
+      "grad_norm": 0.3791707158088684,
+      "learning_rate": 0.0001748914858096828,
+      "loss": 0.6143,
+      "step": 382
+    },
+    {
+      "epoch": 0.00013443654371368317,
+      "grad_norm": 0.4441101551055908,
+      "learning_rate": 0.0001748247078464107,
+      "loss": 0.5726,
+      "step": 383
+    },
+    {
+      "epoch": 0.0001347875529661993,
+      "grad_norm": 0.4160211980342865,
+      "learning_rate": 0.00017475792988313858,
+      "loss": 0.6003,
+      "step": 384
+    },
+    {
+      "epoch": 0.00013513856221871544,
+      "grad_norm": 0.41698628664016724,
+      "learning_rate": 0.00017469115191986646,
+      "loss": 0.4539,
+      "step": 385
+    },
+    {
+      "epoch": 0.00013548957147123159,
+      "grad_norm": 0.337007999420166,
+      "learning_rate": 0.00017462437395659433,
+      "loss": 0.5176,
+      "step": 386
+    },
+    {
+      "epoch": 0.00013584058072374774,
+      "grad_norm": 0.30926409363746643,
+      "learning_rate": 0.00017455759599332223,
+      "loss": 0.6072,
+      "step": 387
+    },
+    {
+      "epoch": 0.00013619158997626389,
+      "grad_norm": 0.3663052022457123,
+      "learning_rate": 0.0001744908180300501,
+      "loss": 0.538,
+      "step": 388
+    },
+    {
+      "epoch": 0.00013654259922878,
+      "grad_norm": 0.3410074710845947,
+      "learning_rate": 0.00017442404006677798,
+      "loss": 0.5687,
+      "step": 389
+    },
+    {
+      "epoch": 0.00013689360848129616,
+      "grad_norm": 0.5266095399856567,
+      "learning_rate": 0.00017435726210350585,
+      "loss": 0.6685,
+      "step": 390
+    },
+    {
+      "epoch": 0.0001372446177338123,
+      "grad_norm": 0.4020686149597168,
+      "learning_rate": 0.00017429048414023372,
+      "loss": 0.586,
+      "step": 391
+    },
+    {
+      "epoch": 0.00013759562698632846,
+      "grad_norm": 0.39995548129081726,
+      "learning_rate": 0.00017422370617696162,
+      "loss": 0.6958,
+      "step": 392
+    },
+    {
+      "epoch": 0.0001379466362388446,
+      "grad_norm": 0.4024721682071686,
+      "learning_rate": 0.0001741569282136895,
+      "loss": 0.6411,
+      "step": 393
+    },
+    {
+      "epoch": 0.00013829764549136073,
+      "grad_norm": 0.38193392753601074,
+      "learning_rate": 0.00017409015025041737,
+      "loss": 0.5857,
+      "step": 394
+    },
+    {
+      "epoch": 0.00013864865474387688,
+      "grad_norm": 0.39786526560783386,
+      "learning_rate": 0.00017402337228714524,
+      "loss": 0.5215,
+      "step": 395
+    },
+    {
+      "epoch": 0.00013899966399639303,
+      "grad_norm": 0.49223974347114563,
+      "learning_rate": 0.00017395659432387311,
+      "loss": 0.5881,
+      "step": 396
+    },
+    {
+      "epoch": 0.00013935067324890918,
+      "grad_norm": 0.3398894667625427,
+      "learning_rate": 0.00017388981636060101,
+      "loss": 0.5466,
+      "step": 397
+    },
+    {
+      "epoch": 0.00013970168250142533,
+      "grad_norm": 0.34891223907470703,
+      "learning_rate": 0.0001738230383973289,
+      "loss": 0.5901,
+      "step": 398
+    },
+    {
+      "epoch": 0.00014005269175394145,
+      "grad_norm": 0.47644108533859253,
+      "learning_rate": 0.00017375626043405676,
+      "loss": 0.5075,
+      "step": 399
+    },
+    {
+      "epoch": 0.0001404037010064576,
+      "grad_norm": 0.42530229687690735,
+      "learning_rate": 0.00017368948247078466,
+      "loss": 0.663,
+      "step": 400
+    },
+    {
+      "epoch": 0.00014075471025897375,
+      "grad_norm": 0.30858534574508667,
+      "learning_rate": 0.00017362270450751253,
+      "loss": 0.4724,
+      "step": 401
+    },
+    {
+      "epoch": 0.0001411057195114899,
+      "grad_norm": 0.42453449964523315,
+      "learning_rate": 0.0001735559265442404,
+      "loss": 0.6074,
+      "step": 402
+    },
+    {
+      "epoch": 0.00014145672876400605,
+      "grad_norm": 0.3964505195617676,
+      "learning_rate": 0.0001734891485809683,
+      "loss": 0.4913,
+      "step": 403
+    },
+    {
+      "epoch": 0.00014180773801652217,
+      "grad_norm": 0.3317703902721405,
+      "learning_rate": 0.00017342237061769618,
+      "loss": 0.5504,
+      "step": 404
+    },
+    {
+      "epoch": 0.00014215874726903832,
+      "grad_norm": 0.3912264108657837,
+      "learning_rate": 0.00017335559265442405,
+      "loss": 0.6301,
+      "step": 405
+    },
+    {
+      "epoch": 0.00014250975652155447,
+      "grad_norm": 0.3582877218723297,
+      "learning_rate": 0.00017328881469115193,
+      "loss": 0.6205,
+      "step": 406
+    },
+    {
+      "epoch": 0.00014286076577407062,
+      "grad_norm": 0.3691099286079407,
+      "learning_rate": 0.0001732220367278798,
+      "loss": 0.5348,
+      "step": 407
+    },
+    {
+      "epoch": 0.00014321177502658677,
+      "grad_norm": 0.35860803723335266,
+      "learning_rate": 0.0001731552587646077,
+      "loss": 0.6029,
+      "step": 408
+    },
+    {
+      "epoch": 0.0001435627842791029,
+      "grad_norm": 0.3640693426132202,
+      "learning_rate": 0.00017308848080133557,
+      "loss": 0.6673,
+      "step": 409
+    },
+    {
+      "epoch": 0.00014391379353161904,
+      "grad_norm": 0.3550623953342438,
+      "learning_rate": 0.00017302170283806345,
+      "loss": 0.4659,
+      "step": 410
+    },
+    {
+      "epoch": 0.0001442648027841352,
+      "grad_norm": 0.45885637402534485,
+      "learning_rate": 0.00017295492487479132,
+      "loss": 0.4781,
+      "step": 411
+    },
+    {
+      "epoch": 0.00014461581203665134,
+      "grad_norm": 0.3703556954860687,
+      "learning_rate": 0.0001728881469115192,
+      "loss": 0.4829,
+      "step": 412
+    },
+    {
+      "epoch": 0.0001449668212891675,
+      "grad_norm": 0.5436837077140808,
+      "learning_rate": 0.0001728213689482471,
+      "loss": 0.6056,
+      "step": 413
+    },
+    {
+      "epoch": 0.0001453178305416836,
+      "grad_norm": 0.3953244686126709,
+      "learning_rate": 0.00017275459098497497,
+      "loss": 0.4884,
+      "step": 414
+    },
+    {
+      "epoch": 0.00014566883979419976,
+      "grad_norm": 0.34003904461860657,
+      "learning_rate": 0.00017268781302170284,
+      "loss": 0.6014,
+      "step": 415
+    },
+    {
+      "epoch": 0.0001460198490467159,
+      "grad_norm": 0.3463648557662964,
+      "learning_rate": 0.0001726210350584307,
+      "loss": 0.603,
+      "step": 416
+    },
+    {
+      "epoch": 0.00014637085829923206,
+      "grad_norm": 0.4293590784072876,
+      "learning_rate": 0.0001725542570951586,
+      "loss": 0.6686,
+      "step": 417
+    },
+    {
+      "epoch": 0.0001467218675517482,
+      "grad_norm": 0.4243469834327698,
+      "learning_rate": 0.0001724874791318865,
+      "loss": 0.6422,
+      "step": 418
+    },
+    {
+      "epoch": 0.00014707287680426433,
+      "grad_norm": 0.38327839970588684,
+      "learning_rate": 0.0001724207011686144,
+      "loss": 0.5595,
+      "step": 419
+    },
+    {
+      "epoch": 0.00014742388605678048,
+      "grad_norm": 0.31334301829338074,
+      "learning_rate": 0.00017235392320534226,
+      "loss": 0.474,
+      "step": 420
+    },
+    {
+      "epoch": 0.00014777489530929663,
+      "grad_norm": 0.3335350453853607,
+      "learning_rate": 0.00017228714524207013,
+      "loss": 0.6172,
+      "step": 421
+    },
+    {
+      "epoch": 0.00014812590456181278,
+      "grad_norm": 0.373696506023407,
+      "learning_rate": 0.000172220367278798,
+      "loss": 0.6183,
+      "step": 422
+    },
+    {
+      "epoch": 0.00014847691381432893,
+      "grad_norm": 0.45814886689186096,
+      "learning_rate": 0.00017215358931552588,
+      "loss": 0.5059,
+      "step": 423
+    },
+    {
+      "epoch": 0.00014882792306684505,
+      "grad_norm": 0.3578277826309204,
+      "learning_rate": 0.00017208681135225378,
+      "loss": 0.5771,
+      "step": 424
+    },
+    {
+      "epoch": 0.0001491789323193612,
+      "grad_norm": 0.42081883549690247,
+      "learning_rate": 0.00017202003338898165,
+      "loss": 0.5604,
+      "step": 425
+    },
+    {
+      "epoch": 0.00014952994157187735,
+      "grad_norm": 0.3173503875732422,
+      "learning_rate": 0.00017195325542570953,
+      "loss": 0.5738,
+      "step": 426
+    },
+    {
+      "epoch": 0.0001498809508243935,
+      "grad_norm": 0.38292011618614197,
+      "learning_rate": 0.0001718864774624374,
+      "loss": 0.6067,
+      "step": 427
+    },
+    {
+      "epoch": 0.00015023196007690965,
+      "grad_norm": 0.3518977463245392,
+      "learning_rate": 0.00017181969949916527,
+      "loss": 0.5073,
+      "step": 428
+    },
+    {
+      "epoch": 0.00015058296932942577,
+      "grad_norm": 0.5157706141471863,
+      "learning_rate": 0.00017175292153589317,
+      "loss": 0.5496,
+      "step": 429
+    },
+    {
+      "epoch": 0.00015093397858194192,
+      "grad_norm": 0.32064110040664673,
+      "learning_rate": 0.00017168614357262105,
+      "loss": 0.4766,
+      "step": 430
+    },
+    {
+      "epoch": 0.00015128498783445807,
+      "grad_norm": 0.42229798436164856,
+      "learning_rate": 0.00017161936560934892,
+      "loss": 0.5953,
+      "step": 431
+    },
+    {
+      "epoch": 0.00015163599708697422,
+      "grad_norm": 0.4723895192146301,
+      "learning_rate": 0.0001715525876460768,
+      "loss": 0.4783,
+      "step": 432
+    },
+    {
+      "epoch": 0.00015198700633949037,
+      "grad_norm": 0.3841445744037628,
+      "learning_rate": 0.00017148580968280467,
+      "loss": 0.5003,
+      "step": 433
+    },
+    {
+      "epoch": 0.0001523380155920065,
+      "grad_norm": 0.38026461005210876,
+      "learning_rate": 0.00017141903171953257,
+      "loss": 0.5093,
+      "step": 434
+    },
+    {
+      "epoch": 0.00015268902484452264,
+      "grad_norm": 0.37034904956817627,
+      "learning_rate": 0.00017135225375626044,
+      "loss": 0.6158,
+      "step": 435
+    },
+    {
+      "epoch": 0.0001530400340970388,
+      "grad_norm": 0.3876091241836548,
+      "learning_rate": 0.00017128547579298834,
+      "loss": 0.5287,
+      "step": 436
+    },
+    {
+      "epoch": 0.00015339104334955494,
+      "grad_norm": 0.30055519938468933,
+      "learning_rate": 0.0001712186978297162,
+      "loss": 0.5018,
+      "step": 437
+    },
+    {
+      "epoch": 0.0001537420526020711,
+      "grad_norm": 0.36094966530799866,
+      "learning_rate": 0.00017115191986644409,
+      "loss": 0.4961,
+      "step": 438
+    },
+    {
+      "epoch": 0.0001540930618545872,
+      "grad_norm": 0.3300524055957794,
+      "learning_rate": 0.00017108514190317196,
+      "loss": 0.5246,
+      "step": 439
+    },
+    {
+      "epoch": 0.00015444407110710336,
+      "grad_norm": 0.40980783104896545,
+      "learning_rate": 0.00017101836393989986,
+      "loss": 0.5705,
+      "step": 440
+    },
+    {
+      "epoch": 0.0001547950803596195,
+      "grad_norm": 0.3442326784133911,
+      "learning_rate": 0.00017095158597662773,
+      "loss": 0.5595,
+      "step": 441
+    },
+    {
+      "epoch": 0.00015514608961213566,
+      "grad_norm": 0.48015034198760986,
+      "learning_rate": 0.0001708848080133556,
+      "loss": 0.5642,
+      "step": 442
+    },
+    {
+      "epoch": 0.0001554970988646518,
+      "grad_norm": 0.5570142269134521,
+      "learning_rate": 0.00017081803005008348,
+      "loss": 0.6111,
+      "step": 443
+    },
+    {
+      "epoch": 0.00015584810811716793,
+      "grad_norm": 0.30470094084739685,
+      "learning_rate": 0.00017075125208681135,
+      "loss": 0.5151,
+      "step": 444
+    },
+    {
+      "epoch": 0.00015619911736968408,
+      "grad_norm": 0.31946614384651184,
+      "learning_rate": 0.00017068447412353925,
+      "loss": 0.5265,
+      "step": 445
+    },
+    {
+      "epoch": 0.00015655012662220023,
+      "grad_norm": 0.38980719447135925,
+      "learning_rate": 0.00017061769616026712,
+      "loss": 0.575,
+      "step": 446
+    },
+    {
+      "epoch": 0.00015690113587471638,
+      "grad_norm": 0.4077732264995575,
+      "learning_rate": 0.000170550918196995,
+      "loss": 0.5729,
+      "step": 447
+    },
+    {
+      "epoch": 0.00015725214512723253,
+      "grad_norm": 0.38632732629776,
+      "learning_rate": 0.00017048414023372287,
+      "loss": 0.594,
+      "step": 448
+    },
+    {
+      "epoch": 0.00015760315437974865,
+      "grad_norm": 0.37193921208381653,
+      "learning_rate": 0.00017041736227045074,
+      "loss": 0.6062,
+      "step": 449
+    },
+    {
+      "epoch": 0.0001579541636322648,
+      "grad_norm": 0.399029016494751,
+      "learning_rate": 0.00017035058430717862,
+      "loss": 0.4538,
+      "step": 450
+    },
+    {
+      "epoch": 0.00015830517288478095,
+      "grad_norm": 0.37710487842559814,
+      "learning_rate": 0.00017028380634390652,
+      "loss": 0.5615,
+      "step": 451
+    },
+    {
+      "epoch": 0.0001586561821372971,
+      "grad_norm": 0.38591668009757996,
+      "learning_rate": 0.0001702170283806344,
+      "loss": 0.5316,
+      "step": 452
+    },
+    {
+      "epoch": 0.00015900719138981325,
+      "grad_norm": 0.3453538417816162,
+      "learning_rate": 0.0001701502504173623,
+      "loss": 0.4645,
+      "step": 453
+    },
+    {
+      "epoch": 0.00015935820064232937,
+      "grad_norm": 0.34171512722969055,
+      "learning_rate": 0.00017008347245409016,
+      "loss": 0.5856,
+      "step": 454
+    },
+    {
+      "epoch": 0.00015970920989484552,
+      "grad_norm": 0.39591720700263977,
+      "learning_rate": 0.00017001669449081804,
+      "loss": 0.573,
+      "step": 455
+    },
+    {
+      "epoch": 0.00016006021914736167,
+      "grad_norm": 0.4127822816371918,
+      "learning_rate": 0.00016994991652754594,
+      "loss": 0.5183,
+      "step": 456
+    },
+    {
+      "epoch": 0.00016041122839987782,
+      "grad_norm": 0.37893375754356384,
+      "learning_rate": 0.0001698831385642738,
+      "loss": 0.566,
+      "step": 457
+    },
+    {
+      "epoch": 0.00016076223765239397,
+      "grad_norm": 0.33429333567619324,
+      "learning_rate": 0.00016981636060100168,
+      "loss": 0.449,
+      "step": 458
+    },
+    {
+      "epoch": 0.0001611132469049101,
+      "grad_norm": 0.3333180546760559,
+      "learning_rate": 0.00016974958263772956,
+      "loss": 0.4441,
+      "step": 459
+    },
+    {
+      "epoch": 0.00016146425615742624,
+      "grad_norm": 0.3591359257698059,
+      "learning_rate": 0.00016968280467445743,
+      "loss": 0.55,
+      "step": 460
+    },
+    {
+      "epoch": 0.0001618152654099424,
+      "grad_norm": 0.35390427708625793,
+      "learning_rate": 0.00016961602671118533,
+      "loss": 0.6445,
+      "step": 461
+    },
+    {
+      "epoch": 0.00016216627466245854,
+      "grad_norm": 0.42036697268486023,
+      "learning_rate": 0.0001695492487479132,
+      "loss": 0.5411,
+      "step": 462
+    },
+    {
+      "epoch": 0.0001625172839149747,
+      "grad_norm": 0.42147770524024963,
+      "learning_rate": 0.00016948247078464108,
+      "loss": 0.6218,
+      "step": 463
+    },
+    {
+      "epoch": 0.0001628682931674908,
+      "grad_norm": 0.3960399329662323,
+      "learning_rate": 0.00016941569282136895,
+      "loss": 0.6608,
+      "step": 464
+    },
+    {
+      "epoch": 0.00016321930242000696,
+      "grad_norm": 0.39676985144615173,
+      "learning_rate": 0.00016934891485809682,
+      "loss": 0.5838,
+      "step": 465
+    },
+    {
+      "epoch": 0.0001635703116725231,
+      "grad_norm": 0.2839520573616028,
+      "learning_rate": 0.0001692821368948247,
+      "loss": 0.5334,
+      "step": 466
+    },
+    {
+      "epoch": 0.00016392132092503926,
+      "grad_norm": 0.3654347062110901,
+      "learning_rate": 0.0001692153589315526,
+      "loss": 0.6065,
+      "step": 467
+    },
+    {
+      "epoch": 0.0001642723301775554,
+      "grad_norm": 0.3709166646003723,
+      "learning_rate": 0.00016914858096828047,
+      "loss": 0.509,
+      "step": 468
+    },
+    {
+      "epoch": 0.00016462333943007153,
+      "grad_norm": 0.29224780201911926,
+      "learning_rate": 0.00016908180300500834,
+      "loss": 0.5372,
+      "step": 469
+    },
+    {
+      "epoch": 0.00016497434868258768,
+      "grad_norm": 0.34979283809661865,
+      "learning_rate": 0.00016901502504173624,
+      "loss": 0.3968,
+      "step": 470
+    },
+    {
+      "epoch": 0.00016532535793510383,
+      "grad_norm": 0.34580183029174805,
+      "learning_rate": 0.00016894824707846412,
+      "loss": 0.6032,
+      "step": 471
+    },
+    {
+      "epoch": 0.00016567636718761998,
+      "grad_norm": 0.39046213030815125,
+      "learning_rate": 0.00016888146911519202,
+      "loss": 0.5628,
+      "step": 472
+    },
+    {
+      "epoch": 0.00016602737644013613,
+      "grad_norm": 0.35301411151885986,
+      "learning_rate": 0.0001688146911519199,
+      "loss": 0.607,
+      "step": 473
+    },
+    {
+      "epoch": 0.00016637838569265225,
+      "grad_norm": 0.4572748839855194,
+      "learning_rate": 0.00016874791318864776,
+      "loss": 0.5018,
+      "step": 474
+    },
+    {
+      "epoch": 0.0001667293949451684,
+      "grad_norm": 0.38230374455451965,
+      "learning_rate": 0.00016868113522537564,
+      "loss": 0.5026,
+      "step": 475
+    },
+    {
+      "epoch": 0.00016708040419768455,
+      "grad_norm": 0.37066343426704407,
+      "learning_rate": 0.0001686143572621035,
+      "loss": 0.5819,
+      "step": 476
+    },
+    {
+      "epoch": 0.0001674314134502007,
+      "grad_norm": 0.3658660054206848,
+      "learning_rate": 0.0001685475792988314,
+      "loss": 0.6825,
+      "step": 477
+    },
+    {
+      "epoch": 0.00016778242270271685,
+      "grad_norm": 0.42174890637397766,
+      "learning_rate": 0.00016848080133555928,
+      "loss": 0.6065,
+      "step": 478
+    },
+    {
+      "epoch": 0.00016813343195523297,
+      "grad_norm": 0.3462882936000824,
+      "learning_rate": 0.00016841402337228716,
+      "loss": 0.5888,
+      "step": 479
+    },
+    {
+      "epoch": 0.00016848444120774912,
+      "grad_norm": 0.44681960344314575,
+      "learning_rate": 0.00016834724540901503,
+      "loss": 0.4987,
+      "step": 480
+    },
+    {
+      "epoch": 0.00016883545046026527,
+      "grad_norm": 0.3535650372505188,
+      "learning_rate": 0.0001682804674457429,
+      "loss": 0.6478,
+      "step": 481
+    },
+    {
+      "epoch": 0.00016918645971278142,
+      "grad_norm": 0.3357018232345581,
+      "learning_rate": 0.00016821368948247077,
+      "loss": 0.4949,
+      "step": 482
+    },
+    {
+      "epoch": 0.00016953746896529757,
+      "grad_norm": 0.42756739258766174,
+      "learning_rate": 0.00016814691151919868,
+      "loss": 0.6475,
+      "step": 483
+    },
+    {
+      "epoch": 0.0001698884782178137,
+      "grad_norm": 0.36174866557121277,
+      "learning_rate": 0.00016808013355592655,
+      "loss": 0.598,
+      "step": 484
+    },
+    {
+      "epoch": 0.00017023948747032984,
+      "grad_norm": 0.37115278840065,
+      "learning_rate": 0.00016801335559265442,
+      "loss": 0.6215,
+      "step": 485
+    },
+    {
+      "epoch": 0.000170590496722846,
+      "grad_norm": 0.340249627828598,
+      "learning_rate": 0.0001679465776293823,
+      "loss": 0.5702,
+      "step": 486
+    },
+    {
+      "epoch": 0.00017094150597536214,
+      "grad_norm": 0.31226348876953125,
+      "learning_rate": 0.0001678797996661102,
+      "loss": 0.6531,
+      "step": 487
+    },
+    {
+      "epoch": 0.0001712925152278783,
+      "grad_norm": 0.35571998357772827,
+      "learning_rate": 0.00016781302170283807,
+      "loss": 0.6406,
+      "step": 488
+    },
+    {
+      "epoch": 0.00017164352448039441,
+      "grad_norm": 0.4167378842830658,
+      "learning_rate": 0.00016774624373956597,
+      "loss": 0.5111,
+      "step": 489
+    },
+    {
+      "epoch": 0.00017199453373291056,
+      "grad_norm": 0.292304128408432,
+      "learning_rate": 0.00016767946577629384,
+      "loss": 0.6643,
+      "step": 490
+    },
+    {
+      "epoch": 0.0001723455429854267,
+      "grad_norm": 0.38789069652557373,
+      "learning_rate": 0.00016761268781302171,
+      "loss": 0.4542,
+      "step": 491
+    },
+    {
+      "epoch": 0.00017269655223794286,
+      "grad_norm": 0.33764714002609253,
+      "learning_rate": 0.0001675459098497496,
+      "loss": 0.4158,
+      "step": 492
+    },
+    {
+      "epoch": 0.00017304756149045898,
+      "grad_norm": 0.34849148988723755,
+      "learning_rate": 0.0001674791318864775,
+      "loss": 0.4737,
+      "step": 493
+    },
+    {
+      "epoch": 0.00017339857074297513,
+      "grad_norm": 0.2921352684497833,
+      "learning_rate": 0.00016741235392320536,
+      "loss": 0.679,
+      "step": 494
+    },
+    {
+      "epoch": 0.00017374957999549128,
+      "grad_norm": 0.33746641874313354,
+      "learning_rate": 0.00016734557595993323,
+      "loss": 0.4957,
+      "step": 495
+    },
+    {
+      "epoch": 0.00017410058924800743,
+      "grad_norm": 0.4029395878314972,
+      "learning_rate": 0.0001672787979966611,
+      "loss": 0.6708,
+      "step": 496
+    },
+    {
+      "epoch": 0.00017445159850052358,
+      "grad_norm": 0.440033882856369,
+      "learning_rate": 0.00016721202003338898,
+      "loss": 0.5889,
+      "step": 497
+    },
+    {
+      "epoch": 0.0001748026077530397,
+      "grad_norm": 0.330692857503891,
+      "learning_rate": 0.00016714524207011685,
+      "loss": 0.5942,
+      "step": 498
+    },
+    {
+      "epoch": 0.00017515361700555585,
+      "grad_norm": 0.3111809492111206,
+      "learning_rate": 0.00016707846410684475,
+      "loss": 0.5506,
+      "step": 499
+    },
+    {
+      "epoch": 0.000175504626258072,
+      "grad_norm": 0.38885676860809326,
+      "learning_rate": 0.00016701168614357263,
+      "loss": 0.4713,
+      "step": 500
+    },
+    {
+      "epoch": 0.00017585563551058815,
+      "grad_norm": 0.3697550296783447,
+      "learning_rate": 0.0001669449081803005,
+      "loss": 0.5955,
+      "step": 501
+    },
+    {
+      "epoch": 0.0001762066447631043,
+      "grad_norm": 0.35807061195373535,
+      "learning_rate": 0.00016687813021702837,
+      "loss": 0.555,
+      "step": 502
+    },
+    {
+      "epoch": 0.00017655765401562043,
+      "grad_norm": 0.44033464789390564,
+      "learning_rate": 0.00016681135225375625,
+      "loss": 0.5668,
+      "step": 503
+    },
+    {
+      "epoch": 0.00017690866326813657,
+      "grad_norm": 0.3363400399684906,
+      "learning_rate": 0.00016674457429048415,
+      "loss": 0.6176,
+      "step": 504
+    },
+    {
+      "epoch": 0.00017725967252065272,
+      "grad_norm": 0.31457507610321045,
+      "learning_rate": 0.00016667779632721202,
+      "loss": 0.6524,
+      "step": 505
+    },
+    {
+      "epoch": 0.00017761068177316887,
+      "grad_norm": 0.38115641474723816,
+      "learning_rate": 0.00016661101836393992,
+      "loss": 0.5848,
+      "step": 506
+    },
+    {
+      "epoch": 0.00017796169102568502,
+      "grad_norm": 0.3387603759765625,
+      "learning_rate": 0.0001665442404006678,
+      "loss": 0.6992,
+      "step": 507
+    },
+    {
+      "epoch": 0.00017831270027820115,
+      "grad_norm": 0.31671345233917236,
+      "learning_rate": 0.00016647746243739567,
+      "loss": 0.5744,
+      "step": 508
+    },
+    {
+      "epoch": 0.0001786637095307173,
+      "grad_norm": 0.3776471018791199,
+      "learning_rate": 0.00016641068447412357,
+      "loss": 0.622,
+      "step": 509
+    },
+    {
+      "epoch": 0.00017901471878323344,
+      "grad_norm": 0.37572941184043884,
+      "learning_rate": 0.00016634390651085144,
+      "loss": 0.5259,
+      "step": 510
+    },
+    {
+      "epoch": 0.0001793657280357496,
+      "grad_norm": 0.3335510194301605,
+      "learning_rate": 0.0001662771285475793,
+      "loss": 0.547,
+      "step": 511
+    },
+    {
+      "epoch": 0.00017971673728826574,
+      "grad_norm": 0.33241015672683716,
+      "learning_rate": 0.00016621035058430719,
+      "loss": 0.5827,
+      "step": 512
+    },
+    {
+      "epoch": 0.00018006774654078187,
+      "grad_norm": 0.3761122524738312,
+      "learning_rate": 0.00016614357262103506,
+      "loss": 0.6962,
+      "step": 513
+    },
+    {
+      "epoch": 0.00018041875579329802,
+      "grad_norm": 0.4172234833240509,
+      "learning_rate": 0.00016607679465776293,
+      "loss": 0.4922,
+      "step": 514
+    },
+    {
+      "epoch": 0.00018076976504581416,
+      "grad_norm": 0.45372599363327026,
+      "learning_rate": 0.00016601001669449083,
+      "loss": 0.5804,
+      "step": 515
+    },
+    {
+      "epoch": 0.00018112077429833031,
+      "grad_norm": 0.3854759931564331,
+      "learning_rate": 0.0001659432387312187,
+      "loss": 0.6026,
+      "step": 516
+    },
+    {
+      "epoch": 0.00018147178355084646,
+      "grad_norm": 0.3399171829223633,
+      "learning_rate": 0.00016587646076794658,
+      "loss": 0.4773,
+      "step": 517
+    },
+    {
+      "epoch": 0.00018182279280336259,
+      "grad_norm": 0.36649778485298157,
+      "learning_rate": 0.00016580968280467445,
+      "loss": 0.59,
+      "step": 518
+    },
+    {
+      "epoch": 0.00018217380205587874,
+      "grad_norm": 0.39988765120506287,
+      "learning_rate": 0.00016574290484140233,
+      "loss": 0.6094,
+      "step": 519
+    },
+    {
+      "epoch": 0.00018252481130839489,
+      "grad_norm": 0.34659436345100403,
+      "learning_rate": 0.00016567612687813023,
+      "loss": 0.4832,
+      "step": 520
+    },
+    {
+      "epoch": 0.00018287582056091103,
+      "grad_norm": 0.3742654025554657,
+      "learning_rate": 0.0001656093489148581,
+      "loss": 0.413,
+      "step": 521
+    },
+    {
+      "epoch": 0.00018322682981342718,
+      "grad_norm": 0.43068456649780273,
+      "learning_rate": 0.00016554257095158597,
+      "loss": 0.6576,
+      "step": 522
+    },
+    {
+      "epoch": 0.0001835778390659433,
+      "grad_norm": 0.42455193400382996,
+      "learning_rate": 0.00016547579298831387,
+      "loss": 0.5897,
+      "step": 523
+    },
+    {
+      "epoch": 0.00018392884831845946,
+      "grad_norm": 0.3290526568889618,
+      "learning_rate": 0.00016540901502504175,
+      "loss": 0.4022,
+      "step": 524
+    },
+    {
+      "epoch": 0.0001842798575709756,
+      "grad_norm": 0.3744141161441803,
+      "learning_rate": 0.00016534223706176965,
+      "loss": 0.5577,
+      "step": 525
+    },
+    {
+      "epoch": 0.00018463086682349176,
+      "grad_norm": 0.3516618609428406,
+      "learning_rate": 0.00016527545909849752,
+      "loss": 0.5481,
+      "step": 526
+    },
+    {
+      "epoch": 0.0001849818760760079,
+      "grad_norm": 0.3591526448726654,
+      "learning_rate": 0.0001652086811352254,
+      "loss": 0.6339,
+      "step": 527
+    },
+    {
+      "epoch": 0.00018533288532852403,
+      "grad_norm": 0.4024425745010376,
+      "learning_rate": 0.00016514190317195327,
+      "loss": 0.5268,
+      "step": 528
+    },
+    {
+      "epoch": 0.00018568389458104018,
+      "grad_norm": 0.3502136766910553,
+      "learning_rate": 0.00016507512520868114,
+      "loss": 0.5112,
+      "step": 529
+    },
+    {
+      "epoch": 0.00018603490383355633,
+      "grad_norm": 0.3338727056980133,
+      "learning_rate": 0.00016500834724540904,
+      "loss": 0.5623,
+      "step": 530
+    },
+    {
+      "epoch": 0.00018638591308607248,
+      "grad_norm": 0.43554845452308655,
+      "learning_rate": 0.0001649415692821369,
+      "loss": 0.5853,
+      "step": 531
+    },
+    {
+      "epoch": 0.00018673692233858862,
+      "grad_norm": 0.34424322843551636,
+      "learning_rate": 0.00016487479131886478,
+      "loss": 0.4951,
+      "step": 532
+    },
+    {
+      "epoch": 0.00018708793159110475,
+      "grad_norm": 0.4424237012863159,
+      "learning_rate": 0.00016480801335559266,
+      "loss": 0.4576,
+      "step": 533
+    },
+    {
+      "epoch": 0.0001874389408436209,
+      "grad_norm": 0.4616681933403015,
+      "learning_rate": 0.00016474123539232053,
+      "loss": 0.4974,
+      "step": 534
+    },
+    {
+      "epoch": 0.00018778995009613705,
+      "grad_norm": 0.3599206507205963,
+      "learning_rate": 0.0001646744574290484,
+      "loss": 0.5987,
+      "step": 535
+    },
+    {
+      "epoch": 0.0001881409593486532,
+      "grad_norm": 0.40468478202819824,
+      "learning_rate": 0.0001646076794657763,
+      "loss": 0.5914,
+      "step": 536
+    },
+    {
+      "epoch": 0.00018849196860116935,
+      "grad_norm": 0.5389227271080017,
+      "learning_rate": 0.00016454090150250418,
+      "loss": 0.6459,
+      "step": 537
+    },
+    {
+      "epoch": 0.00018884297785368547,
+      "grad_norm": 0.3493568003177643,
+      "learning_rate": 0.00016447412353923205,
+      "loss": 0.5191,
+      "step": 538
+    },
+    {
+      "epoch": 0.00018919398710620162,
+      "grad_norm": 0.31237804889678955,
+      "learning_rate": 0.00016440734557595992,
+      "loss": 0.4819,
+      "step": 539
+    },
+    {
+      "epoch": 0.00018954499635871777,
+      "grad_norm": 0.31142041087150574,
+      "learning_rate": 0.00016434056761268782,
+      "loss": 0.5659,
+      "step": 540
+    },
+    {
+      "epoch": 0.00018989600561123392,
+      "grad_norm": 0.3323245644569397,
+      "learning_rate": 0.0001642737896494157,
+      "loss": 0.5779,
+      "step": 541
+    },
+    {
+      "epoch": 0.00019024701486375007,
+      "grad_norm": 0.3679036498069763,
+      "learning_rate": 0.0001642070116861436,
+      "loss": 0.6919,
+      "step": 542
+    },
+    {
+      "epoch": 0.0001905980241162662,
+      "grad_norm": 0.3094903528690338,
+      "learning_rate": 0.00016414023372287147,
+      "loss": 0.4773,
+      "step": 543
+    },
+    {
+      "epoch": 0.00019094903336878234,
+      "grad_norm": 0.37995582818984985,
+      "learning_rate": 0.00016407345575959934,
+      "loss": 0.539,
+      "step": 544
+    },
+    {
+      "epoch": 0.0001913000426212985,
+      "grad_norm": 0.46415746212005615,
+      "learning_rate": 0.00016400667779632722,
+      "loss": 0.6708,
+      "step": 545
+    },
+    {
+      "epoch": 0.00019165105187381464,
+      "grad_norm": 0.3479398190975189,
+      "learning_rate": 0.00016393989983305512,
+      "loss": 0.5496,
+      "step": 546
+    },
+    {
+      "epoch": 0.00019200206112633079,
+      "grad_norm": 0.3740891218185425,
+      "learning_rate": 0.000163873121869783,
+      "loss": 0.6256,
+      "step": 547
+    },
+    {
+      "epoch": 0.0001923530703788469,
+      "grad_norm": 0.4934074878692627,
+      "learning_rate": 0.00016380634390651086,
+      "loss": 0.6788,
+      "step": 548
+    },
+    {
+      "epoch": 0.00019270407963136306,
+      "grad_norm": 0.42659157514572144,
+      "learning_rate": 0.00016373956594323874,
+      "loss": 0.5981,
+      "step": 549
+    },
+    {
+      "epoch": 0.0001930550888838792,
+      "grad_norm": 0.35727575421333313,
+      "learning_rate": 0.0001636727879799666,
+      "loss": 0.4095,
+      "step": 550
+    },
+    {
+      "epoch": 0.00019340609813639536,
+      "grad_norm": 0.4294300377368927,
+      "learning_rate": 0.00016360601001669448,
+      "loss": 0.5386,
+      "step": 551
+    },
+    {
+      "epoch": 0.0001937571073889115,
+      "grad_norm": 0.33482253551483154,
+      "learning_rate": 0.00016353923205342238,
+      "loss": 0.4901,
+      "step": 552
+    },
+    {
+      "epoch": 0.00019410811664142763,
+      "grad_norm": 0.3379746079444885,
+      "learning_rate": 0.00016347245409015026,
+      "loss": 0.5454,
+      "step": 553
+    },
+    {
+      "epoch": 0.00019445912589394378,
+      "grad_norm": 0.42393919825553894,
+      "learning_rate": 0.00016340567612687813,
+      "loss": 0.5959,
+      "step": 554
+    },
+    {
+      "epoch": 0.00019481013514645993,
+      "grad_norm": 0.31975501775741577,
+      "learning_rate": 0.000163338898163606,
+      "loss": 0.6048,
+      "step": 555
+    },
+    {
+      "epoch": 0.00019516114439897608,
+      "grad_norm": 0.43404972553253174,
+      "learning_rate": 0.00016327212020033388,
+      "loss": 0.6252,
+      "step": 556
+    },
+    {
+      "epoch": 0.00019551215365149223,
+      "grad_norm": 0.3559292256832123,
+      "learning_rate": 0.00016320534223706178,
+      "loss": 0.6036,
+      "step": 557
+    },
+    {
+      "epoch": 0.00019586316290400835,
+      "grad_norm": 0.3134891092777252,
+      "learning_rate": 0.00016313856427378965,
+      "loss": 0.5656,
+      "step": 558
+    },
+    {
+      "epoch": 0.0001962141721565245,
+      "grad_norm": 0.32056671380996704,
+      "learning_rate": 0.00016307178631051755,
+      "loss": 0.6509,
+      "step": 559
+    },
+    {
+      "epoch": 0.00019656518140904065,
+      "grad_norm": 0.46249130368232727,
+      "learning_rate": 0.00016300500834724542,
+      "loss": 0.6379,
+      "step": 560
+    },
+    {
+      "epoch": 0.0001969161906615568,
+      "grad_norm": 0.36366966366767883,
+      "learning_rate": 0.0001629382303839733,
+      "loss": 0.5334,
+      "step": 561
+    },
+    {
+      "epoch": 0.00019726719991407295,
+      "grad_norm": 0.4234124422073364,
+      "learning_rate": 0.0001628714524207012,
+      "loss": 0.4864,
+      "step": 562
+    },
+    {
+      "epoch": 0.00019761820916658907,
+      "grad_norm": 0.3687801659107208,
+      "learning_rate": 0.00016280467445742907,
+      "loss": 0.4855,
+      "step": 563
+    },
+    {
+      "epoch": 0.00019796921841910522,
+      "grad_norm": 0.37247028946876526,
+      "learning_rate": 0.00016273789649415694,
+      "loss": 0.6215,
+      "step": 564
+    },
+    {
+      "epoch": 0.00019832022767162137,
+      "grad_norm": 0.30445635318756104,
+      "learning_rate": 0.00016267111853088482,
+      "loss": 0.5741,
+      "step": 565
+    },
+    {
+      "epoch": 0.00019867123692413752,
+      "grad_norm": 0.3349187970161438,
+      "learning_rate": 0.0001626043405676127,
+      "loss": 0.4524,
+      "step": 566
+    },
+    {
+      "epoch": 0.00019902224617665367,
+      "grad_norm": 0.36938101053237915,
+      "learning_rate": 0.00016253756260434056,
+      "loss": 0.5046,
+      "step": 567
+    },
+    {
+      "epoch": 0.0001993732554291698,
+      "grad_norm": 0.37673529982566833,
+      "learning_rate": 0.00016247078464106846,
+      "loss": 0.5001,
+      "step": 568
+    },
+    {
+      "epoch": 0.00019972426468168594,
+      "grad_norm": 0.3571556508541107,
+      "learning_rate": 0.00016240400667779634,
+      "loss": 0.6419,
+      "step": 569
+    },
+    {
+      "epoch": 0.0002000752739342021,
+      "grad_norm": 0.35543423891067505,
+      "learning_rate": 0.0001623372287145242,
+      "loss": 0.6191,
+      "step": 570
+    },
+    {
+      "epoch": 0.00020042628318671824,
+      "grad_norm": 0.3096729516983032,
+      "learning_rate": 0.00016227045075125208,
+      "loss": 0.5373,
+      "step": 571
+    },
+    {
+      "epoch": 0.0002007772924392344,
+      "grad_norm": 0.30310383439064026,
+      "learning_rate": 0.00016220367278797996,
+      "loss": 0.558,
+      "step": 572
+    },
+    {
+      "epoch": 0.0002011283016917505,
+      "grad_norm": 0.3616211712360382,
+      "learning_rate": 0.00016213689482470786,
+      "loss": 0.6504,
+      "step": 573
+    },
+    {
+      "epoch": 0.00020147931094426666,
+      "grad_norm": 0.34818220138549805,
+      "learning_rate": 0.00016207011686143573,
+      "loss": 0.6136,
+      "step": 574
+    },
+    {
+      "epoch": 0.0002018303201967828,
+      "grad_norm": 0.36225444078445435,
+      "learning_rate": 0.0001620033388981636,
+      "loss": 0.4905,
+      "step": 575
+    },
+    {
+      "epoch": 0.00020218132944929896,
+      "grad_norm": 0.40039536356925964,
+      "learning_rate": 0.0001619365609348915,
+      "loss": 0.5997,
+      "step": 576
+    },
+    {
+      "epoch": 0.0002025323387018151,
+      "grad_norm": 0.33715930581092834,
+      "learning_rate": 0.00016186978297161938,
+      "loss": 0.5284,
+      "step": 577
+    },
+    {
+      "epoch": 0.00020288334795433123,
+      "grad_norm": 0.4137067198753357,
+      "learning_rate": 0.00016180300500834728,
+      "loss": 0.6873,
+      "step": 578
+    },
+    {
+      "epoch": 0.00020323435720684738,
+      "grad_norm": 0.41598305106163025,
+      "learning_rate": 0.00016173622704507515,
+      "loss": 0.491,
+      "step": 579
+    },
+    {
+      "epoch": 0.00020358536645936353,
+      "grad_norm": 0.5466423034667969,
+      "learning_rate": 0.00016166944908180302,
+      "loss": 0.6188,
+      "step": 580
+    },
+    {
+      "epoch": 0.00020393637571187968,
+      "grad_norm": 0.3718060851097107,
+      "learning_rate": 0.0001616026711185309,
+      "loss": 0.5573,
+      "step": 581
+    },
+    {
+      "epoch": 0.00020428738496439583,
+      "grad_norm": 0.33747225999832153,
+      "learning_rate": 0.00016153589315525877,
+      "loss": 0.4887,
+      "step": 582
+    },
+    {
+      "epoch": 0.00020463839421691195,
+      "grad_norm": 0.36478081345558167,
+      "learning_rate": 0.00016146911519198664,
+      "loss": 0.553,
+      "step": 583
+    },
+    {
+      "epoch": 0.0002049894034694281,
+      "grad_norm": 0.38441962003707886,
+      "learning_rate": 0.00016140233722871454,
+      "loss": 0.4833,
+      "step": 584
+    },
+    {
+      "epoch": 0.00020534041272194425,
+      "grad_norm": 0.45594358444213867,
+      "learning_rate": 0.00016133555926544241,
+      "loss": 0.5877,
+      "step": 585
+    },
+    {
+      "epoch": 0.0002056914219744604,
+      "grad_norm": 0.356517493724823,
+      "learning_rate": 0.0001612687813021703,
+      "loss": 0.5614,
+      "step": 586
+    },
+    {
+      "epoch": 0.00020604243122697655,
+      "grad_norm": 0.4051963686943054,
+      "learning_rate": 0.00016120200333889816,
+      "loss": 0.5208,
+      "step": 587
+    },
+    {
+      "epoch": 0.00020639344047949267,
+      "grad_norm": 0.36947959661483765,
+      "learning_rate": 0.00016113522537562603,
+      "loss": 0.4385,
+      "step": 588
+    },
+    {
+      "epoch": 0.00020674444973200882,
+      "grad_norm": 0.45947200059890747,
+      "learning_rate": 0.00016106844741235393,
+      "loss": 0.4972,
+      "step": 589
+    },
+    {
+      "epoch": 0.00020709545898452497,
+      "grad_norm": 0.40610602498054504,
+      "learning_rate": 0.0001610016694490818,
+      "loss": 0.4022,
+      "step": 590
+    },
+    {
+      "epoch": 0.00020744646823704112,
+      "grad_norm": 0.3529384732246399,
+      "learning_rate": 0.00016093489148580968,
+      "loss": 0.5222,
+      "step": 591
+    },
+    {
+      "epoch": 0.00020779747748955727,
+      "grad_norm": 0.35114821791648865,
+      "learning_rate": 0.00016086811352253755,
+      "loss": 0.6224,
+      "step": 592
+    },
+    {
+      "epoch": 0.0002081484867420734,
+      "grad_norm": 0.3596336841583252,
+      "learning_rate": 0.00016080133555926545,
+      "loss": 0.5081,
+      "step": 593
+    },
+    {
+      "epoch": 0.00020849949599458954,
+      "grad_norm": 0.4214174747467041,
+      "learning_rate": 0.00016073455759599333,
+      "loss": 0.5189,
+      "step": 594
+    },
+    {
+      "epoch": 0.0002088505052471057,
+      "grad_norm": 0.39635175466537476,
+      "learning_rate": 0.00016066777963272123,
+      "loss": 0.582,
+      "step": 595
+    },
+    {
+      "epoch": 0.00020920151449962184,
+      "grad_norm": 0.36160576343536377,
+      "learning_rate": 0.0001606010016694491,
+      "loss": 0.568,
+      "step": 596
+    },
+    {
+      "epoch": 0.000209552523752138,
+      "grad_norm": 0.4242927134037018,
+      "learning_rate": 0.00016053422370617697,
+      "loss": 0.6235,
+      "step": 597
+    },
+    {
+      "epoch": 0.0002099035330046541,
+      "grad_norm": 0.4257853925228119,
+      "learning_rate": 0.00016046744574290485,
+      "loss": 0.5294,
+      "step": 598
+    },
+    {
+      "epoch": 0.00021025454225717026,
+      "grad_norm": 0.3890500068664551,
+      "learning_rate": 0.00016040066777963272,
+      "loss": 0.6224,
+      "step": 599
+    },
+    {
+      "epoch": 0.0002106055515096864,
+      "grad_norm": 0.2971879541873932,
+      "learning_rate": 0.00016033388981636062,
+      "loss": 0.5951,
+      "step": 600
+    },
+    {
+      "epoch": 0.00021095656076220256,
+      "grad_norm": 0.29551970958709717,
+      "learning_rate": 0.0001602671118530885,
+      "loss": 0.6713,
+      "step": 601
+    },
+    {
+      "epoch": 0.00021130757001471868,
+      "grad_norm": 0.31588122248649597,
+      "learning_rate": 0.00016020033388981637,
+      "loss": 0.6384,
+      "step": 602
+    },
+    {
+      "epoch": 0.00021165857926723483,
+      "grad_norm": 0.3138657510280609,
+      "learning_rate": 0.00016013355592654424,
+      "loss": 0.5846,
+      "step": 603
+    },
+    {
+      "epoch": 0.00021200958851975098,
+      "grad_norm": 0.31286585330963135,
+      "learning_rate": 0.0001600667779632721,
+      "loss": 0.6236,
+      "step": 604
+    },
+    {
+      "epoch": 0.00021236059777226713,
+      "grad_norm": 0.32098105549812317,
+      "learning_rate": 0.00016,
+      "loss": 0.4926,
+      "step": 605
+    },
+    {
+      "epoch": 0.00021271160702478328,
+      "grad_norm": 0.371427446603775,
+      "learning_rate": 0.00015993322203672789,
+      "loss": 0.6205,
+      "step": 606
+    },
+    {
+      "epoch": 0.0002130626162772994,
+      "grad_norm": 0.28764042258262634,
+      "learning_rate": 0.00015986644407345576,
+      "loss": 0.449,
+      "step": 607
+    },
+    {
+      "epoch": 0.00021341362552981555,
+      "grad_norm": 0.35086238384246826,
+      "learning_rate": 0.00015979966611018363,
+      "loss": 0.549,
+      "step": 608
+    },
+    {
+      "epoch": 0.0002137646347823317,
+      "grad_norm": 0.3118048906326294,
+      "learning_rate": 0.0001597328881469115,
+      "loss": 0.6037,
+      "step": 609
+    },
+    {
+      "epoch": 0.00021411564403484785,
+      "grad_norm": 0.3894517123699188,
+      "learning_rate": 0.0001596661101836394,
+      "loss": 0.5989,
+      "step": 610
+    },
+    {
+      "epoch": 0.000214466653287364,
+      "grad_norm": 0.39642322063446045,
+      "learning_rate": 0.00015959933222036728,
+      "loss": 0.566,
+      "step": 611
+    },
+    {
+      "epoch": 0.00021481766253988012,
+      "grad_norm": 0.35333508253097534,
+      "learning_rate": 0.00015953255425709518,
+      "loss": 0.5055,
+      "step": 612
+    },
+    {
+      "epoch": 0.00021516867179239627,
+      "grad_norm": 0.39200490713119507,
+      "learning_rate": 0.00015946577629382305,
+      "loss": 0.5951,
+      "step": 613
+    },
+    {
+      "epoch": 0.00021551968104491242,
+      "grad_norm": 0.38436442613601685,
+      "learning_rate": 0.00015939899833055093,
+      "loss": 0.4876,
+      "step": 614
+    },
+    {
+      "epoch": 0.00021587069029742857,
+      "grad_norm": 0.3397504389286041,
+      "learning_rate": 0.0001593322203672788,
+      "loss": 0.6287,
+      "step": 615
+    },
+    {
+      "epoch": 0.00021622169954994472,
+      "grad_norm": 0.35870012640953064,
+      "learning_rate": 0.0001592654424040067,
+      "loss": 0.5857,
+      "step": 616
+    },
+    {
+      "epoch": 0.00021657270880246084,
+      "grad_norm": 0.31163597106933594,
+      "learning_rate": 0.00015919866444073457,
+      "loss": 0.4831,
+      "step": 617
+    },
+    {
+      "epoch": 0.000216923718054977,
+      "grad_norm": 0.35106539726257324,
+      "learning_rate": 0.00015913188647746245,
+      "loss": 0.5776,
+      "step": 618
+    },
+    {
+      "epoch": 0.00021727472730749314,
+      "grad_norm": 0.3639923334121704,
+      "learning_rate": 0.00015906510851419032,
+      "loss": 0.5039,
+      "step": 619
+    },
+    {
+      "epoch": 0.0002176257365600093,
+      "grad_norm": 0.3622918128967285,
+      "learning_rate": 0.0001589983305509182,
+      "loss": 0.6293,
+      "step": 620
+    },
+    {
+      "epoch": 0.00021797674581252544,
+      "grad_norm": 0.3899349868297577,
+      "learning_rate": 0.0001589315525876461,
+      "loss": 0.567,
+      "step": 621
+    },
+    {
+      "epoch": 0.00021832775506504156,
+      "grad_norm": 0.3834361732006073,
+      "learning_rate": 0.00015886477462437397,
+      "loss": 0.5106,
+      "step": 622
+    },
+    {
+      "epoch": 0.0002186787643175577,
+      "grad_norm": 0.34996962547302246,
+      "learning_rate": 0.00015879799666110184,
+      "loss": 0.5155,
+      "step": 623
+    },
+    {
+      "epoch": 0.00021902977357007386,
+      "grad_norm": 0.47908079624176025,
+      "learning_rate": 0.0001587312186978297,
+      "loss": 0.4529,
+      "step": 624
+    },
+    {
+      "epoch": 0.00021938078282259,
+      "grad_norm": 0.3167901635169983,
+      "learning_rate": 0.00015866444073455758,
+      "loss": 0.6075,
+      "step": 625
+    },
+    {
+      "epoch": 0.00021973179207510616,
+      "grad_norm": 0.4254927337169647,
+      "learning_rate": 0.00015859766277128548,
+      "loss": 0.6404,
+      "step": 626
+    },
+    {
+      "epoch": 0.00022008280132762228,
+      "grad_norm": 0.4317469000816345,
+      "learning_rate": 0.00015853088480801336,
+      "loss": 0.5881,
+      "step": 627
+    },
+    {
+      "epoch": 0.00022043381058013843,
+      "grad_norm": 0.4441644251346588,
+      "learning_rate": 0.00015846410684474123,
+      "loss": 0.5864,
+      "step": 628
+    },
+    {
+      "epoch": 0.00022078481983265458,
+      "grad_norm": 0.37883102893829346,
+      "learning_rate": 0.00015839732888146913,
+      "loss": 0.5664,
+      "step": 629
+    },
+    {
+      "epoch": 0.00022113582908517073,
+      "grad_norm": 0.35548868775367737,
+      "learning_rate": 0.000158330550918197,
+      "loss": 0.5712,
+      "step": 630
+    },
+    {
+      "epoch": 0.00022148683833768688,
+      "grad_norm": 0.31588616967201233,
+      "learning_rate": 0.00015826377295492488,
+      "loss": 0.4856,
+      "step": 631
+    },
+    {
+      "epoch": 0.000221837847590203,
+      "grad_norm": 0.3186424672603607,
+      "learning_rate": 0.00015819699499165278,
+      "loss": 0.542,
+      "step": 632
+    },
+    {
+      "epoch": 0.00022218885684271915,
+      "grad_norm": 0.41098466515541077,
+      "learning_rate": 0.00015813021702838065,
+      "loss": 0.6311,
+      "step": 633
+    },
+    {
+      "epoch": 0.0002225398660952353,
+      "grad_norm": 0.413401335477829,
+      "learning_rate": 0.00015806343906510852,
+      "loss": 0.5036,
+      "step": 634
+    },
+    {
+      "epoch": 0.00022289087534775145,
+      "grad_norm": 0.34203773736953735,
+      "learning_rate": 0.0001579966611018364,
+      "loss": 0.5508,
+      "step": 635
+    },
+    {
+      "epoch": 0.0002232418846002676,
+      "grad_norm": 0.34416648745536804,
+      "learning_rate": 0.00015792988313856427,
+      "loss": 0.5442,
+      "step": 636
+    },
+    {
+      "epoch": 0.00022359289385278372,
+      "grad_norm": 0.3439941704273224,
+      "learning_rate": 0.00015786310517529217,
+      "loss": 0.4969,
+      "step": 637
+    },
+    {
+      "epoch": 0.00022394390310529987,
+      "grad_norm": 0.3547762930393219,
+      "learning_rate": 0.00015779632721202004,
+      "loss": 0.5564,
+      "step": 638
+    },
+    {
+      "epoch": 0.00022429491235781602,
+      "grad_norm": 0.35666894912719727,
+      "learning_rate": 0.00015772954924874792,
+      "loss": 0.4759,
+      "step": 639
+    },
+    {
+      "epoch": 0.00022464592161033217,
+      "grad_norm": 0.3175058364868164,
+      "learning_rate": 0.0001576627712854758,
+      "loss": 0.5708,
+      "step": 640
+    },
+    {
+      "epoch": 0.00022499693086284832,
+      "grad_norm": 0.4329943358898163,
+      "learning_rate": 0.00015759599332220366,
+      "loss": 0.5293,
+      "step": 641
+    },
+    {
+      "epoch": 0.00022534794011536444,
+      "grad_norm": 0.5703821778297424,
+      "learning_rate": 0.00015752921535893156,
+      "loss": 0.6187,
+      "step": 642
+    },
+    {
+      "epoch": 0.0002256989493678806,
+      "grad_norm": 0.32244032621383667,
+      "learning_rate": 0.00015746243739565944,
+      "loss": 0.4847,
+      "step": 643
+    },
+    {
+      "epoch": 0.00022604995862039674,
+      "grad_norm": 0.36224085092544556,
+      "learning_rate": 0.0001573956594323873,
+      "loss": 0.6804,
+      "step": 644
+    },
+    {
+      "epoch": 0.0002264009678729129,
+      "grad_norm": 0.3316931426525116,
+      "learning_rate": 0.0001573288814691152,
+      "loss": 0.6413,
+      "step": 645
+    },
+    {
+      "epoch": 0.00022675197712542904,
+      "grad_norm": 0.38156425952911377,
+      "learning_rate": 0.00015726210350584308,
+      "loss": 0.5659,
+      "step": 646
+    },
+    {
+      "epoch": 0.00022710298637794516,
+      "grad_norm": 0.48353493213653564,
+      "learning_rate": 0.00015719532554257096,
+      "loss": 0.5788,
+      "step": 647
+    },
+    {
+      "epoch": 0.00022745399563046131,
+      "grad_norm": 0.3913673758506775,
+      "learning_rate": 0.00015712854757929886,
+      "loss": 0.6899,
+      "step": 648
+    },
+    {
+      "epoch": 0.00022780500488297746,
+      "grad_norm": 0.46836981177330017,
+      "learning_rate": 0.00015706176961602673,
+      "loss": 0.5712,
+      "step": 649
+    },
+    {
+      "epoch": 0.0002281560141354936,
+      "grad_norm": 0.34713172912597656,
+      "learning_rate": 0.0001569949916527546,
+      "loss": 0.381,
+      "step": 650
+    },
+    {
+      "epoch": 0.00022850702338800976,
+      "grad_norm": 0.3837398886680603,
+      "learning_rate": 0.00015692821368948248,
+      "loss": 0.5236,
+      "step": 651
+    },
+    {
+      "epoch": 0.00022885803264052589,
+      "grad_norm": 0.5181556940078735,
+      "learning_rate": 0.00015686143572621035,
+      "loss": 0.5889,
+      "step": 652
+    },
+    {
+      "epoch": 0.00022920904189304203,
+      "grad_norm": 0.42713961005210876,
+      "learning_rate": 0.00015679465776293825,
+      "loss": 0.5346,
+      "step": 653
+    },
+    {
+      "epoch": 0.00022956005114555818,
+      "grad_norm": 0.2868479788303375,
+      "learning_rate": 0.00015672787979966612,
+      "loss": 0.5546,
+      "step": 654
+    },
+    {
+      "epoch": 0.00022991106039807433,
+      "grad_norm": 0.31901800632476807,
+      "learning_rate": 0.000156661101836394,
+      "loss": 0.5014,
+      "step": 655
+    },
+    {
+      "epoch": 0.00023026206965059048,
+      "grad_norm": 0.41681963205337524,
+      "learning_rate": 0.00015659432387312187,
+      "loss": 0.5709,
+      "step": 656
+    },
+    {
+      "epoch": 0.0002306130789031066,
+      "grad_norm": 0.5942090749740601,
+      "learning_rate": 0.00015652754590984974,
+      "loss": 0.6022,
+      "step": 657
+    },
+    {
+      "epoch": 0.00023096408815562276,
+      "grad_norm": 0.405391126871109,
+      "learning_rate": 0.00015646076794657764,
+      "loss": 0.5363,
+      "step": 658
+    },
+    {
+      "epoch": 0.0002313150974081389,
+      "grad_norm": 0.3201390206813812,
+      "learning_rate": 0.00015639398998330552,
+      "loss": 0.6045,
+      "step": 659
+    },
+    {
+      "epoch": 0.00023166610666065505,
+      "grad_norm": 0.2989407479763031,
+      "learning_rate": 0.0001563272120200334,
+      "loss": 0.5604,
+      "step": 660
+    },
+    {
+      "epoch": 0.0002320171159131712,
+      "grad_norm": 0.3919268548488617,
+      "learning_rate": 0.00015626043405676126,
+      "loss": 0.5413,
+      "step": 661
+    },
+    {
+      "epoch": 0.00023236812516568733,
+      "grad_norm": 0.4080122709274292,
+      "learning_rate": 0.00015619365609348916,
+      "loss": 0.498,
+      "step": 662
+    },
+    {
+      "epoch": 0.00023271913441820348,
+      "grad_norm": 0.38974156975746155,
+      "learning_rate": 0.00015612687813021704,
+      "loss": 0.6149,
+      "step": 663
+    },
+    {
+      "epoch": 0.00023307014367071962,
+      "grad_norm": 0.3145015835762024,
+      "learning_rate": 0.00015606010016694494,
+      "loss": 0.4886,
+      "step": 664
+    },
+    {
+      "epoch": 0.00023342115292323577,
+      "grad_norm": 0.3009328246116638,
+      "learning_rate": 0.0001559933222036728,
+      "loss": 0.5534,
+      "step": 665
+    },
+    {
+      "epoch": 0.00023377216217575192,
+      "grad_norm": 0.4774717092514038,
+      "learning_rate": 0.00015592654424040068,
+      "loss": 0.6006,
+      "step": 666
+    },
+    {
+      "epoch": 0.00023412317142826805,
+      "grad_norm": 0.32965418696403503,
+      "learning_rate": 0.00015585976627712856,
+      "loss": 0.5463,
+      "step": 667
+    },
+    {
+      "epoch": 0.0002344741806807842,
+      "grad_norm": 0.3066554665565491,
+      "learning_rate": 0.00015579298831385643,
+      "loss": 0.5675,
+      "step": 668
+    },
+    {
+      "epoch": 0.00023482518993330035,
+      "grad_norm": 0.3879207372665405,
+      "learning_rate": 0.00015572621035058433,
+      "loss": 0.5825,
+      "step": 669
+    },
+    {
+      "epoch": 0.0002351761991858165,
+      "grad_norm": 0.3171943128108978,
+      "learning_rate": 0.0001556594323873122,
+      "loss": 0.5677,
+      "step": 670
+    },
+    {
+      "epoch": 0.00023552720843833264,
+      "grad_norm": 0.36982622742652893,
+      "learning_rate": 0.00015559265442404007,
+      "loss": 0.5885,
+      "step": 671
+    },
+    {
+      "epoch": 0.00023587821769084877,
+      "grad_norm": 0.30437183380126953,
+      "learning_rate": 0.00015552587646076795,
+      "loss": 0.6288,
+      "step": 672
+    },
+    {
+      "epoch": 0.00023622922694336492,
+      "grad_norm": 0.30654504895210266,
+      "learning_rate": 0.00015545909849749582,
+      "loss": 0.5924,
+      "step": 673
+    },
+    {
+      "epoch": 0.00023658023619588107,
+      "grad_norm": 0.3771214783191681,
+      "learning_rate": 0.00015539232053422372,
+      "loss": 0.4901,
+      "step": 674
+    },
+    {
+      "epoch": 0.00023693124544839721,
+      "grad_norm": 0.3018699884414673,
+      "learning_rate": 0.0001553255425709516,
+      "loss": 0.6159,
+      "step": 675
+    },
+    {
+      "epoch": 0.00023728225470091336,
+      "grad_norm": 0.32899734377861023,
+      "learning_rate": 0.00015525876460767947,
+      "loss": 0.6197,
+      "step": 676
+    },
+    {
+      "epoch": 0.0002376332639534295,
+      "grad_norm": 0.31837883591651917,
+      "learning_rate": 0.00015519198664440734,
+      "loss": 0.5449,
+      "step": 677
+    },
+    {
+      "epoch": 0.00023798427320594564,
+      "grad_norm": 0.35326528549194336,
+      "learning_rate": 0.00015512520868113521,
+      "loss": 0.6315,
+      "step": 678
+    },
+    {
+      "epoch": 0.00023833528245846179,
+      "grad_norm": 0.3714829385280609,
+      "learning_rate": 0.00015505843071786311,
+      "loss": 0.6352,
+      "step": 679
+    },
+    {
+      "epoch": 0.00023868629171097794,
+      "grad_norm": 0.4002094864845276,
+      "learning_rate": 0.000154991652754591,
+      "loss": 0.4235,
+      "step": 680
+    },
+    {
+      "epoch": 0.00023903730096349408,
+      "grad_norm": 0.3382783532142639,
+      "learning_rate": 0.0001549248747913189,
+      "loss": 0.5476,
+      "step": 681
+    },
+    {
+      "epoch": 0.0002393883102160102,
+      "grad_norm": 0.2985747158527374,
+      "learning_rate": 0.00015485809682804676,
+      "loss": 0.5684,
+      "step": 682
+    },
+    {
+      "epoch": 0.00023973931946852636,
+      "grad_norm": 0.3288929760456085,
+      "learning_rate": 0.00015479131886477463,
+      "loss": 0.5657,
+      "step": 683
+    },
+    {
+      "epoch": 0.0002400903287210425,
+      "grad_norm": 0.39641210436820984,
+      "learning_rate": 0.0001547245409015025,
+      "loss": 0.6283,
+      "step": 684
+    },
+    {
+      "epoch": 0.00024044133797355866,
+      "grad_norm": 0.37413230538368225,
+      "learning_rate": 0.0001546577629382304,
+      "loss": 0.5778,
+      "step": 685
+    },
+    {
+      "epoch": 0.0002407923472260748,
+      "grad_norm": 0.28837504982948303,
+      "learning_rate": 0.00015459098497495828,
+      "loss": 0.5079,
+      "step": 686
+    },
+    {
+      "epoch": 0.00024114335647859093,
+      "grad_norm": 0.32851526141166687,
+      "learning_rate": 0.00015452420701168615,
+      "loss": 0.649,
+      "step": 687
+    },
+    {
+      "epoch": 0.00024149436573110708,
+      "grad_norm": 0.3848758637905121,
+      "learning_rate": 0.00015445742904841403,
+      "loss": 0.6099,
+      "step": 688
+    },
+    {
+      "epoch": 0.00024184537498362323,
+      "grad_norm": 0.35494935512542725,
+      "learning_rate": 0.0001543906510851419,
+      "loss": 0.6498,
+      "step": 689
+    },
+    {
+      "epoch": 0.00024219638423613938,
+      "grad_norm": 0.3431280553340912,
+      "learning_rate": 0.0001543238731218698,
+      "loss": 0.4934,
+      "step": 690
+    },
+    {
+      "epoch": 0.00024254739348865553,
+      "grad_norm": 0.33980974555015564,
+      "learning_rate": 0.00015425709515859767,
+      "loss": 0.5556,
+      "step": 691
+    },
+    {
+      "epoch": 0.00024289840274117165,
+      "grad_norm": 0.3086068034172058,
+      "learning_rate": 0.00015419031719532555,
+      "loss": 0.5955,
+      "step": 692
+    },
+    {
+      "epoch": 0.0002432494119936878,
+      "grad_norm": 0.33093178272247314,
+      "learning_rate": 0.00015412353923205342,
+      "loss": 0.5926,
+      "step": 693
+    },
+    {
+      "epoch": 0.00024360042124620395,
+      "grad_norm": 0.3660534620285034,
+      "learning_rate": 0.0001540567612687813,
+      "loss": 0.5494,
+      "step": 694
+    },
+    {
+      "epoch": 0.0002439514304987201,
+      "grad_norm": 0.29803964495658875,
+      "learning_rate": 0.0001539899833055092,
+      "loss": 0.6074,
+      "step": 695
+    },
+    {
+      "epoch": 0.00024430243975123625,
+      "grad_norm": 0.36542224884033203,
+      "learning_rate": 0.00015392320534223707,
+      "loss": 0.59,
+      "step": 696
+    },
+    {
+      "epoch": 0.00024465344900375237,
+      "grad_norm": 0.34015166759490967,
+      "learning_rate": 0.00015385642737896494,
+      "loss": 0.6029,
+      "step": 697
+    },
+    {
+      "epoch": 0.00024500445825626854,
+      "grad_norm": 0.3211725950241089,
+      "learning_rate": 0.00015378964941569284,
+      "loss": 0.535,
+      "step": 698
+    },
+    {
+      "epoch": 0.00024535546750878467,
+      "grad_norm": 0.37027183175086975,
+      "learning_rate": 0.0001537228714524207,
+      "loss": 0.6265,
+      "step": 699
+    },
+    {
+      "epoch": 0.0002457064767613008,
+      "grad_norm": 0.3447396159172058,
+      "learning_rate": 0.00015365609348914859,
+      "loss": 0.6061,
+      "step": 700
+    },
+    {
+      "epoch": 0.00024605748601381697,
+      "grad_norm": 0.3344075679779053,
+      "learning_rate": 0.00015358931552587649,
+      "loss": 0.5412,
+      "step": 701
+    },
+    {
+      "epoch": 0.0002464084952663331,
+      "grad_norm": 0.29049620032310486,
+      "learning_rate": 0.00015352253756260436,
+      "loss": 0.5137,
+      "step": 702
+    },
+    {
+      "epoch": 0.00024675950451884926,
+      "grad_norm": 0.37048932909965515,
+      "learning_rate": 0.00015345575959933223,
+      "loss": 0.6118,
+      "step": 703
+    },
+    {
+      "epoch": 0.0002471105137713654,
+      "grad_norm": 0.38212522864341736,
+      "learning_rate": 0.0001533889816360601,
+      "loss": 0.466,
+      "step": 704
+    },
+    {
+      "epoch": 0.0002474615230238815,
+      "grad_norm": 0.3576483428478241,
+      "learning_rate": 0.00015332220367278798,
+      "loss": 0.561,
+      "step": 705
+    },
+    {
+      "epoch": 0.0002478125322763977,
+      "grad_norm": 0.3550293743610382,
+      "learning_rate": 0.00015325542570951588,
+      "loss": 0.5634,
+      "step": 706
+    },
+    {
+      "epoch": 0.0002481635415289138,
+      "grad_norm": 0.362474650144577,
+      "learning_rate": 0.00015318864774624375,
+      "loss": 0.5608,
+      "step": 707
+    },
+    {
+      "epoch": 0.00024851455078143,
+      "grad_norm": 0.39463603496551514,
+      "learning_rate": 0.00015312186978297163,
+      "loss": 0.64,
+      "step": 708
+    },
+    {
+      "epoch": 0.0002488655600339461,
+      "grad_norm": 0.3456307649612427,
+      "learning_rate": 0.0001530550918196995,
+      "loss": 0.4631,
+      "step": 709
+    },
+    {
+      "epoch": 0.00024921656928646223,
+      "grad_norm": 0.3300929367542267,
+      "learning_rate": 0.00015298831385642737,
+      "loss": 0.3984,
+      "step": 710
+    },
+    {
+      "epoch": 0.0002495675785389784,
+      "grad_norm": 0.35923343896865845,
+      "learning_rate": 0.00015292153589315527,
+      "loss": 0.6003,
+      "step": 711
+    },
+    {
+      "epoch": 0.00024991858779149453,
+      "grad_norm": 0.4047611653804779,
+      "learning_rate": 0.00015285475792988315,
+      "loss": 0.5715,
+      "step": 712
+    },
+    {
+      "epoch": 0.0002502695970440107,
+      "grad_norm": 0.43539851903915405,
+      "learning_rate": 0.00015278797996661102,
+      "loss": 0.571,
+      "step": 713
+    },
+    {
+      "epoch": 0.00025062060629652683,
+      "grad_norm": 0.34745046496391296,
+      "learning_rate": 0.0001527212020033389,
+      "loss": 0.622,
+      "step": 714
+    },
+    {
+      "epoch": 0.00025097161554904295,
+      "grad_norm": 0.3130028247833252,
+      "learning_rate": 0.0001526544240400668,
+      "loss": 0.507,
+      "step": 715
+    },
+    {
+      "epoch": 0.0002513226248015591,
+      "grad_norm": 0.3093617558479309,
+      "learning_rate": 0.00015258764607679466,
+      "loss": 0.4951,
+      "step": 716
+    },
+    {
+      "epoch": 0.00025167363405407525,
+      "grad_norm": 0.34299540519714355,
+      "learning_rate": 0.00015252086811352257,
+      "loss": 0.539,
+      "step": 717
+    },
+    {
+      "epoch": 0.0002520246433065914,
+      "grad_norm": 0.32698413729667664,
+      "learning_rate": 0.00015245409015025044,
+      "loss": 0.4588,
+      "step": 718
+    },
+    {
+      "epoch": 0.00025237565255910755,
+      "grad_norm": 0.37853989005088806,
+      "learning_rate": 0.0001523873121869783,
+      "loss": 0.6227,
+      "step": 719
+    },
+    {
+      "epoch": 0.00025272666181162367,
+      "grad_norm": 0.32887300848960876,
+      "learning_rate": 0.00015232053422370618,
+      "loss": 0.5893,
+      "step": 720
+    },
+    {
+      "epoch": 0.00025307767106413985,
+      "grad_norm": 0.43352028727531433,
+      "learning_rate": 0.00015225375626043406,
+      "loss": 0.5811,
+      "step": 721
+    },
+    {
+      "epoch": 0.00025342868031665597,
+      "grad_norm": 0.42844903469085693,
+      "learning_rate": 0.00015218697829716196,
+      "loss": 0.6196,
+      "step": 722
+    },
+    {
+      "epoch": 0.00025377968956917215,
+      "grad_norm": 0.39929670095443726,
+      "learning_rate": 0.00015212020033388983,
+      "loss": 0.6722,
+      "step": 723
+    },
+    {
+      "epoch": 0.00025413069882168827,
+      "grad_norm": 0.5063486695289612,
+      "learning_rate": 0.0001520534223706177,
+      "loss": 0.6086,
+      "step": 724
+    },
+    {
+      "epoch": 0.0002544817080742044,
+      "grad_norm": 0.3625267446041107,
+      "learning_rate": 0.00015198664440734558,
+      "loss": 0.6331,
+      "step": 725
+    },
+    {
+      "epoch": 0.00025483271732672057,
+      "grad_norm": 0.3452700078487396,
+      "learning_rate": 0.00015191986644407345,
+      "loss": 0.5812,
+      "step": 726
+    },
+    {
+      "epoch": 0.0002551837265792367,
+      "grad_norm": 0.31915003061294556,
+      "learning_rate": 0.00015185308848080135,
+      "loss": 0.5653,
+      "step": 727
+    },
+    {
+      "epoch": 0.00025553473583175287,
+      "grad_norm": 0.3085877299308777,
+      "learning_rate": 0.00015178631051752922,
+      "loss": 0.4702,
+      "step": 728
+    },
+    {
+      "epoch": 0.000255885745084269,
+      "grad_norm": 0.31519320607185364,
+      "learning_rate": 0.0001517195325542571,
+      "loss": 0.5096,
+      "step": 729
+    },
+    {
+      "epoch": 0.0002562367543367851,
+      "grad_norm": 0.3637699782848358,
+      "learning_rate": 0.00015165275459098497,
+      "loss": 0.6001,
+      "step": 730
+    },
+    {
+      "epoch": 0.0002565877635893013,
+      "grad_norm": 0.34056970477104187,
+      "learning_rate": 0.00015158597662771284,
+      "loss": 0.5546,
+      "step": 731
+    },
+    {
+      "epoch": 0.0002569387728418174,
+      "grad_norm": 0.37110257148742676,
+      "learning_rate": 0.00015151919866444074,
+      "loss": 0.5612,
+      "step": 732
+    },
+    {
+      "epoch": 0.0002572897820943336,
+      "grad_norm": 0.35854101181030273,
+      "learning_rate": 0.00015145242070116862,
+      "loss": 0.6364,
+      "step": 733
+    },
+    {
+      "epoch": 0.0002576407913468497,
+      "grad_norm": 0.4340030252933502,
+      "learning_rate": 0.00015138564273789652,
+      "loss": 0.5772,
+      "step": 734
+    },
+    {
+      "epoch": 0.00025799180059936583,
+      "grad_norm": 0.3807721436023712,
+      "learning_rate": 0.0001513188647746244,
+      "loss": 0.4986,
+      "step": 735
+    },
+    {
+      "epoch": 0.000258342809851882,
+      "grad_norm": 0.3522527813911438,
+      "learning_rate": 0.00015125208681135226,
+      "loss": 0.5982,
+      "step": 736
+    },
+    {
+      "epoch": 0.00025869381910439813,
+      "grad_norm": 0.31251296401023865,
+      "learning_rate": 0.00015118530884808014,
+      "loss": 0.5239,
+      "step": 737
+    },
+    {
+      "epoch": 0.0002590448283569143,
+      "grad_norm": 0.3460885286331177,
+      "learning_rate": 0.00015111853088480804,
+      "loss": 0.5881,
+      "step": 738
+    },
+    {
+      "epoch": 0.00025939583760943043,
+      "grad_norm": 0.33298879861831665,
+      "learning_rate": 0.0001510517529215359,
+      "loss": 0.5272,
+      "step": 739
+    },
+    {
+      "epoch": 0.00025974684686194655,
+      "grad_norm": 0.351468950510025,
+      "learning_rate": 0.00015098497495826378,
+      "loss": 0.6049,
+      "step": 740
+    },
+    {
+      "epoch": 0.00026009785611446273,
+      "grad_norm": 0.3449242413043976,
+      "learning_rate": 0.00015091819699499166,
+      "loss": 0.5983,
+      "step": 741
+    },
+    {
+      "epoch": 0.00026044886536697885,
+      "grad_norm": 0.34724265336990356,
+      "learning_rate": 0.00015085141903171953,
+      "loss": 0.5292,
+      "step": 742
+    },
+    {
+      "epoch": 0.00026079987461949503,
+      "grad_norm": 0.3525671660900116,
+      "learning_rate": 0.00015078464106844743,
+      "loss": 0.5391,
+      "step": 743
+    },
+    {
+      "epoch": 0.00026115088387201115,
+      "grad_norm": 0.33959653973579407,
+      "learning_rate": 0.0001507178631051753,
+      "loss": 0.5898,
+      "step": 744
+    },
+    {
+      "epoch": 0.00026150189312452727,
+      "grad_norm": 0.5051225423812866,
+      "learning_rate": 0.00015065108514190318,
+      "loss": 0.5408,
+      "step": 745
+    },
+    {
+      "epoch": 0.00026185290237704345,
+      "grad_norm": 0.3298085629940033,
+      "learning_rate": 0.00015058430717863105,
+      "loss": 0.557,
+      "step": 746
+    },
+    {
+      "epoch": 0.00026220391162955957,
+      "grad_norm": 0.3375703990459442,
+      "learning_rate": 0.00015051752921535892,
+      "loss": 0.5541,
+      "step": 747
+    },
+    {
+      "epoch": 0.00026255492088207575,
+      "grad_norm": 0.27896445989608765,
+      "learning_rate": 0.0001504507512520868,
+      "loss": 0.5273,
+      "step": 748
+    },
+    {
+      "epoch": 0.00026290593013459187,
+      "grad_norm": 0.30591917037963867,
+      "learning_rate": 0.0001503839732888147,
+      "loss": 0.5988,
+      "step": 749
+    },
+    {
+      "epoch": 0.000263256939387108,
+      "grad_norm": 0.41014084219932556,
+      "learning_rate": 0.00015031719532554257,
+      "loss": 0.555,
+      "step": 750
+    },
+    {
+      "epoch": 0.00026360794863962417,
+      "grad_norm": 0.2935464084148407,
+      "learning_rate": 0.00015025041736227047,
+      "loss": 0.625,
+      "step": 751
+    },
+    {
+      "epoch": 0.0002639589578921403,
+      "grad_norm": 0.46361032128334045,
+      "learning_rate": 0.00015018363939899834,
+      "loss": 0.4753,
+      "step": 752
+    },
+    {
+      "epoch": 0.00026430996714465647,
+      "grad_norm": 0.35808300971984863,
+      "learning_rate": 0.00015011686143572622,
+      "loss": 0.5531,
+      "step": 753
+    },
+    {
+      "epoch": 0.0002646609763971726,
+      "grad_norm": 0.3411274254322052,
+      "learning_rate": 0.00015005008347245412,
+      "loss": 0.5577,
+      "step": 754
+    },
+    {
+      "epoch": 0.0002650119856496887,
+      "grad_norm": 0.34169328212738037,
+      "learning_rate": 0.000149983305509182,
+      "loss": 0.4856,
+      "step": 755
+    },
+    {
+      "epoch": 0.0002653629949022049,
+      "grad_norm": 0.38024139404296875,
+      "learning_rate": 0.00014991652754590986,
+      "loss": 0.5203,
+      "step": 756
+    },
+    {
+      "epoch": 0.000265714004154721,
+      "grad_norm": 0.35004425048828125,
+      "learning_rate": 0.00014984974958263774,
+      "loss": 0.4999,
+      "step": 757
+    },
+    {
+      "epoch": 0.0002660650134072372,
+      "grad_norm": 0.47526153922080994,
+      "learning_rate": 0.0001497829716193656,
+      "loss": 0.5503,
+      "step": 758
+    },
+    {
+      "epoch": 0.0002664160226597533,
+      "grad_norm": 0.35096925497055054,
+      "learning_rate": 0.0001497161936560935,
+      "loss": 0.5812,
+      "step": 759
+    },
+    {
+      "epoch": 0.00026676703191226943,
+      "grad_norm": 0.4505446255207062,
+      "learning_rate": 0.00014964941569282138,
+      "loss": 0.6069,
+      "step": 760
+    },
+    {
+      "epoch": 0.0002671180411647856,
+      "grad_norm": 0.3261663019657135,
+      "learning_rate": 0.00014958263772954926,
+      "loss": 0.5601,
+      "step": 761
+    },
+    {
+      "epoch": 0.00026746905041730173,
+      "grad_norm": 0.3397548794746399,
+      "learning_rate": 0.00014951585976627713,
+      "loss": 0.5572,
+      "step": 762
+    },
+    {
+      "epoch": 0.00026782005966981785,
+      "grad_norm": 0.35547688603401184,
+      "learning_rate": 0.000149449081803005,
+      "loss": 0.5983,
+      "step": 763
+    },
+    {
+      "epoch": 0.00026817106892233403,
+      "grad_norm": 0.41515079140663147,
+      "learning_rate": 0.00014938230383973287,
+      "loss": 0.6106,
+      "step": 764
+    },
+    {
+      "epoch": 0.00026852207817485015,
+      "grad_norm": 0.3840051591396332,
+      "learning_rate": 0.00014931552587646077,
+      "loss": 0.5328,
+      "step": 765
+    },
+    {
+      "epoch": 0.00026887308742736633,
+      "grad_norm": 0.3401285707950592,
+      "learning_rate": 0.00014924874791318865,
+      "loss": 0.4666,
+      "step": 766
+    },
+    {
+      "epoch": 0.00026922409667988245,
+      "grad_norm": 0.32983794808387756,
+      "learning_rate": 0.00014918196994991652,
+      "loss": 0.5214,
+      "step": 767
+    },
+    {
+      "epoch": 0.0002695751059323986,
+      "grad_norm": 0.30202198028564453,
+      "learning_rate": 0.00014911519198664442,
+      "loss": 0.4969,
+      "step": 768
+    },
+    {
+      "epoch": 0.00026992611518491475,
+      "grad_norm": 0.3222092092037201,
+      "learning_rate": 0.0001490484140233723,
+      "loss": 0.5093,
+      "step": 769
+    },
+    {
+      "epoch": 0.0002702771244374309,
+      "grad_norm": 0.4211997091770172,
+      "learning_rate": 0.0001489816360601002,
+      "loss": 0.6295,
+      "step": 770
+    },
+    {
+      "epoch": 0.00027062813368994705,
+      "grad_norm": 0.32112184166908264,
+      "learning_rate": 0.00014891485809682807,
+      "loss": 0.5611,
+      "step": 771
+    },
+    {
+      "epoch": 0.00027097914294246317,
+      "grad_norm": 0.3272956609725952,
+      "learning_rate": 0.00014884808013355594,
+      "loss": 0.6438,
+      "step": 772
+    },
+    {
+      "epoch": 0.0002713301521949793,
+      "grad_norm": 0.39423295855522156,
+      "learning_rate": 0.00014878130217028381,
+      "loss": 0.6029,
+      "step": 773
+    },
+    {
+      "epoch": 0.00027168116144749547,
+      "grad_norm": 0.3053528070449829,
+      "learning_rate": 0.0001487145242070117,
+      "loss": 0.4978,
+      "step": 774
+    },
+    {
+      "epoch": 0.0002720321707000116,
+      "grad_norm": 0.312774658203125,
+      "learning_rate": 0.0001486477462437396,
+      "loss": 0.5753,
+      "step": 775
+    },
+    {
+      "epoch": 0.00027238317995252777,
+      "grad_norm": 0.343964546918869,
+      "learning_rate": 0.00014858096828046746,
+      "loss": 0.5173,
+      "step": 776
+    },
+    {
+      "epoch": 0.0002727341892050439,
+      "grad_norm": 0.39104631543159485,
+      "learning_rate": 0.00014851419031719533,
+      "loss": 0.6381,
+      "step": 777
+    },
+    {
+      "epoch": 0.00027308519845756,
+      "grad_norm": 0.3958207070827484,
+      "learning_rate": 0.0001484474123539232,
+      "loss": 0.6046,
+      "step": 778
+    },
+    {
+      "epoch": 0.0002734362077100762,
+      "grad_norm": 0.36198097467422485,
+      "learning_rate": 0.00014838063439065108,
+      "loss": 0.6066,
+      "step": 779
+    },
+    {
+      "epoch": 0.0002737872169625923,
+      "grad_norm": 0.29619571566581726,
+      "learning_rate": 0.00014831385642737895,
+      "loss": 0.5131,
+      "step": 780
+    },
+    {
+      "epoch": 0.0002741382262151085,
+      "grad_norm": 0.344784677028656,
+      "learning_rate": 0.00014824707846410685,
+      "loss": 0.5626,
+      "step": 781
+    },
+    {
+      "epoch": 0.0002744892354676246,
+      "grad_norm": 0.35641250014305115,
+      "learning_rate": 0.00014818030050083473,
+      "loss": 0.5451,
+      "step": 782
+    },
+    {
+      "epoch": 0.00027484024472014074,
+      "grad_norm": 0.3496847152709961,
+      "learning_rate": 0.0001481135225375626,
+      "loss": 0.4814,
+      "step": 783
+    },
+    {
+      "epoch": 0.0002751912539726569,
+      "grad_norm": 0.3726658821105957,
+      "learning_rate": 0.00014804674457429047,
+      "loss": 0.6244,
+      "step": 784
+    },
+    {
+      "epoch": 0.00027554226322517303,
+      "grad_norm": 0.3317565619945526,
+      "learning_rate": 0.00014797996661101837,
+      "loss": 0.562,
+      "step": 785
+    },
+    {
+      "epoch": 0.0002758932724776892,
+      "grad_norm": 0.3478979468345642,
+      "learning_rate": 0.00014791318864774625,
+      "loss": 0.613,
+      "step": 786
+    },
+    {
+      "epoch": 0.00027624428173020533,
+      "grad_norm": 0.3572550415992737,
+      "learning_rate": 0.00014784641068447415,
+      "loss": 0.4841,
+      "step": 787
+    },
+    {
+      "epoch": 0.00027659529098272146,
+      "grad_norm": 0.34030210971832275,
+      "learning_rate": 0.00014777963272120202,
+      "loss": 0.4879,
+      "step": 788
+    },
+    {
+      "epoch": 0.00027694630023523763,
+      "grad_norm": 0.378203421831131,
+      "learning_rate": 0.0001477128547579299,
+      "loss": 0.6086,
+      "step": 789
+    },
+    {
+      "epoch": 0.00027729730948775375,
+      "grad_norm": 0.3390562832355499,
+      "learning_rate": 0.00014764607679465777,
+      "loss": 0.586,
+      "step": 790
+    },
+    {
+      "epoch": 0.00027764831874026993,
+      "grad_norm": 0.4986645579338074,
+      "learning_rate": 0.00014757929883138567,
+      "loss": 0.5592,
+      "step": 791
+    },
+    {
+      "epoch": 0.00027799932799278605,
+      "grad_norm": 0.3361869156360626,
+      "learning_rate": 0.00014751252086811354,
+      "loss": 0.4632,
+      "step": 792
+    },
+    {
+      "epoch": 0.0002783503372453022,
+      "grad_norm": 0.3726123571395874,
+      "learning_rate": 0.0001474457429048414,
+      "loss": 0.4915,
+      "step": 793
+    },
+    {
+      "epoch": 0.00027870134649781835,
+      "grad_norm": 0.3358845114707947,
+      "learning_rate": 0.00014737896494156929,
+      "loss": 0.5593,
+      "step": 794
+    },
+    {
+      "epoch": 0.0002790523557503345,
+      "grad_norm": 0.30473607778549194,
+      "learning_rate": 0.00014731218697829716,
+      "loss": 0.3672,
+      "step": 795
+    },
+    {
+      "epoch": 0.00027940336500285065,
+      "grad_norm": 0.33929023146629333,
+      "learning_rate": 0.00014724540901502506,
+      "loss": 0.5404,
+      "step": 796
+    },
+    {
+      "epoch": 0.0002797543742553668,
+      "grad_norm": 0.30778205394744873,
+      "learning_rate": 0.00014717863105175293,
+      "loss": 0.4379,
+      "step": 797
+    },
+    {
+      "epoch": 0.0002801053835078829,
+      "grad_norm": 0.286443829536438,
+      "learning_rate": 0.0001471118530884808,
+      "loss": 0.5579,
+      "step": 798
+    },
+    {
+      "epoch": 0.0002804563927603991,
+      "grad_norm": 0.4246799051761627,
+      "learning_rate": 0.00014704507512520868,
+      "loss": 0.536,
+      "step": 799
+    },
+    {
+      "epoch": 0.0002808074020129152,
+      "grad_norm": 0.4085538983345032,
+      "learning_rate": 0.00014697829716193655,
+      "loss": 0.5309,
+      "step": 800
+    },
+    {
+      "epoch": 0.00028115841126543137,
+      "grad_norm": 0.35396453738212585,
+      "learning_rate": 0.00014691151919866443,
+      "loss": 0.5307,
+      "step": 801
+    },
+    {
+      "epoch": 0.0002815094205179475,
+      "grad_norm": 0.45588648319244385,
+      "learning_rate": 0.00014684474123539233,
+      "loss": 0.5905,
+      "step": 802
+    },
+    {
+      "epoch": 0.0002818604297704636,
+      "grad_norm": 0.3353815972805023,
+      "learning_rate": 0.0001467779632721202,
+      "loss": 0.612,
+      "step": 803
+    },
+    {
+      "epoch": 0.0002822114390229798,
+      "grad_norm": 0.4152653217315674,
+      "learning_rate": 0.0001467111853088481,
+      "loss": 0.592,
+      "step": 804
+    },
+    {
+      "epoch": 0.0002825624482754959,
+      "grad_norm": 0.3651511073112488,
+      "learning_rate": 0.00014664440734557597,
+      "loss": 0.5909,
+      "step": 805
+    },
+    {
+      "epoch": 0.0002829134575280121,
+      "grad_norm": 0.3518235385417938,
+      "learning_rate": 0.00014657762938230385,
+      "loss": 0.5684,
+      "step": 806
+    },
+    {
+      "epoch": 0.0002832644667805282,
+      "grad_norm": 0.33562156558036804,
+      "learning_rate": 0.00014651085141903175,
+      "loss": 0.5165,
+      "step": 807
+    },
+    {
+      "epoch": 0.00028361547603304434,
+      "grad_norm": 0.3648052513599396,
+      "learning_rate": 0.00014644407345575962,
+      "loss": 0.5451,
+      "step": 808
+    },
+    {
+      "epoch": 0.0002839664852855605,
+      "grad_norm": 0.44342300295829773,
+      "learning_rate": 0.0001463772954924875,
+      "loss": 0.5907,
+      "step": 809
+    },
+    {
+      "epoch": 0.00028431749453807664,
+      "grad_norm": 0.33331966400146484,
+      "learning_rate": 0.00014631051752921536,
+      "loss": 0.4254,
+      "step": 810
+    },
+    {
+      "epoch": 0.0002846685037905928,
+      "grad_norm": 0.3444873094558716,
+      "learning_rate": 0.00014624373956594324,
+      "loss": 0.5201,
+      "step": 811
+    },
+    {
+      "epoch": 0.00028501951304310894,
+      "grad_norm": 0.4239615201950073,
+      "learning_rate": 0.00014617696160267114,
+      "loss": 0.5098,
+      "step": 812
+    },
+    {
+      "epoch": 0.00028537052229562506,
+      "grad_norm": 0.47895997762680054,
+      "learning_rate": 0.000146110183639399,
+      "loss": 0.6243,
+      "step": 813
+    },
+    {
+      "epoch": 0.00028572153154814123,
+      "grad_norm": 0.47322046756744385,
+      "learning_rate": 0.00014604340567612688,
+      "loss": 0.6841,
+      "step": 814
+    },
+    {
+      "epoch": 0.00028607254080065736,
+      "grad_norm": 0.35017871856689453,
+      "learning_rate": 0.00014597662771285476,
+      "loss": 0.5313,
+      "step": 815
+    },
+    {
+      "epoch": 0.00028642355005317353,
+      "grad_norm": 0.4342300295829773,
+      "learning_rate": 0.00014590984974958263,
+      "loss": 0.4363,
+      "step": 816
+    },
+    {
+      "epoch": 0.00028677455930568966,
+      "grad_norm": 0.2966228723526001,
+      "learning_rate": 0.0001458430717863105,
+      "loss": 0.6428,
+      "step": 817
+    },
+    {
+      "epoch": 0.0002871255685582058,
+      "grad_norm": 0.3320361375808716,
+      "learning_rate": 0.0001457762938230384,
+      "loss": 0.5266,
+      "step": 818
+    },
+    {
+      "epoch": 0.00028747657781072195,
+      "grad_norm": 0.3318590223789215,
+      "learning_rate": 0.00014570951585976628,
+      "loss": 0.5676,
+      "step": 819
+    },
+    {
+      "epoch": 0.0002878275870632381,
+      "grad_norm": 0.38573157787323,
+      "learning_rate": 0.00014564273789649415,
+      "loss": 0.7083,
+      "step": 820
+    },
+    {
+      "epoch": 0.00028817859631575425,
+      "grad_norm": 0.3731164038181305,
+      "learning_rate": 0.00014557595993322205,
+      "loss": 0.578,
+      "step": 821
+    },
+    {
+      "epoch": 0.0002885296055682704,
+      "grad_norm": 0.33610039949417114,
+      "learning_rate": 0.00014550918196994992,
+      "loss": 0.5923,
+      "step": 822
+    },
+    {
+      "epoch": 0.0002888806148207865,
+      "grad_norm": 0.3393179476261139,
+      "learning_rate": 0.00014544240400667782,
+      "loss": 0.5162,
+      "step": 823
+    },
+    {
+      "epoch": 0.0002892316240733027,
+      "grad_norm": 0.35552918910980225,
+      "learning_rate": 0.0001453756260434057,
+      "loss": 0.556,
+      "step": 824
+    },
+    {
+      "epoch": 0.0002895826333258188,
+      "grad_norm": 0.32425832748413086,
+      "learning_rate": 0.00014530884808013357,
+      "loss": 0.5157,
+      "step": 825
+    },
+    {
+      "epoch": 0.000289933642578335,
+      "grad_norm": 0.3353455662727356,
+      "learning_rate": 0.00014524207011686144,
+      "loss": 0.483,
+      "step": 826
+    },
+    {
+      "epoch": 0.0002902846518308511,
+      "grad_norm": 0.46254628896713257,
+      "learning_rate": 0.00014517529215358932,
+      "loss": 0.633,
+      "step": 827
+    },
+    {
+      "epoch": 0.0002906356610833672,
+      "grad_norm": 0.3275732100009918,
+      "learning_rate": 0.00014510851419031722,
+      "loss": 0.5502,
+      "step": 828
+    },
+    {
+      "epoch": 0.0002909866703358834,
+      "grad_norm": 0.3495190441608429,
+      "learning_rate": 0.0001450417362270451,
+      "loss": 0.368,
+      "step": 829
+    },
+    {
+      "epoch": 0.0002913376795883995,
+      "grad_norm": 0.35350501537323,
+      "learning_rate": 0.00014497495826377296,
+      "loss": 0.5819,
+      "step": 830
+    },
+    {
+      "epoch": 0.0002916886888409157,
+      "grad_norm": 0.37886378169059753,
+      "learning_rate": 0.00014490818030050084,
+      "loss": 0.5418,
+      "step": 831
+    },
+    {
+      "epoch": 0.0002920396980934318,
+      "grad_norm": 0.4279928505420685,
+      "learning_rate": 0.0001448414023372287,
+      "loss": 0.5199,
+      "step": 832
+    },
+    {
+      "epoch": 0.00029239070734594794,
+      "grad_norm": 0.33105382323265076,
+      "learning_rate": 0.00014477462437395658,
+      "loss": 0.5952,
+      "step": 833
+    },
+    {
+      "epoch": 0.0002927417165984641,
+      "grad_norm": 0.40114086866378784,
+      "learning_rate": 0.00014470784641068448,
+      "loss": 0.4611,
+      "step": 834
+    },
+    {
+      "epoch": 0.00029309272585098024,
+      "grad_norm": 0.3294037878513336,
+      "learning_rate": 0.00014464106844741236,
+      "loss": 0.5562,
+      "step": 835
+    },
+    {
+      "epoch": 0.0002934437351034964,
+      "grad_norm": 0.3391546607017517,
+      "learning_rate": 0.00014457429048414023,
+      "loss": 0.5748,
+      "step": 836
+    },
+    {
+      "epoch": 0.00029379474435601254,
+      "grad_norm": 0.4093922972679138,
+      "learning_rate": 0.0001445075125208681,
+      "loss": 0.4607,
+      "step": 837
+    },
+    {
+      "epoch": 0.00029414575360852866,
+      "grad_norm": 0.3331819176673889,
+      "learning_rate": 0.000144440734557596,
+      "loss": 0.5874,
+      "step": 838
+    },
+    {
+      "epoch": 0.00029449676286104484,
+      "grad_norm": 0.43205946683883667,
+      "learning_rate": 0.00014437395659432388,
+      "loss": 0.6152,
+      "step": 839
+    },
+    {
+      "epoch": 0.00029484777211356096,
+      "grad_norm": 0.36046868562698364,
+      "learning_rate": 0.00014430717863105178,
+      "loss": 0.4781,
+      "step": 840
+    },
+    {
+      "epoch": 0.00029519878136607713,
+      "grad_norm": 0.35514524579048157,
+      "learning_rate": 0.00014424040066777965,
+      "loss": 0.568,
+      "step": 841
+    },
+    {
+      "epoch": 0.00029554979061859326,
+      "grad_norm": 0.40260326862335205,
+      "learning_rate": 0.00014417362270450752,
+      "loss": 0.6075,
+      "step": 842
+    },
+    {
+      "epoch": 0.0002959007998711094,
+      "grad_norm": 0.3102671205997467,
+      "learning_rate": 0.0001441068447412354,
+      "loss": 0.4927,
+      "step": 843
+    },
+    {
+      "epoch": 0.00029625180912362556,
+      "grad_norm": 0.30940982699394226,
+      "learning_rate": 0.0001440400667779633,
+      "loss": 0.5549,
+      "step": 844
+    },
+    {
+      "epoch": 0.0002966028183761417,
+      "grad_norm": 0.3652762174606323,
+      "learning_rate": 0.00014397328881469117,
+      "loss": 0.6085,
+      "step": 845
+    },
+    {
+      "epoch": 0.00029695382762865786,
+      "grad_norm": 0.43056777119636536,
+      "learning_rate": 0.00014390651085141904,
+      "loss": 0.494,
+      "step": 846
+    },
+    {
+      "epoch": 0.000297304836881174,
+      "grad_norm": 0.3112967014312744,
+      "learning_rate": 0.00014383973288814692,
+      "loss": 0.5141,
+      "step": 847
+    },
+    {
+      "epoch": 0.0002976558461336901,
+      "grad_norm": 0.36729326844215393,
+      "learning_rate": 0.0001437729549248748,
+      "loss": 0.5435,
+      "step": 848
+    },
+    {
+      "epoch": 0.0002980068553862063,
+      "grad_norm": 0.3128114938735962,
+      "learning_rate": 0.00014370617696160266,
+      "loss": 0.5419,
+      "step": 849
+    },
+    {
+      "epoch": 0.0002983578646387224,
+      "grad_norm": 0.4030589163303375,
+      "learning_rate": 0.00014363939899833056,
+      "loss": 0.5959,
+      "step": 850
+    },
+    {
+      "epoch": 0.0002987088738912386,
+      "grad_norm": 0.39571288228034973,
+      "learning_rate": 0.00014357262103505844,
+      "loss": 0.6798,
+      "step": 851
+    },
+    {
+      "epoch": 0.0002990598831437547,
+      "grad_norm": 0.3388408422470093,
+      "learning_rate": 0.0001435058430717863,
+      "loss": 0.4887,
+      "step": 852
+    },
+    {
+      "epoch": 0.0002994108923962708,
+      "grad_norm": 0.39615562558174133,
+      "learning_rate": 0.00014343906510851418,
+      "loss": 0.5654,
+      "step": 853
+    },
+    {
+      "epoch": 0.000299761901648787,
+      "grad_norm": 0.3967401683330536,
+      "learning_rate": 0.00014337228714524205,
+      "loss": 0.6192,
+      "step": 854
+    },
+    {
+      "epoch": 0.0003001129109013031,
+      "grad_norm": 0.5597772002220154,
+      "learning_rate": 0.00014330550918196995,
+      "loss": 0.5808,
+      "step": 855
+    },
+    {
+      "epoch": 0.0003004639201538193,
+      "grad_norm": 0.36231061816215515,
+      "learning_rate": 0.00014323873121869783,
+      "loss": 0.4936,
+      "step": 856
+    },
+    {
+      "epoch": 0.0003008149294063354,
+      "grad_norm": 0.3775942027568817,
+      "learning_rate": 0.00014317195325542573,
+      "loss": 0.5706,
+      "step": 857
+    },
+    {
+      "epoch": 0.00030116593865885154,
+      "grad_norm": 0.4139408767223358,
+      "learning_rate": 0.0001431051752921536,
+      "loss": 0.5784,
+      "step": 858
+    },
+    {
+      "epoch": 0.0003015169479113677,
+      "grad_norm": 0.4101429879665375,
+      "learning_rate": 0.00014303839732888147,
+      "loss": 0.5937,
+      "step": 859
+    },
+    {
+      "epoch": 0.00030186795716388384,
+      "grad_norm": 0.5272162556648254,
+      "learning_rate": 0.00014297161936560937,
+      "loss": 0.5244,
+      "step": 860
+    },
+    {
+      "epoch": 0.0003022189664164,
+      "grad_norm": 0.3587292730808258,
+      "learning_rate": 0.00014290484140233725,
+      "loss": 0.6333,
+      "step": 861
+    },
+    {
+      "epoch": 0.00030256997566891614,
+      "grad_norm": 0.3284890353679657,
+      "learning_rate": 0.00014283806343906512,
+      "loss": 0.5414,
+      "step": 862
+    },
+    {
+      "epoch": 0.00030292098492143226,
+      "grad_norm": 0.414974182844162,
+      "learning_rate": 0.000142771285475793,
+      "loss": 0.6116,
+      "step": 863
+    },
+    {
+      "epoch": 0.00030327199417394844,
+      "grad_norm": 0.33619245886802673,
+      "learning_rate": 0.00014270450751252087,
+      "loss": 0.5506,
+      "step": 864
+    },
+    {
+      "epoch": 0.00030362300342646456,
+      "grad_norm": 0.45475640892982483,
+      "learning_rate": 0.00014263772954924874,
+      "loss": 0.6347,
+      "step": 865
+    },
+    {
+      "epoch": 0.00030397401267898074,
+      "grad_norm": 0.2695920765399933,
+      "learning_rate": 0.00014257095158597664,
+      "loss": 0.4529,
+      "step": 866
+    },
+    {
+      "epoch": 0.00030432502193149686,
+      "grad_norm": 0.3314480781555176,
+      "learning_rate": 0.00014250417362270451,
+      "loss": 0.5812,
+      "step": 867
+    },
+    {
+      "epoch": 0.000304676031184013,
+      "grad_norm": 0.31949582695961,
+      "learning_rate": 0.0001424373956594324,
+      "loss": 0.5213,
+      "step": 868
+    },
+    {
+      "epoch": 0.00030502704043652916,
+      "grad_norm": 0.34049752354621887,
+      "learning_rate": 0.00014237061769616026,
+      "loss": 0.4645,
+      "step": 869
+    },
+    {
+      "epoch": 0.0003053780496890453,
+      "grad_norm": 0.4304719567298889,
+      "learning_rate": 0.00014230383973288813,
+      "loss": 0.5065,
+      "step": 870
+    },
+    {
+      "epoch": 0.00030572905894156146,
+      "grad_norm": 0.32379043102264404,
+      "learning_rate": 0.00014223706176961603,
+      "loss": 0.553,
+      "step": 871
+    },
+    {
+      "epoch": 0.0003060800681940776,
+      "grad_norm": 0.33285439014434814,
+      "learning_rate": 0.0001421702838063439,
+      "loss": 0.5092,
+      "step": 872
+    },
+    {
+      "epoch": 0.0003064310774465937,
+      "grad_norm": 0.336795449256897,
+      "learning_rate": 0.00014210350584307178,
+      "loss": 0.4967,
+      "step": 873
+    },
+    {
+      "epoch": 0.0003067820866991099,
+      "grad_norm": 0.34653040766716003,
+      "learning_rate": 0.00014203672787979968,
+      "loss": 0.5353,
+      "step": 874
+    },
+    {
+      "epoch": 0.000307133095951626,
+      "grad_norm": 0.3352467715740204,
+      "learning_rate": 0.00014196994991652755,
+      "loss": 0.5594,
+      "step": 875
+    },
+    {
+      "epoch": 0.0003074841052041422,
+      "grad_norm": 0.38723453879356384,
+      "learning_rate": 0.00014190317195325545,
+      "loss": 0.5897,
+      "step": 876
+    },
+    {
+      "epoch": 0.0003078351144566583,
+      "grad_norm": 0.3987238109111786,
+      "learning_rate": 0.00014183639398998333,
+      "loss": 0.4647,
+      "step": 877
+    },
+    {
+      "epoch": 0.0003081861237091744,
+      "grad_norm": 0.3452693223953247,
+      "learning_rate": 0.0001417696160267112,
+      "loss": 0.5687,
+      "step": 878
+    },
+    {
+      "epoch": 0.0003085371329616906,
+      "grad_norm": 0.3561328649520874,
+      "learning_rate": 0.00014170283806343907,
+      "loss": 0.5845,
+      "step": 879
+    },
+    {
+      "epoch": 0.0003088881422142067,
+      "grad_norm": 0.29658418893814087,
+      "learning_rate": 0.00014163606010016695,
+      "loss": 0.5202,
+      "step": 880
+    },
+    {
+      "epoch": 0.0003092391514667229,
+      "grad_norm": 0.3908213973045349,
+      "learning_rate": 0.00014156928213689482,
+      "loss": 0.4439,
+      "step": 881
+    },
+    {
+      "epoch": 0.000309590160719239,
+      "grad_norm": 0.35816919803619385,
+      "learning_rate": 0.00014150250417362272,
+      "loss": 0.5384,
+      "step": 882
+    },
+    {
+      "epoch": 0.00030994116997175514,
+      "grad_norm": 0.3681255877017975,
+      "learning_rate": 0.0001414357262103506,
+      "loss": 0.5999,
+      "step": 883
+    },
+    {
+      "epoch": 0.0003102921792242713,
+      "grad_norm": 0.31137388944625854,
+      "learning_rate": 0.00014136894824707847,
+      "loss": 0.4495,
+      "step": 884
+    },
+    {
+      "epoch": 0.00031064318847678744,
+      "grad_norm": 0.2831423878669739,
+      "learning_rate": 0.00014130217028380634,
+      "loss": 0.4576,
+      "step": 885
+    },
+    {
+      "epoch": 0.0003109941977293036,
+      "grad_norm": 0.25953516364097595,
+      "learning_rate": 0.0001412353923205342,
+      "loss": 0.5606,
+      "step": 886
+    },
+    {
+      "epoch": 0.00031134520698181974,
+      "grad_norm": 0.31105297803878784,
+      "learning_rate": 0.0001411686143572621,
+      "loss": 0.5986,
+      "step": 887
+    },
+    {
+      "epoch": 0.00031169621623433586,
+      "grad_norm": 0.35177484154701233,
+      "learning_rate": 0.00014110183639398999,
+      "loss": 0.3394,
+      "step": 888
+    },
+    {
+      "epoch": 0.00031204722548685204,
+      "grad_norm": 0.373470276594162,
+      "learning_rate": 0.00014103505843071786,
+      "loss": 0.5862,
+      "step": 889
+    },
+    {
+      "epoch": 0.00031239823473936816,
+      "grad_norm": 0.37227189540863037,
+      "learning_rate": 0.00014096828046744576,
+      "loss": 0.4677,
+      "step": 890
+    },
+    {
+      "epoch": 0.00031274924399188434,
+      "grad_norm": 0.3799666464328766,
+      "learning_rate": 0.00014090150250417363,
+      "loss": 0.5255,
+      "step": 891
+    },
+    {
+      "epoch": 0.00031310025324440046,
+      "grad_norm": 0.3630129098892212,
+      "learning_rate": 0.00014083472454090153,
+      "loss": 0.5111,
+      "step": 892
+    },
+    {
+      "epoch": 0.0003134512624969166,
+      "grad_norm": 0.5131457448005676,
+      "learning_rate": 0.0001407679465776294,
+      "loss": 0.5207,
+      "step": 893
+    },
+    {
+      "epoch": 0.00031380227174943276,
+      "grad_norm": 0.3759867548942566,
+      "learning_rate": 0.00014070116861435728,
+      "loss": 0.6678,
+      "step": 894
+    },
+    {
+      "epoch": 0.0003141532810019489,
+      "grad_norm": 0.5577414631843567,
+      "learning_rate": 0.00014063439065108515,
+      "loss": 0.62,
+      "step": 895
+    },
+    {
+      "epoch": 0.00031450429025446506,
+      "grad_norm": 0.2789120376110077,
+      "learning_rate": 0.00014056761268781303,
+      "loss": 0.4204,
+      "step": 896
+    },
+    {
+      "epoch": 0.0003148552995069812,
+      "grad_norm": 0.2897239327430725,
+      "learning_rate": 0.0001405008347245409,
+      "loss": 0.432,
+      "step": 897
+    },
+    {
+      "epoch": 0.0003152063087594973,
+      "grad_norm": 0.3552323579788208,
+      "learning_rate": 0.0001404340567612688,
+      "loss": 0.5512,
+      "step": 898
+    },
+    {
+      "epoch": 0.0003155573180120135,
+      "grad_norm": 0.49963894486427307,
+      "learning_rate": 0.00014036727879799667,
+      "loss": 0.5868,
+      "step": 899
+    },
+    {
+      "epoch": 0.0003159083272645296,
+      "grad_norm": 0.37479934096336365,
+      "learning_rate": 0.00014030050083472454,
+      "loss": 0.6682,
+      "step": 900
+    },
+    {
+      "epoch": 0.0003162593365170458,
+      "grad_norm": 0.3415648639202118,
+      "learning_rate": 0.00014023372287145242,
+      "loss": 0.5301,
+      "step": 901
+    },
+    {
+      "epoch": 0.0003166103457695619,
+      "grad_norm": 0.37530943751335144,
+      "learning_rate": 0.0001401669449081803,
+      "loss": 0.5409,
+      "step": 902
+    },
+    {
+      "epoch": 0.000316961355022078,
+      "grad_norm": 0.37487658858299255,
+      "learning_rate": 0.0001401001669449082,
+      "loss": 0.5976,
+      "step": 903
+    },
+    {
+      "epoch": 0.0003173123642745942,
+      "grad_norm": 0.37174728512763977,
+      "learning_rate": 0.00014003338898163606,
+      "loss": 0.5933,
+      "step": 904
+    },
+    {
+      "epoch": 0.0003176633735271103,
+      "grad_norm": 0.491584450006485,
+      "learning_rate": 0.00013996661101836394,
+      "loss": 0.5112,
+      "step": 905
+    },
+    {
+      "epoch": 0.0003180143827796265,
+      "grad_norm": 0.38381487131118774,
+      "learning_rate": 0.0001398998330550918,
+      "loss": 0.6486,
+      "step": 906
+    },
+    {
+      "epoch": 0.0003183653920321426,
+      "grad_norm": 0.2867659330368042,
+      "learning_rate": 0.0001398330550918197,
+      "loss": 0.5033,
+      "step": 907
+    },
+    {
+      "epoch": 0.00031871640128465874,
+      "grad_norm": 0.3146355450153351,
+      "learning_rate": 0.00013976627712854758,
+      "loss": 0.5878,
+      "step": 908
+    },
+    {
+      "epoch": 0.0003190674105371749,
+      "grad_norm": 0.3454856276512146,
+      "learning_rate": 0.00013969949916527548,
+      "loss": 0.4751,
+      "step": 909
+    },
+    {
+      "epoch": 0.00031941841978969104,
+      "grad_norm": 0.32241204380989075,
+      "learning_rate": 0.00013963272120200336,
+      "loss": 0.6378,
+      "step": 910
+    },
+    {
+      "epoch": 0.0003197694290422072,
+      "grad_norm": 0.33703315258026123,
+      "learning_rate": 0.00013956594323873123,
+      "loss": 0.4634,
+      "step": 911
+    },
+    {
+      "epoch": 0.00032012043829472334,
+      "grad_norm": 0.3781648576259613,
+      "learning_rate": 0.0001394991652754591,
+      "loss": 0.5218,
+      "step": 912
+    },
+    {
+      "epoch": 0.00032047144754723946,
+      "grad_norm": 0.4124391973018646,
+      "learning_rate": 0.00013943238731218698,
+      "loss": 0.4958,
+      "step": 913
+    },
+    {
+      "epoch": 0.00032082245679975564,
+      "grad_norm": 0.3970220685005188,
+      "learning_rate": 0.00013936560934891488,
+      "loss": 0.5624,
+      "step": 914
+    },
+    {
+      "epoch": 0.00032117346605227176,
+      "grad_norm": 0.43682703375816345,
+      "learning_rate": 0.00013929883138564275,
+      "loss": 0.544,
+      "step": 915
+    },
+    {
+      "epoch": 0.00032152447530478794,
+      "grad_norm": 0.3476586639881134,
+      "learning_rate": 0.00013923205342237062,
+      "loss": 0.4418,
+      "step": 916
+    },
+    {
+      "epoch": 0.00032187548455730406,
+      "grad_norm": 0.36963552236557007,
+      "learning_rate": 0.0001391652754590985,
+      "loss": 0.5946,
+      "step": 917
+    },
+    {
+      "epoch": 0.0003222264938098202,
+      "grad_norm": 0.3445582985877991,
+      "learning_rate": 0.00013909849749582637,
+      "loss": 0.5879,
+      "step": 918
+    },
+    {
+      "epoch": 0.00032257750306233636,
+      "grad_norm": 0.39813530445098877,
+      "learning_rate": 0.00013903171953255427,
+      "loss": 0.5759,
+      "step": 919
+    },
+    {
+      "epoch": 0.0003229285123148525,
+      "grad_norm": 0.3314265012741089,
+      "learning_rate": 0.00013896494156928214,
+      "loss": 0.6165,
+      "step": 920
+    },
+    {
+      "epoch": 0.00032327952156736866,
+      "grad_norm": 0.4094330072402954,
+      "learning_rate": 0.00013889816360601002,
+      "loss": 0.5787,
+      "step": 921
+    },
+    {
+      "epoch": 0.0003236305308198848,
+      "grad_norm": 0.36821484565734863,
+      "learning_rate": 0.0001388313856427379,
+      "loss": 0.5303,
+      "step": 922
+    },
+    {
+      "epoch": 0.0003239815400724009,
+      "grad_norm": 0.3517453968524933,
+      "learning_rate": 0.00013876460767946576,
+      "loss": 0.4586,
+      "step": 923
+    },
+    {
+      "epoch": 0.0003243325493249171,
+      "grad_norm": 0.2959018647670746,
+      "learning_rate": 0.00013869782971619366,
+      "loss": 0.5225,
+      "step": 924
+    },
+    {
+      "epoch": 0.0003246835585774332,
+      "grad_norm": 0.3286895751953125,
+      "learning_rate": 0.00013863105175292154,
+      "loss": 0.5353,
+      "step": 925
+    },
+    {
+      "epoch": 0.0003250345678299494,
+      "grad_norm": 0.3328275680541992,
+      "learning_rate": 0.00013856427378964944,
+      "loss": 0.5915,
+      "step": 926
+    },
+    {
+      "epoch": 0.0003253855770824655,
+      "grad_norm": 0.3400813937187195,
+      "learning_rate": 0.0001384974958263773,
+      "loss": 0.4598,
+      "step": 927
+    },
+    {
+      "epoch": 0.0003257365863349816,
+      "grad_norm": 0.2876541018486023,
+      "learning_rate": 0.00013843071786310518,
+      "loss": 0.4835,
+      "step": 928
+    },
+    {
+      "epoch": 0.0003260875955874978,
+      "grad_norm": 0.3401765525341034,
+      "learning_rate": 0.00013836393989983308,
+      "loss": 0.56,
+      "step": 929
+    },
+    {
+      "epoch": 0.0003264386048400139,
+      "grad_norm": 0.34506598114967346,
+      "learning_rate": 0.00013829716193656096,
+      "loss": 0.6234,
+      "step": 930
+    },
+    {
+      "epoch": 0.0003267896140925301,
+      "grad_norm": 0.33732855319976807,
+      "learning_rate": 0.00013823038397328883,
+      "loss": 0.5686,
+      "step": 931
+    },
+    {
+      "epoch": 0.0003271406233450462,
+      "grad_norm": 0.34300100803375244,
+      "learning_rate": 0.0001381636060100167,
+      "loss": 0.6091,
+      "step": 932
+    },
+    {
+      "epoch": 0.00032749163259756235,
+      "grad_norm": 0.30349200963974,
+      "learning_rate": 0.00013809682804674458,
+      "loss": 0.4836,
+      "step": 933
+    },
+    {
+      "epoch": 0.0003278426418500785,
+      "grad_norm": 0.35742175579071045,
+      "learning_rate": 0.00013803005008347245,
+      "loss": 0.6443,
+      "step": 934
+    },
+    {
+      "epoch": 0.00032819365110259464,
+      "grad_norm": 0.33582496643066406,
+      "learning_rate": 0.00013796327212020035,
+      "loss": 0.6361,
+      "step": 935
+    },
+    {
+      "epoch": 0.0003285446603551108,
+      "grad_norm": 0.33403804898262024,
+      "learning_rate": 0.00013789649415692822,
+      "loss": 0.5911,
+      "step": 936
+    },
+    {
+      "epoch": 0.00032889566960762694,
+      "grad_norm": 0.4263191521167755,
+      "learning_rate": 0.0001378297161936561,
+      "loss": 0.5243,
+      "step": 937
+    },
+    {
+      "epoch": 0.00032924667886014307,
+      "grad_norm": 0.31543296575546265,
+      "learning_rate": 0.00013776293823038397,
+      "loss": 0.554,
+      "step": 938
+    },
+    {
+      "epoch": 0.00032959768811265924,
+      "grad_norm": 0.38975203037261963,
+      "learning_rate": 0.00013769616026711184,
+      "loss": 0.5358,
+      "step": 939
+    },
+    {
+      "epoch": 0.00032994869736517536,
+      "grad_norm": 0.3175157904624939,
+      "learning_rate": 0.00013762938230383974,
+      "loss": 0.5385,
+      "step": 940
+    },
+    {
+      "epoch": 0.00033029970661769154,
+      "grad_norm": 0.32753151655197144,
+      "learning_rate": 0.00013756260434056762,
+      "loss": 0.5191,
+      "step": 941
+    },
+    {
+      "epoch": 0.00033065071587020766,
+      "grad_norm": 0.2516227066516876,
+      "learning_rate": 0.0001374958263772955,
+      "loss": 0.3496,
+      "step": 942
+    },
+    {
+      "epoch": 0.0003310017251227238,
+      "grad_norm": 0.275806188583374,
+      "learning_rate": 0.0001374290484140234,
+      "loss": 0.4197,
+      "step": 943
+    },
+    {
+      "epoch": 0.00033135273437523996,
+      "grad_norm": 0.30234864354133606,
+      "learning_rate": 0.00013736227045075126,
+      "loss": 0.4909,
+      "step": 944
+    },
+    {
+      "epoch": 0.0003317037436277561,
+      "grad_norm": 0.32561683654785156,
+      "learning_rate": 0.00013729549248747916,
+      "loss": 0.5865,
+      "step": 945
+    },
+    {
+      "epoch": 0.00033205475288027226,
+      "grad_norm": 0.32075145840644836,
+      "learning_rate": 0.00013722871452420704,
+      "loss": 0.5957,
+      "step": 946
+    },
+    {
+      "epoch": 0.0003324057621327884,
+      "grad_norm": 0.3077705204486847,
+      "learning_rate": 0.0001371619365609349,
+      "loss": 0.6026,
+      "step": 947
+    },
+    {
+      "epoch": 0.0003327567713853045,
+      "grad_norm": 0.3092177212238312,
+      "learning_rate": 0.00013709515859766278,
+      "loss": 0.553,
+      "step": 948
+    },
+    {
+      "epoch": 0.0003331077806378207,
+      "grad_norm": 0.3611501157283783,
+      "learning_rate": 0.00013702838063439065,
+      "loss": 0.5707,
+      "step": 949
+    },
+    {
+      "epoch": 0.0003334587898903368,
+      "grad_norm": 0.3343827724456787,
+      "learning_rate": 0.00013696160267111853,
+      "loss": 0.5626,
+      "step": 950
+    },
+    {
+      "epoch": 0.000333809799142853,
+      "grad_norm": 0.3330281376838684,
+      "learning_rate": 0.00013689482470784643,
+      "loss": 0.6353,
+      "step": 951
+    },
+    {
+      "epoch": 0.0003341608083953691,
+      "grad_norm": 0.4045816957950592,
+      "learning_rate": 0.0001368280467445743,
+      "loss": 0.5781,
+      "step": 952
+    },
+    {
+      "epoch": 0.0003345118176478852,
+      "grad_norm": 0.3618166446685791,
+      "learning_rate": 0.00013676126878130217,
+      "loss": 0.6702,
+      "step": 953
+    },
+    {
+      "epoch": 0.0003348628269004014,
+      "grad_norm": 0.2836553752422333,
+      "learning_rate": 0.00013669449081803005,
+      "loss": 0.4371,
+      "step": 954
+    },
+    {
+      "epoch": 0.0003352138361529175,
+      "grad_norm": 0.3100498914718628,
+      "learning_rate": 0.00013662771285475792,
+      "loss": 0.5184,
+      "step": 955
+    },
+    {
+      "epoch": 0.0003355648454054337,
+      "grad_norm": 0.34877723455429077,
+      "learning_rate": 0.00013656093489148582,
+      "loss": 0.4778,
+      "step": 956
+    },
+    {
+      "epoch": 0.0003359158546579498,
+      "grad_norm": 0.27756938338279724,
+      "learning_rate": 0.0001364941569282137,
+      "loss": 0.4314,
+      "step": 957
+    },
+    {
+      "epoch": 0.00033626686391046595,
+      "grad_norm": 0.36129051446914673,
+      "learning_rate": 0.00013642737896494157,
+      "loss": 0.5837,
+      "step": 958
+    },
+    {
+      "epoch": 0.0003366178731629821,
+      "grad_norm": 0.35625776648521423,
+      "learning_rate": 0.00013636060100166944,
+      "loss": 0.5579,
+      "step": 959
+    },
+    {
+      "epoch": 0.00033696888241549825,
+      "grad_norm": 0.3735104501247406,
+      "learning_rate": 0.00013629382303839734,
+      "loss": 0.5283,
+      "step": 960
+    },
+    {
+      "epoch": 0.0003373198916680144,
+      "grad_norm": 0.34185606241226196,
+      "learning_rate": 0.00013622704507512521,
+      "loss": 0.5669,
+      "step": 961
+    },
+    {
+      "epoch": 0.00033767090092053054,
+      "grad_norm": 0.29324260354042053,
+      "learning_rate": 0.00013616026711185311,
+      "loss": 0.4468,
+      "step": 962
+    },
+    {
+      "epoch": 0.00033802191017304667,
+      "grad_norm": 0.3439052700996399,
+      "learning_rate": 0.000136093489148581,
+      "loss": 0.5196,
+      "step": 963
+    },
+    {
+      "epoch": 0.00033837291942556284,
+      "grad_norm": 0.3536570370197296,
+      "learning_rate": 0.00013602671118530886,
+      "loss": 0.5251,
+      "step": 964
+    },
+    {
+      "epoch": 0.00033872392867807897,
+      "grad_norm": 0.4759911298751831,
+      "learning_rate": 0.00013595993322203673,
+      "loss": 0.7017,
+      "step": 965
+    },
+    {
+      "epoch": 0.00033907493793059514,
+      "grad_norm": 0.2958674728870392,
+      "learning_rate": 0.0001358931552587646,
+      "loss": 0.4936,
+      "step": 966
+    },
+    {
+      "epoch": 0.00033942594718311126,
+      "grad_norm": 0.32770562171936035,
+      "learning_rate": 0.0001358263772954925,
+      "loss": 0.5741,
+      "step": 967
+    },
+    {
+      "epoch": 0.0003397769564356274,
+      "grad_norm": 0.35697153210639954,
+      "learning_rate": 0.00013575959933222038,
+      "loss": 0.428,
+      "step": 968
+    },
+    {
+      "epoch": 0.00034012796568814356,
+      "grad_norm": 0.3409043252468109,
+      "learning_rate": 0.00013569282136894825,
+      "loss": 0.6142,
+      "step": 969
+    },
+    {
+      "epoch": 0.0003404789749406597,
+      "grad_norm": 0.47055551409721375,
+      "learning_rate": 0.00013562604340567613,
+      "loss": 0.463,
+      "step": 970
+    },
+    {
+      "epoch": 0.00034082998419317586,
+      "grad_norm": 0.38270413875579834,
+      "learning_rate": 0.000135559265442404,
+      "loss": 0.462,
+      "step": 971
+    },
+    {
+      "epoch": 0.000341180993445692,
+      "grad_norm": 0.26209867000579834,
+      "learning_rate": 0.0001354924874791319,
+      "loss": 0.5341,
+      "step": 972
+    },
+    {
+      "epoch": 0.0003415320026982081,
+      "grad_norm": 0.37498748302459717,
+      "learning_rate": 0.00013542570951585977,
+      "loss": 0.5196,
+      "step": 973
+    },
+    {
+      "epoch": 0.0003418830119507243,
+      "grad_norm": 0.36789608001708984,
+      "learning_rate": 0.00013535893155258765,
+      "loss": 0.4723,
+      "step": 974
+    },
+    {
+      "epoch": 0.0003422340212032404,
+      "grad_norm": 0.33915975689888,
+      "learning_rate": 0.00013529215358931552,
+      "loss": 0.5511,
+      "step": 975
+    },
+    {
+      "epoch": 0.0003425850304557566,
+      "grad_norm": 0.43045058846473694,
+      "learning_rate": 0.0001352253756260434,
+      "loss": 0.5667,
+      "step": 976
+    },
+    {
+      "epoch": 0.0003429360397082727,
+      "grad_norm": 0.2948949933052063,
+      "learning_rate": 0.0001351585976627713,
+      "loss": 0.4804,
+      "step": 977
+    },
+    {
+      "epoch": 0.00034328704896078883,
+      "grad_norm": 0.3249470889568329,
+      "learning_rate": 0.00013509181969949917,
+      "loss": 0.6041,
+      "step": 978
+    },
+    {
+      "epoch": 0.000343638058213305,
+      "grad_norm": 0.2865908741950989,
+      "learning_rate": 0.00013502504173622707,
+      "loss": 0.5617,
+      "step": 979
+    },
+    {
+      "epoch": 0.0003439890674658211,
+      "grad_norm": 0.3190818428993225,
+      "learning_rate": 0.00013495826377295494,
+      "loss": 0.4902,
+      "step": 980
+    },
+    {
+      "epoch": 0.00034434007671833725,
+      "grad_norm": 0.3111664950847626,
+      "learning_rate": 0.0001348914858096828,
+      "loss": 0.5504,
+      "step": 981
+    },
+    {
+      "epoch": 0.0003446910859708534,
+      "grad_norm": 0.3255857229232788,
+      "learning_rate": 0.00013482470784641069,
+      "loss": 0.5592,
+      "step": 982
+    },
+    {
+      "epoch": 0.00034504209522336955,
+      "grad_norm": 0.30806589126586914,
+      "learning_rate": 0.00013475792988313859,
+      "loss": 0.5567,
+      "step": 983
+    },
+    {
+      "epoch": 0.0003453931044758857,
+      "grad_norm": 0.33785945177078247,
+      "learning_rate": 0.00013469115191986646,
+      "loss": 0.5881,
+      "step": 984
+    },
+    {
+      "epoch": 0.00034574411372840185,
+      "grad_norm": 0.34626781940460205,
+      "learning_rate": 0.00013462437395659433,
+      "loss": 0.578,
+      "step": 985
+    },
+    {
+      "epoch": 0.00034609512298091797,
+      "grad_norm": 0.367034912109375,
+      "learning_rate": 0.0001345575959933222,
+      "loss": 0.5893,
+      "step": 986
+    },
+    {
+      "epoch": 0.00034644613223343415,
+      "grad_norm": 0.37824952602386475,
+      "learning_rate": 0.00013449081803005008,
+      "loss": 0.5681,
+      "step": 987
+    },
+    {
+      "epoch": 0.00034679714148595027,
+      "grad_norm": 0.4054035544395447,
+      "learning_rate": 0.00013442404006677798,
+      "loss": 0.6108,
+      "step": 988
+    },
+    {
+      "epoch": 0.00034714815073846645,
+      "grad_norm": 0.4374067485332489,
+      "learning_rate": 0.00013435726210350585,
+      "loss": 0.6002,
+      "step": 989
+    },
+    {
+      "epoch": 0.00034749915999098257,
+      "grad_norm": 0.3554278016090393,
+      "learning_rate": 0.00013429048414023373,
+      "loss": 0.6444,
+      "step": 990
+    },
+    {
+      "epoch": 0.0003478501692434987,
+      "grad_norm": 0.3428646922111511,
+      "learning_rate": 0.0001342237061769616,
+      "loss": 0.6527,
+      "step": 991
+    },
+    {
+      "epoch": 0.00034820117849601487,
+      "grad_norm": 0.25603657960891724,
+      "learning_rate": 0.00013415692821368947,
+      "loss": 0.5244,
+      "step": 992
+    },
+    {
+      "epoch": 0.000348552187748531,
+      "grad_norm": 0.35237595438957214,
+      "learning_rate": 0.00013409015025041737,
+      "loss": 0.557,
+      "step": 993
+    },
+    {
+      "epoch": 0.00034890319700104717,
+      "grad_norm": 0.33666110038757324,
+      "learning_rate": 0.00013402337228714524,
+      "loss": 0.5674,
+      "step": 994
+    },
+    {
+      "epoch": 0.0003492542062535633,
+      "grad_norm": 0.30283182859420776,
+      "learning_rate": 0.00013395659432387312,
+      "loss": 0.6081,
+      "step": 995
+    },
+    {
+      "epoch": 0.0003496052155060794,
+      "grad_norm": 0.30893146991729736,
+      "learning_rate": 0.00013388981636060102,
+      "loss": 0.6089,
+      "step": 996
+    },
+    {
+      "epoch": 0.0003499562247585956,
+      "grad_norm": 0.2617473304271698,
+      "learning_rate": 0.0001338230383973289,
+      "loss": 0.6104,
+      "step": 997
+    },
+    {
+      "epoch": 0.0003503072340111117,
+      "grad_norm": 0.29493093490600586,
+      "learning_rate": 0.00013375626043405676,
+      "loss": 0.5047,
+      "step": 998
+    },
+    {
+      "epoch": 0.0003506582432636279,
+      "grad_norm": 0.3991663157939911,
+      "learning_rate": 0.00013368948247078466,
+      "loss": 0.5137,
+      "step": 999
+    },
+    {
+      "epoch": 0.000351009252516144,
+      "grad_norm": 0.31760329008102417,
+      "learning_rate": 0.00013362270450751254,
+      "loss": 0.4371,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.846828653872742e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}