diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-2000/trainer_state.json"
@@ -0,0 +1,14034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.64,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00032,
+      "grad_norm": 0.24713319540023804,
+      "learning_rate": 0.0,
+      "loss": 0.8884,
+      "step": 1
+    },
+    {
+      "epoch": 0.00064,
+      "grad_norm": 0.20603826642036438,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.8479,
+      "step": 2
+    },
+    {
+      "epoch": 0.00096,
+      "grad_norm": 0.24543610215187073,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 0.9853,
+      "step": 3
+    },
+    {
+      "epoch": 0.00128,
+      "grad_norm": 0.2051621973514557,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.8293,
+      "step": 4
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.23469021916389465,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.9811,
+      "step": 5
+    },
+    {
+      "epoch": 0.00192,
+      "grad_norm": 0.25180304050445557,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 1.039,
+      "step": 6
+    },
+    {
+      "epoch": 0.00224,
+      "grad_norm": 0.23135970532894135,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.8688,
+      "step": 7
+    },
+    {
+      "epoch": 0.00256,
+      "grad_norm": 0.22394850850105286,
+      "learning_rate": 9.333333333333334e-06,
+      "loss": 0.9214,
+      "step": 8
+    },
+    {
+      "epoch": 0.00288,
+      "grad_norm": 0.25566354393959045,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 0.8667,
+      "step": 9
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.2881408631801605,
+      "learning_rate": 1.2e-05,
+      "loss": 0.9544,
+      "step": 10
+    },
+    {
+      "epoch": 0.00352,
+      "grad_norm": 0.22555342316627502,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 0.9383,
+      "step": 11
+    },
+    {
+      "epoch": 0.00384,
+      "grad_norm": 0.23086762428283691,
+      "learning_rate": 1.4666666666666668e-05,
+      "loss": 0.8595,
+      "step": 12
+    },
+    {
+      "epoch": 0.00416,
+      "grad_norm": 0.2487766444683075,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 0.9469,
+      "step": 13
+    },
+    {
+      "epoch": 0.00448,
+      "grad_norm": 0.3970007598400116,
+      "learning_rate": 1.7333333333333336e-05,
+      "loss": 1.1923,
+      "step": 14
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.25204387307167053,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 0.8854,
+      "step": 15
+    },
+    {
+      "epoch": 0.00512,
+      "grad_norm": 0.23282122611999512,
+      "learning_rate": 2e-05,
+      "loss": 0.932,
+      "step": 16
+    },
+    {
+      "epoch": 0.00544,
+      "grad_norm": 0.2607545852661133,
+      "learning_rate": 2.1333333333333335e-05,
+      "loss": 0.9094,
+      "step": 17
+    },
+    {
+      "epoch": 0.00576,
+      "grad_norm": 0.21402348577976227,
+      "learning_rate": 2.2666666666666668e-05,
+      "loss": 0.7879,
+      "step": 18
+    },
+    {
+      "epoch": 0.00608,
+      "grad_norm": 0.23922377824783325,
+      "learning_rate": 2.4e-05,
+      "loss": 0.9193,
+      "step": 19
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.18835392594337463,
+      "learning_rate": 2.5333333333333337e-05,
+      "loss": 0.9786,
+      "step": 20
+    },
+    {
+      "epoch": 0.00672,
+      "grad_norm": 0.1775645613670349,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 0.933,
+      "step": 21
+    },
+    {
+      "epoch": 0.00704,
+      "grad_norm": 0.20945614576339722,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 0.8171,
+      "step": 22
+    },
+    {
+      "epoch": 0.00736,
+      "grad_norm": 0.16134795546531677,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 0.9665,
+      "step": 23
+    },
+    {
+      "epoch": 0.00768,
+      "grad_norm": 0.1454283446073532,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 0.7238,
+      "step": 24
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.15920202434062958,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 0.7932,
+      "step": 25
+    },
+    {
+      "epoch": 0.00832,
+      "grad_norm": 0.16204868257045746,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.8175,
+      "step": 26
+    },
+    {
+      "epoch": 0.00864,
+      "grad_norm": 0.15179912745952606,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 0.9067,
+      "step": 27
+    },
+    {
+      "epoch": 0.00896,
+      "grad_norm": 0.1287967711687088,
+      "learning_rate": 3.6e-05,
+      "loss": 0.8039,
+      "step": 28
+    },
+    {
+      "epoch": 0.00928,
+      "grad_norm": 0.13528791069984436,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 0.8892,
+      "step": 29
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.13059762120246887,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 0.7023,
+      "step": 30
+    },
+    {
+      "epoch": 0.00992,
+      "grad_norm": 0.13119487464427948,
+      "learning_rate": 4e-05,
+      "loss": 0.7607,
+      "step": 31
+    },
+    {
+      "epoch": 0.01024,
+      "grad_norm": 0.12565594911575317,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 0.808,
+      "step": 32
+    },
+    {
+      "epoch": 0.01056,
+      "grad_norm": 0.1259136199951172,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 0.8704,
+      "step": 33
+    },
+    {
+      "epoch": 0.01088,
+      "grad_norm": 0.1319507360458374,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 0.6421,
+      "step": 34
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.1317446082830429,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 0.688,
+      "step": 35
+    },
+    {
+      "epoch": 0.01152,
+      "grad_norm": 0.12360246479511261,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.798,
+      "step": 36
+    },
+    {
+      "epoch": 0.01184,
+      "grad_norm": 0.11974922567605972,
+      "learning_rate": 4.8e-05,
+      "loss": 0.829,
+      "step": 37
+    },
+    {
+      "epoch": 0.01216,
+      "grad_norm": 0.13918066024780273,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 0.8626,
+      "step": 38
+    },
+    {
+      "epoch": 0.01248,
+      "grad_norm": 0.11004938930273056,
+      "learning_rate": 5.0666666666666674e-05,
+      "loss": 0.8056,
+      "step": 39
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.10854440927505493,
+      "learning_rate": 5.2000000000000004e-05,
+      "loss": 0.8773,
+      "step": 40
+    },
+    {
+      "epoch": 0.01312,
+      "grad_norm": 0.10877286642789841,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 0.8947,
+      "step": 41
+    },
+    {
+      "epoch": 0.01344,
+      "grad_norm": 0.12767033278942108,
+      "learning_rate": 5.466666666666666e-05,
+      "loss": 0.7892,
+      "step": 42
+    },
+    {
+      "epoch": 0.01376,
+      "grad_norm": 0.11700518429279327,
+      "learning_rate": 5.6000000000000006e-05,
+      "loss": 0.8598,
+      "step": 43
+    },
+    {
+      "epoch": 0.01408,
+      "grad_norm": 0.12549364566802979,
+      "learning_rate": 5.7333333333333336e-05,
+      "loss": 0.8498,
+      "step": 44
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.11921705305576324,
+      "learning_rate": 5.866666666666667e-05,
+      "loss": 0.7979,
+      "step": 45
+    },
+    {
+      "epoch": 0.01472,
+      "grad_norm": 0.11621945351362228,
+      "learning_rate": 6e-05,
+      "loss": 0.9252,
+      "step": 46
+    },
+    {
+      "epoch": 0.01504,
+      "grad_norm": 0.10912485420703888,
+      "learning_rate": 6.133333333333334e-05,
+      "loss": 0.7674,
+      "step": 47
+    },
+    {
+      "epoch": 0.01536,
+      "grad_norm": 0.11039536446332932,
+      "learning_rate": 6.266666666666667e-05,
+      "loss": 0.841,
+      "step": 48
+    },
+    {
+      "epoch": 0.01568,
+      "grad_norm": 0.10852135717868805,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 0.7991,
+      "step": 49
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.09653560072183609,
+      "learning_rate": 6.533333333333334e-05,
+      "loss": 0.6292,
+      "step": 50
+    },
+    {
+      "epoch": 0.01632,
+      "grad_norm": 0.11932919919490814,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.7327,
+      "step": 51
+    },
+    {
+      "epoch": 0.01664,
+      "grad_norm": 0.10563201457262039,
+      "learning_rate": 6.800000000000001e-05,
+      "loss": 0.7041,
+      "step": 52
+    },
+    {
+      "epoch": 0.01696,
+      "grad_norm": 0.1063474714756012,
+      "learning_rate": 6.933333333333334e-05,
+      "loss": 0.7953,
+      "step": 53
+    },
+    {
+      "epoch": 0.01728,
+      "grad_norm": 0.11396024376153946,
+      "learning_rate": 7.066666666666667e-05,
+      "loss": 0.899,
+      "step": 54
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.10861669480800629,
+      "learning_rate": 7.2e-05,
+      "loss": 0.9343,
+      "step": 55
+    },
+    {
+      "epoch": 0.01792,
+      "grad_norm": 0.12476427853107452,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 0.8145,
+      "step": 56
+    },
+    {
+      "epoch": 0.01824,
+      "grad_norm": 0.1240817978978157,
+      "learning_rate": 7.466666666666667e-05,
+      "loss": 0.7133,
+      "step": 57
+    },
+    {
+      "epoch": 0.01856,
+      "grad_norm": 0.10977518558502197,
+      "learning_rate": 7.6e-05,
+      "loss": 0.7576,
+      "step": 58
+    },
+    {
+      "epoch": 0.01888,
+      "grad_norm": 0.11463093012571335,
+      "learning_rate": 7.733333333333333e-05,
+      "loss": 0.8552,
+      "step": 59
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.1143040657043457,
+      "learning_rate": 7.866666666666666e-05,
+      "loss": 0.9198,
+      "step": 60
+    },
+    {
+      "epoch": 0.01952,
+      "grad_norm": 0.12839296460151672,
+      "learning_rate": 8e-05,
+      "loss": 0.6644,
+      "step": 61
+    },
+    {
+      "epoch": 0.01984,
+      "grad_norm": 0.11903133243322372,
+      "learning_rate": 8.133333333333334e-05,
+      "loss": 0.9053,
+      "step": 62
+    },
+    {
+      "epoch": 0.02016,
+      "grad_norm": 0.12296755611896515,
+      "learning_rate": 8.266666666666667e-05,
+      "loss": 0.7695,
+      "step": 63
+    },
+    {
+      "epoch": 0.02048,
+      "grad_norm": 0.11100097000598907,
+      "learning_rate": 8.4e-05,
+      "loss": 0.8909,
+      "step": 64
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.11848391592502594,
+      "learning_rate": 8.533333333333334e-05,
+      "loss": 0.8204,
+      "step": 65
+    },
+    {
+      "epoch": 0.02112,
+      "grad_norm": 0.11903996020555496,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 0.7778,
+      "step": 66
+    },
+    {
+      "epoch": 0.02144,
+      "grad_norm": 0.10964814573526382,
+      "learning_rate": 8.800000000000001e-05,
+      "loss": 0.6879,
+      "step": 67
+    },
+    {
+      "epoch": 0.02176,
+      "grad_norm": 0.11036152392625809,
+      "learning_rate": 8.933333333333334e-05,
+      "loss": 0.8051,
+      "step": 68
+    },
+    {
+      "epoch": 0.02208,
+      "grad_norm": 0.10888101160526276,
+      "learning_rate": 9.066666666666667e-05,
+      "loss": 0.759,
+      "step": 69
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.11158827692270279,
+      "learning_rate": 9.200000000000001e-05,
+      "loss": 0.7412,
+      "step": 70
+    },
+    {
+      "epoch": 0.02272,
+      "grad_norm": 0.12085185945034027,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.7978,
+      "step": 71
+    },
+    {
+      "epoch": 0.02304,
+      "grad_norm": 0.12325625121593475,
+      "learning_rate": 9.466666666666667e-05,
+      "loss": 0.7175,
+      "step": 72
+    },
+    {
+      "epoch": 0.02336,
+      "grad_norm": 0.10695890337228775,
+      "learning_rate": 9.6e-05,
+      "loss": 0.868,
+      "step": 73
+    },
+    {
+      "epoch": 0.02368,
+      "grad_norm": 0.11983097344636917,
+      "learning_rate": 9.733333333333335e-05,
+      "loss": 0.7965,
+      "step": 74
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.11521439999341965,
+      "learning_rate": 9.866666666666668e-05,
+      "loss": 0.9305,
+      "step": 75
+    },
+    {
+      "epoch": 0.02432,
+      "grad_norm": 0.1252303123474121,
+      "learning_rate": 0.0001,
+      "loss": 0.6807,
+      "step": 76
+    },
+    {
+      "epoch": 0.02464,
+      "grad_norm": 0.1265154629945755,
+      "learning_rate": 0.00010133333333333335,
+      "loss": 0.718,
+      "step": 77
+    },
+    {
+      "epoch": 0.02496,
+      "grad_norm": 0.1109502986073494,
+      "learning_rate": 0.00010266666666666666,
+      "loss": 0.769,
+      "step": 78
+    },
+    {
+      "epoch": 0.02528,
+      "grad_norm": 0.11158560961484909,
+      "learning_rate": 0.00010400000000000001,
+      "loss": 0.8579,
+      "step": 79
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.13526469469070435,
+      "learning_rate": 0.00010533333333333332,
+      "loss": 0.7704,
+      "step": 80
+    },
+    {
+      "epoch": 0.02592,
+      "grad_norm": 0.12420251965522766,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 0.673,
+      "step": 81
+    },
+    {
+      "epoch": 0.02624,
+      "grad_norm": 0.11838296055793762,
+      "learning_rate": 0.00010800000000000001,
+      "loss": 0.8576,
+      "step": 82
+    },
+    {
+      "epoch": 0.02656,
+      "grad_norm": 0.11958430707454681,
+      "learning_rate": 0.00010933333333333333,
+      "loss": 0.8521,
+      "step": 83
+    },
+    {
+      "epoch": 0.02688,
+      "grad_norm": 0.12049078941345215,
+      "learning_rate": 0.00011066666666666667,
+      "loss": 0.6008,
+      "step": 84
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.10999640822410583,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 0.7351,
+      "step": 85
+    },
+    {
+      "epoch": 0.02752,
+      "grad_norm": 0.11500285565853119,
+      "learning_rate": 0.00011333333333333334,
+      "loss": 0.7841,
+      "step": 86
+    },
+    {
+      "epoch": 0.02784,
+      "grad_norm": 0.11917781829833984,
+      "learning_rate": 0.00011466666666666667,
+      "loss": 0.8275,
+      "step": 87
+    },
+    {
+      "epoch": 0.02816,
+      "grad_norm": 0.11745665222406387,
+      "learning_rate": 0.000116,
+      "loss": 0.6096,
+      "step": 88
+    },
+    {
+      "epoch": 0.02848,
+      "grad_norm": 0.1208786815404892,
+      "learning_rate": 0.00011733333333333334,
+      "loss": 0.6842,
+      "step": 89
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.12839952111244202,
+      "learning_rate": 0.00011866666666666669,
+      "loss": 0.7467,
+      "step": 90
+    },
+    {
+      "epoch": 0.02912,
+      "grad_norm": 0.12688350677490234,
+      "learning_rate": 0.00012,
+      "loss": 0.6507,
+      "step": 91
+    },
+    {
+      "epoch": 0.02944,
+      "grad_norm": 0.11489767581224442,
+      "learning_rate": 0.00012133333333333335,
+      "loss": 0.8403,
+      "step": 92
+    },
+    {
+      "epoch": 0.02976,
+      "grad_norm": 0.1305433064699173,
+      "learning_rate": 0.00012266666666666668,
+      "loss": 0.748,
+      "step": 93
+    },
+    {
+      "epoch": 0.03008,
+      "grad_norm": 0.13992320001125336,
+      "learning_rate": 0.000124,
+      "loss": 0.796,
+      "step": 94
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.12120400369167328,
+      "learning_rate": 0.00012533333333333334,
+      "loss": 0.6445,
+      "step": 95
+    },
+    {
+      "epoch": 0.03072,
+      "grad_norm": 0.13254240155220032,
+      "learning_rate": 0.00012666666666666666,
+      "loss": 0.7488,
+      "step": 96
+    },
+    {
+      "epoch": 0.03104,
+      "grad_norm": 0.13790103793144226,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 0.8005,
+      "step": 97
+    },
+    {
+      "epoch": 0.03136,
+      "grad_norm": 0.12862953543663025,
+      "learning_rate": 0.00012933333333333332,
+      "loss": 0.7975,
+      "step": 98
+    },
+    {
+      "epoch": 0.03168,
+      "grad_norm": 0.14197330176830292,
+      "learning_rate": 0.00013066666666666668,
+      "loss": 0.9026,
+      "step": 99
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.12494270503520966,
+      "learning_rate": 0.000132,
+      "loss": 0.7353,
+      "step": 100
+    },
+    {
+      "epoch": 0.03232,
+      "grad_norm": 0.12572987377643585,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.9271,
+      "step": 101
+    },
+    {
+      "epoch": 0.03264,
+      "grad_norm": 0.1314753144979477,
+      "learning_rate": 0.00013466666666666667,
+      "loss": 0.7816,
+      "step": 102
+    },
+    {
+      "epoch": 0.03296,
+      "grad_norm": 0.12902331352233887,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 0.7348,
+      "step": 103
+    },
+    {
+      "epoch": 0.03328,
+      "grad_norm": 0.14086449146270752,
+      "learning_rate": 0.00013733333333333333,
+      "loss": 0.831,
+      "step": 104
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.12464585155248642,
+      "learning_rate": 0.00013866666666666669,
+      "loss": 0.6945,
+      "step": 105
+    },
+    {
+      "epoch": 0.03392,
+      "grad_norm": 0.11436092108488083,
+      "learning_rate": 0.00014,
+      "loss": 0.8192,
+      "step": 106
+    },
+    {
+      "epoch": 0.03424,
+      "grad_norm": 0.11779410392045975,
+      "learning_rate": 0.00014133333333333334,
+      "loss": 0.7989,
+      "step": 107
+    },
+    {
+      "epoch": 0.03456,
+      "grad_norm": 0.14601927995681763,
+      "learning_rate": 0.00014266666666666667,
+      "loss": 0.6256,
+      "step": 108
+    },
+    {
+      "epoch": 0.03488,
+      "grad_norm": 0.13231013715267181,
+      "learning_rate": 0.000144,
+      "loss": 0.7445,
+      "step": 109
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.11249889433383942,
+      "learning_rate": 0.00014533333333333333,
+      "loss": 0.7338,
+      "step": 110
+    },
+    {
+      "epoch": 0.03552,
+      "grad_norm": 0.11275426298379898,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.7993,
+      "step": 111
+    },
+    {
+      "epoch": 0.03584,
+      "grad_norm": 0.11678344756364822,
+      "learning_rate": 0.000148,
+      "loss": 0.9234,
+      "step": 112
+    },
+    {
+      "epoch": 0.03616,
+      "grad_norm": 0.10879474133253098,
+      "learning_rate": 0.00014933333333333335,
+      "loss": 0.8732,
+      "step": 113
+    },
+    {
+      "epoch": 0.03648,
+      "grad_norm": 0.13673344254493713,
+      "learning_rate": 0.00015066666666666668,
+      "loss": 0.8698,
+      "step": 114
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.12564410269260406,
+      "learning_rate": 0.000152,
+      "loss": 0.7913,
+      "step": 115
+    },
+    {
+      "epoch": 0.03712,
+      "grad_norm": 0.12272034585475922,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.7244,
+      "step": 116
+    },
+    {
+      "epoch": 0.03744,
+      "grad_norm": 0.11841209977865219,
+      "learning_rate": 0.00015466666666666667,
+      "loss": 0.7648,
+      "step": 117
+    },
+    {
+      "epoch": 0.03776,
+      "grad_norm": 0.11938042938709259,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 0.8381,
+      "step": 118
+    },
+    {
+      "epoch": 0.03808,
+      "grad_norm": 0.13476668298244476,
+      "learning_rate": 0.00015733333333333333,
+      "loss": 0.6931,
+      "step": 119
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.1274396926164627,
+      "learning_rate": 0.00015866666666666668,
+      "loss": 0.854,
+      "step": 120
+    },
+    {
+      "epoch": 0.03872,
+      "grad_norm": 0.11518494784832001,
+      "learning_rate": 0.00016,
+      "loss": 0.7591,
+      "step": 121
+    },
+    {
+      "epoch": 0.03904,
+      "grad_norm": 0.12739014625549316,
+      "learning_rate": 0.00016133333333333334,
+      "loss": 0.6956,
+      "step": 122
+    },
+    {
+      "epoch": 0.03936,
+      "grad_norm": 0.13384665548801422,
+      "learning_rate": 0.00016266666666666667,
+      "loss": 0.6487,
+      "step": 123
+    },
+    {
+      "epoch": 0.03968,
+      "grad_norm": 0.11407941579818726,
+      "learning_rate": 0.000164,
+      "loss": 0.6822,
+      "step": 124
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12307750433683395,
+      "learning_rate": 0.00016533333333333333,
+      "loss": 0.7038,
+      "step": 125
+    },
+    {
+      "epoch": 0.04032,
+      "grad_norm": 0.12420140206813812,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.6589,
+      "step": 126
+    },
+    {
+      "epoch": 0.04064,
+      "grad_norm": 0.1327451914548874,
+      "learning_rate": 0.000168,
+      "loss": 0.6002,
+      "step": 127
+    },
+    {
+      "epoch": 0.04096,
+      "grad_norm": 0.1248873695731163,
+      "learning_rate": 0.00016933333333333335,
+      "loss": 0.8812,
+      "step": 128
+    },
+    {
+      "epoch": 0.04128,
+      "grad_norm": 0.11638613790273666,
+      "learning_rate": 0.00017066666666666668,
+      "loss": 0.7455,
+      "step": 129
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.12473749369382858,
+      "learning_rate": 0.000172,
+      "loss": 0.6218,
+      "step": 130
+    },
+    {
+      "epoch": 0.04192,
+      "grad_norm": 0.1339467465877533,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.729,
+      "step": 131
+    },
+    {
+      "epoch": 0.04224,
+      "grad_norm": 0.11078035831451416,
+      "learning_rate": 0.00017466666666666667,
+      "loss": 0.8507,
+      "step": 132
+    },
+    {
+      "epoch": 0.04256,
+      "grad_norm": 0.11885277926921844,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.8817,
+      "step": 133
+    },
+    {
+      "epoch": 0.04288,
+      "grad_norm": 0.13378757238388062,
+      "learning_rate": 0.00017733333333333335,
+      "loss": 0.705,
+      "step": 134
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.11696331948041916,
+      "learning_rate": 0.00017866666666666668,
+      "loss": 0.7089,
+      "step": 135
+    },
+    {
+      "epoch": 0.04352,
+      "grad_norm": 0.12444671988487244,
+      "learning_rate": 0.00018,
+      "loss": 0.699,
+      "step": 136
+    },
+    {
+      "epoch": 0.04384,
+      "grad_norm": 0.11136961728334427,
+      "learning_rate": 0.00018133333333333334,
+      "loss": 0.7805,
+      "step": 137
+    },
+    {
+      "epoch": 0.04416,
+      "grad_norm": 0.13461412489414215,
+      "learning_rate": 0.00018266666666666667,
+      "loss": 0.7581,
+      "step": 138
+    },
+    {
+      "epoch": 0.04448,
+      "grad_norm": 0.11344683915376663,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.7234,
+      "step": 139
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.12247714400291443,
+      "learning_rate": 0.00018533333333333333,
+      "loss": 0.9326,
+      "step": 140
+    },
+    {
+      "epoch": 0.04512,
+      "grad_norm": 0.12991179525852203,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.7644,
+      "step": 141
+    },
+    {
+      "epoch": 0.04544,
+      "grad_norm": 0.1182120144367218,
+      "learning_rate": 0.000188,
+      "loss": 0.8437,
+      "step": 142
+    },
+    {
+      "epoch": 0.04576,
+      "grad_norm": 0.10737155377864838,
+      "learning_rate": 0.00018933333333333335,
+      "loss": 0.8047,
+      "step": 143
+    },
+    {
+      "epoch": 0.04608,
+      "grad_norm": 0.11993440985679626,
+      "learning_rate": 0.00019066666666666668,
+      "loss": 0.7247,
+      "step": 144
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.11705442517995834,
+      "learning_rate": 0.000192,
+      "loss": 0.7104,
+      "step": 145
+    },
+    {
+      "epoch": 0.04672,
+      "grad_norm": 0.12065937370061874,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.7243,
+      "step": 146
+    },
+    {
+      "epoch": 0.04704,
+      "grad_norm": 0.1178629919886589,
+      "learning_rate": 0.0001946666666666667,
+      "loss": 0.6946,
+      "step": 147
+    },
+    {
+      "epoch": 0.04736,
+      "grad_norm": 0.12744340300559998,
+      "learning_rate": 0.000196,
+      "loss": 0.6946,
+      "step": 148
+    },
+    {
+      "epoch": 0.04768,
+      "grad_norm": 0.14068365097045898,
+      "learning_rate": 0.00019733333333333335,
+      "loss": 0.7905,
+      "step": 149
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.11114205420017242,
+      "learning_rate": 0.00019866666666666668,
+      "loss": 0.8352,
+      "step": 150
+    },
+    {
+      "epoch": 0.04832,
+      "grad_norm": 0.10463167726993561,
+      "learning_rate": 0.0002,
+      "loss": 0.6664,
+      "step": 151
+    },
+    {
+      "epoch": 0.04864,
+      "grad_norm": 0.12237433344125748,
+      "learning_rate": 0.00019995876288659794,
+      "loss": 0.8114,
+      "step": 152
+    },
+    {
+      "epoch": 0.04896,
+      "grad_norm": 0.12069843709468842,
+      "learning_rate": 0.0001999175257731959,
+      "loss": 0.7901,
+      "step": 153
+    },
+    {
+      "epoch": 0.04928,
+      "grad_norm": 0.11398322135210037,
+      "learning_rate": 0.00019987628865979383,
+      "loss": 0.7131,
+      "step": 154
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.12183695286512375,
+      "learning_rate": 0.00019983505154639176,
+      "loss": 0.7609,
+      "step": 155
+    },
+    {
+      "epoch": 0.04992,
+      "grad_norm": 0.12869080901145935,
+      "learning_rate": 0.0001997938144329897,
+      "loss": 0.8103,
+      "step": 156
+    },
+    {
+      "epoch": 0.05024,
+      "grad_norm": 0.1119738221168518,
+      "learning_rate": 0.00019975257731958762,
+      "loss": 0.6731,
+      "step": 157
+    },
+    {
+      "epoch": 0.05056,
+      "grad_norm": 0.12156844139099121,
+      "learning_rate": 0.00019971134020618558,
+      "loss": 0.7535,
+      "step": 158
+    },
+    {
+      "epoch": 0.05088,
+      "grad_norm": 0.12900514900684357,
+      "learning_rate": 0.00019967010309278351,
+      "loss": 0.7378,
+      "step": 159
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.12743592262268066,
+      "learning_rate": 0.00019962886597938147,
+      "loss": 0.6197,
+      "step": 160
+    },
+    {
+      "epoch": 0.05152,
+      "grad_norm": 0.128151997923851,
+      "learning_rate": 0.0001995876288659794,
+      "loss": 0.7812,
+      "step": 161
+    },
+    {
+      "epoch": 0.05184,
+      "grad_norm": 0.10753507912158966,
+      "learning_rate": 0.00019954639175257733,
+      "loss": 0.5774,
+      "step": 162
+    },
+    {
+      "epoch": 0.05216,
+      "grad_norm": 0.11813949793577194,
+      "learning_rate": 0.00019950515463917527,
+      "loss": 0.6894,
+      "step": 163
+    },
+    {
+      "epoch": 0.05248,
+      "grad_norm": 0.1340881586074829,
+      "learning_rate": 0.0001994639175257732,
+      "loss": 0.7193,
+      "step": 164
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.10451763868331909,
+      "learning_rate": 0.00019942268041237116,
+      "loss": 0.572,
+      "step": 165
+    },
+    {
+      "epoch": 0.05312,
+      "grad_norm": 0.10894517600536346,
+      "learning_rate": 0.0001993814432989691,
+      "loss": 0.7693,
+      "step": 166
+    },
+    {
+      "epoch": 0.05344,
+      "grad_norm": 0.1243942379951477,
+      "learning_rate": 0.00019934020618556702,
+      "loss": 0.7805,
+      "step": 167
+    },
+    {
+      "epoch": 0.05376,
+      "grad_norm": 0.14128854870796204,
+      "learning_rate": 0.00019929896907216498,
+      "loss": 0.8595,
+      "step": 168
+    },
+    {
+      "epoch": 0.05408,
+      "grad_norm": 0.12148380279541016,
+      "learning_rate": 0.00019925773195876288,
+      "loss": 0.8954,
+      "step": 169
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.10860492289066315,
+      "learning_rate": 0.0001992164948453608,
+      "loss": 0.8101,
+      "step": 170
+    },
+    {
+      "epoch": 0.05472,
+      "grad_norm": 0.12292741239070892,
+      "learning_rate": 0.00019917525773195877,
+      "loss": 0.7965,
+      "step": 171
+    },
+    {
+      "epoch": 0.05504,
+      "grad_norm": 0.13840395212173462,
+      "learning_rate": 0.0001991340206185567,
+      "loss": 0.7231,
+      "step": 172
+    },
+    {
+      "epoch": 0.05536,
+      "grad_norm": 0.11291699856519699,
+      "learning_rate": 0.00019909278350515466,
+      "loss": 0.8247,
+      "step": 173
+    },
+    {
+      "epoch": 0.05568,
+      "grad_norm": 0.11811841279268265,
+      "learning_rate": 0.0001990515463917526,
+      "loss": 0.8504,
+      "step": 174
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.11040613055229187,
+      "learning_rate": 0.00019901030927835052,
+      "loss": 0.6464,
+      "step": 175
+    },
+    {
+      "epoch": 0.05632,
+      "grad_norm": 0.10370033234357834,
+      "learning_rate": 0.00019896907216494845,
+      "loss": 0.8016,
+      "step": 176
+    },
+    {
+      "epoch": 0.05664,
+      "grad_norm": 0.11868111789226532,
+      "learning_rate": 0.00019892783505154639,
+      "loss": 0.7267,
+      "step": 177
+    },
+    {
+      "epoch": 0.05696,
+      "grad_norm": 0.11893659085035324,
+      "learning_rate": 0.00019888659793814434,
+      "loss": 0.814,
+      "step": 178
+    },
+    {
+      "epoch": 0.05728,
+      "grad_norm": 0.11829444020986557,
+      "learning_rate": 0.00019884536082474227,
+      "loss": 0.7852,
+      "step": 179
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.10493418574333191,
+      "learning_rate": 0.0001988041237113402,
+      "loss": 0.6154,
+      "step": 180
+    },
+    {
+      "epoch": 0.05792,
+      "grad_norm": 0.10895421355962753,
+      "learning_rate": 0.00019876288659793816,
+      "loss": 0.8663,
+      "step": 181
+    },
+    {
+      "epoch": 0.05824,
+      "grad_norm": 0.11782484501600266,
+      "learning_rate": 0.0001987216494845361,
+      "loss": 0.7902,
+      "step": 182
+    },
+    {
+      "epoch": 0.05856,
+      "grad_norm": 0.12639066576957703,
+      "learning_rate": 0.00019868041237113403,
+      "loss": 0.7381,
+      "step": 183
+    },
+    {
+      "epoch": 0.05888,
+      "grad_norm": 0.1321602165699005,
+      "learning_rate": 0.00019863917525773196,
+      "loss": 0.9283,
+      "step": 184
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.12468945235013962,
+      "learning_rate": 0.0001985979381443299,
+      "loss": 0.6641,
+      "step": 185
+    },
+    {
+      "epoch": 0.05952,
+      "grad_norm": 0.12850762903690338,
+      "learning_rate": 0.00019855670103092785,
+      "loss": 0.8694,
+      "step": 186
+    },
+    {
+      "epoch": 0.05984,
+      "grad_norm": 0.10586417466402054,
+      "learning_rate": 0.00019851546391752578,
+      "loss": 0.806,
+      "step": 187
+    },
+    {
+      "epoch": 0.06016,
+      "grad_norm": 0.10202761739492416,
+      "learning_rate": 0.00019847422680412374,
+      "loss": 0.684,
+      "step": 188
+    },
+    {
+      "epoch": 0.06048,
+      "grad_norm": 0.11566773056983948,
+      "learning_rate": 0.00019843298969072167,
+      "loss": 0.6737,
+      "step": 189
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.12295495718717575,
+      "learning_rate": 0.0001983917525773196,
+      "loss": 0.6539,
+      "step": 190
+    },
+    {
+      "epoch": 0.06112,
+      "grad_norm": 0.14126336574554443,
+      "learning_rate": 0.00019835051546391753,
+      "loss": 0.8539,
+      "step": 191
+    },
+    {
+      "epoch": 0.06144,
+      "grad_norm": 0.1215134933590889,
+      "learning_rate": 0.00019830927835051546,
+      "loss": 0.7051,
+      "step": 192
+    },
+    {
+      "epoch": 0.06176,
+      "grad_norm": 0.10785145312547684,
+      "learning_rate": 0.00019826804123711342,
+      "loss": 0.7877,
+      "step": 193
+    },
+    {
+      "epoch": 0.06208,
+      "grad_norm": 0.1299588680267334,
+      "learning_rate": 0.00019822680412371135,
+      "loss": 0.7942,
+      "step": 194
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.1072564497590065,
+      "learning_rate": 0.00019818556701030928,
+      "loss": 0.7055,
+      "step": 195
+    },
+    {
+      "epoch": 0.06272,
+      "grad_norm": 0.12069325149059296,
+      "learning_rate": 0.00019814432989690724,
+      "loss": 0.9036,
+      "step": 196
+    },
+    {
+      "epoch": 0.06304,
+      "grad_norm": 0.14454863965511322,
+      "learning_rate": 0.00019810309278350517,
+      "loss": 0.7567,
+      "step": 197
+    },
+    {
+      "epoch": 0.06336,
+      "grad_norm": 0.11882466822862625,
+      "learning_rate": 0.0001980618556701031,
+      "loss": 0.5986,
+      "step": 198
+    },
+    {
+      "epoch": 0.06368,
+      "grad_norm": 0.12222633510828018,
+      "learning_rate": 0.00019802061855670104,
+      "loss": 0.8493,
+      "step": 199
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.10818106681108475,
+      "learning_rate": 0.00019797938144329897,
+      "loss": 0.7932,
+      "step": 200
+    },
+    {
+      "epoch": 0.06432,
+      "grad_norm": 0.1225774884223938,
+      "learning_rate": 0.00019793814432989693,
+      "loss": 0.7501,
+      "step": 201
+    },
+    {
+      "epoch": 0.06464,
+      "grad_norm": 0.13827428221702576,
+      "learning_rate": 0.00019789690721649486,
+      "loss": 0.818,
+      "step": 202
+    },
+    {
+      "epoch": 0.06496,
+      "grad_norm": 0.1145445853471756,
+      "learning_rate": 0.0001978556701030928,
+      "loss": 0.6433,
+      "step": 203
+    },
+    {
+      "epoch": 0.06528,
+      "grad_norm": 0.11928769201040268,
+      "learning_rate": 0.00019781443298969075,
+      "loss": 0.7352,
+      "step": 204
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.11696071177721024,
+      "learning_rate": 0.00019777319587628865,
+      "loss": 0.8499,
+      "step": 205
+    },
+    {
+      "epoch": 0.06592,
+      "grad_norm": 0.1281200349330902,
+      "learning_rate": 0.0001977319587628866,
+      "loss": 0.7526,
+      "step": 206
+    },
+    {
+      "epoch": 0.06624,
+      "grad_norm": 0.1270955204963684,
+      "learning_rate": 0.00019769072164948454,
+      "loss": 0.8456,
+      "step": 207
+    },
+    {
+      "epoch": 0.06656,
+      "grad_norm": 0.12278109043836594,
+      "learning_rate": 0.00019764948453608247,
+      "loss": 0.5972,
+      "step": 208
+    },
+    {
+      "epoch": 0.06688,
+      "grad_norm": 0.10781408101320267,
+      "learning_rate": 0.00019760824742268043,
+      "loss": 0.5967,
+      "step": 209
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.12237494438886642,
+      "learning_rate": 0.00019756701030927836,
+      "loss": 0.7306,
+      "step": 210
+    },
+    {
+      "epoch": 0.06752,
+      "grad_norm": 0.1100955531001091,
+      "learning_rate": 0.00019752577319587632,
+      "loss": 0.7579,
+      "step": 211
+    },
+    {
+      "epoch": 0.06784,
+      "grad_norm": 0.11036559194326401,
+      "learning_rate": 0.00019748453608247422,
+      "loss": 0.6668,
+      "step": 212
+    },
+    {
+      "epoch": 0.06816,
+      "grad_norm": 0.14552150666713715,
+      "learning_rate": 0.00019744329896907216,
+      "loss": 0.6668,
+      "step": 213
+    },
+    {
+      "epoch": 0.06848,
+      "grad_norm": 0.11125028133392334,
+      "learning_rate": 0.0001974020618556701,
+      "loss": 0.7342,
+      "step": 214
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.14329232275485992,
+      "learning_rate": 0.00019736082474226804,
+      "loss": 0.8708,
+      "step": 215
+    },
+    {
+      "epoch": 0.06912,
+      "grad_norm": 0.11911016702651978,
+      "learning_rate": 0.000197319587628866,
+      "loss": 0.7224,
+      "step": 216
+    },
+    {
+      "epoch": 0.06944,
+      "grad_norm": 0.12713277339935303,
+      "learning_rate": 0.00019727835051546393,
+      "loss": 0.7222,
+      "step": 217
+    },
+    {
+      "epoch": 0.06976,
+      "grad_norm": 0.1140274852514267,
+      "learning_rate": 0.00019723711340206187,
+      "loss": 0.627,
+      "step": 218
+    },
+    {
+      "epoch": 0.07008,
+      "grad_norm": 0.1070765033364296,
+      "learning_rate": 0.0001971958762886598,
+      "loss": 0.7514,
+      "step": 219
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.09937503188848495,
+      "learning_rate": 0.00019715463917525773,
+      "loss": 0.7539,
+      "step": 220
+    },
+    {
+      "epoch": 0.07072,
+      "grad_norm": 0.12137589603662491,
+      "learning_rate": 0.0001971134020618557,
+      "loss": 0.68,
+      "step": 221
+    },
+    {
+      "epoch": 0.07104,
+      "grad_norm": 0.11091487109661102,
+      "learning_rate": 0.00019707216494845362,
+      "loss": 0.6904,
+      "step": 222
+    },
+    {
+      "epoch": 0.07136,
+      "grad_norm": 0.10996535420417786,
+      "learning_rate": 0.00019703092783505155,
+      "loss": 0.6854,
+      "step": 223
+    },
+    {
+      "epoch": 0.07168,
+      "grad_norm": 0.129950150847435,
+      "learning_rate": 0.0001969896907216495,
+      "loss": 0.7023,
+      "step": 224
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.13028115034103394,
+      "learning_rate": 0.00019694845360824744,
+      "loss": 0.6479,
+      "step": 225
+    },
+    {
+      "epoch": 0.07232,
+      "grad_norm": 0.1166912168264389,
+      "learning_rate": 0.00019690721649484537,
+      "loss": 0.6498,
+      "step": 226
+    },
+    {
+      "epoch": 0.07264,
+      "grad_norm": 0.1148250624537468,
+      "learning_rate": 0.0001968659793814433,
+      "loss": 0.7461,
+      "step": 227
+    },
+    {
+      "epoch": 0.07296,
+      "grad_norm": 0.12639757990837097,
+      "learning_rate": 0.00019682474226804123,
+      "loss": 0.712,
+      "step": 228
+    },
+    {
+      "epoch": 0.07328,
+      "grad_norm": 0.13229860365390778,
+      "learning_rate": 0.0001967835051546392,
+      "loss": 0.7447,
+      "step": 229
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.11198769509792328,
+      "learning_rate": 0.00019674226804123712,
+      "loss": 0.7329,
+      "step": 230
+    },
+    {
+      "epoch": 0.07392,
+      "grad_norm": 0.11425165086984634,
+      "learning_rate": 0.00019670103092783505,
+      "loss": 0.868,
+      "step": 231
+    },
+    {
+      "epoch": 0.07424,
+      "grad_norm": 0.12034732848405838,
+      "learning_rate": 0.000196659793814433,
+      "loss": 0.7276,
+      "step": 232
+    },
+    {
+      "epoch": 0.07456,
+      "grad_norm": 0.10866102576255798,
+      "learning_rate": 0.00019661855670103094,
+      "loss": 0.7338,
+      "step": 233
+    },
+    {
+      "epoch": 0.07488,
+      "grad_norm": 0.12432979047298431,
+      "learning_rate": 0.00019657731958762887,
+      "loss": 0.8257,
+      "step": 234
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.12356464564800262,
+      "learning_rate": 0.0001965360824742268,
+      "loss": 0.6813,
+      "step": 235
+    },
+    {
+      "epoch": 0.07552,
+      "grad_norm": 0.1002034917473793,
+      "learning_rate": 0.00019649484536082474,
+      "loss": 0.7406,
+      "step": 236
+    },
+    {
+      "epoch": 0.07584,
+      "grad_norm": 0.11528842151165009,
+      "learning_rate": 0.0001964536082474227,
+      "loss": 0.7597,
+      "step": 237
+    },
+    {
+      "epoch": 0.07616,
+      "grad_norm": 0.11711521446704865,
+      "learning_rate": 0.00019641237113402063,
+      "loss": 0.6925,
+      "step": 238
+    },
+    {
+      "epoch": 0.07648,
+      "grad_norm": 0.09948590397834778,
+      "learning_rate": 0.00019637113402061859,
+      "loss": 0.8017,
+      "step": 239
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.11264682561159134,
+      "learning_rate": 0.00019632989690721652,
+      "loss": 0.6871,
+      "step": 240
+    },
+    {
+      "epoch": 0.07712,
+      "grad_norm": 0.12238943576812744,
+      "learning_rate": 0.00019628865979381442,
+      "loss": 0.7113,
+      "step": 241
+    },
+    {
+      "epoch": 0.07744,
+      "grad_norm": 0.11449657380580902,
+      "learning_rate": 0.00019624742268041238,
+      "loss": 0.7144,
+      "step": 242
+    },
+    {
+      "epoch": 0.07776,
+      "grad_norm": 0.12212596088647842,
+      "learning_rate": 0.0001962061855670103,
+      "loss": 0.697,
+      "step": 243
+    },
+    {
+      "epoch": 0.07808,
+      "grad_norm": 0.12231374531984329,
+      "learning_rate": 0.00019616494845360827,
+      "loss": 0.6955,
+      "step": 244
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.11105109006166458,
+      "learning_rate": 0.0001961237113402062,
+      "loss": 0.8251,
+      "step": 245
+    },
+    {
+      "epoch": 0.07872,
+      "grad_norm": 0.12369677424430847,
+      "learning_rate": 0.00019608247422680413,
+      "loss": 0.8814,
+      "step": 246
+    },
+    {
+      "epoch": 0.07904,
+      "grad_norm": 0.12226764857769012,
+      "learning_rate": 0.0001960412371134021,
+      "loss": 0.7408,
+      "step": 247
+    },
+    {
+      "epoch": 0.07936,
+      "grad_norm": 0.10316195338964462,
+      "learning_rate": 0.000196,
+      "loss": 0.792,
+      "step": 248
+    },
+    {
+      "epoch": 0.07968,
+      "grad_norm": 0.12036872655153275,
+      "learning_rate": 0.00019595876288659795,
+      "loss": 0.6,
+      "step": 249
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.1140509843826294,
+      "learning_rate": 0.00019591752577319588,
+      "loss": 0.7599,
+      "step": 250
+    },
+    {
+      "epoch": 0.08032,
+      "grad_norm": 0.14200666546821594,
+      "learning_rate": 0.00019587628865979381,
+      "loss": 0.7723,
+      "step": 251
+    },
+    {
+      "epoch": 0.08064,
+      "grad_norm": 0.11529050767421722,
+      "learning_rate": 0.00019583505154639177,
+      "loss": 0.8305,
+      "step": 252
+    },
+    {
+      "epoch": 0.08096,
+      "grad_norm": 0.11324192583560944,
+      "learning_rate": 0.0001957938144329897,
+      "loss": 0.7913,
+      "step": 253
+    },
+    {
+      "epoch": 0.08128,
+      "grad_norm": 0.12280263006687164,
+      "learning_rate": 0.00019575257731958764,
+      "loss": 0.8388,
+      "step": 254
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.10810542851686478,
+      "learning_rate": 0.00019571134020618557,
+      "loss": 0.7025,
+      "step": 255
+    },
+    {
+      "epoch": 0.08192,
+      "grad_norm": 0.11314940452575684,
+      "learning_rate": 0.0001956701030927835,
+      "loss": 0.6696,
+      "step": 256
+    },
+    {
+      "epoch": 0.08224,
+      "grad_norm": 0.10368701070547104,
+      "learning_rate": 0.00019562886597938146,
+      "loss": 0.581,
+      "step": 257
+    },
+    {
+      "epoch": 0.08256,
+      "grad_norm": 0.11146122217178345,
+      "learning_rate": 0.0001955876288659794,
+      "loss": 0.8221,
+      "step": 258
+    },
+    {
+      "epoch": 0.08288,
+      "grad_norm": 0.11592487245798111,
+      "learning_rate": 0.00019554639175257732,
+      "loss": 0.8082,
+      "step": 259
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.11007897555828094,
+      "learning_rate": 0.00019550515463917528,
+      "loss": 0.8943,
+      "step": 260
+    },
+    {
+      "epoch": 0.08352,
+      "grad_norm": 0.11887793987989426,
+      "learning_rate": 0.0001954639175257732,
+      "loss": 0.7447,
+      "step": 261
+    },
+    {
+      "epoch": 0.08384,
+      "grad_norm": 0.10715821385383606,
+      "learning_rate": 0.00019542268041237114,
+      "loss": 0.6405,
+      "step": 262
+    },
+    {
+      "epoch": 0.08416,
+      "grad_norm": 0.11692032217979431,
+      "learning_rate": 0.00019538144329896907,
+      "loss": 0.863,
+      "step": 263
+    },
+    {
+      "epoch": 0.08448,
+      "grad_norm": 0.11917733401060104,
+      "learning_rate": 0.000195340206185567,
+      "loss": 0.6976,
+      "step": 264
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.1289709061384201,
+      "learning_rate": 0.00019529896907216496,
+      "loss": 0.9149,
+      "step": 265
+    },
+    {
+      "epoch": 0.08512,
+      "grad_norm": 0.10539647191762924,
+      "learning_rate": 0.0001952577319587629,
+      "loss": 0.8449,
+      "step": 266
+    },
+    {
+      "epoch": 0.08544,
+      "grad_norm": 0.10908807814121246,
+      "learning_rate": 0.00019521649484536085,
+      "loss": 0.7162,
+      "step": 267
+    },
+    {
+      "epoch": 0.08576,
+      "grad_norm": 0.10654058307409286,
+      "learning_rate": 0.00019517525773195878,
+      "loss": 0.6285,
+      "step": 268
+    },
+    {
+      "epoch": 0.08608,
+      "grad_norm": 0.111301951110363,
+      "learning_rate": 0.0001951340206185567,
+      "loss": 0.755,
+      "step": 269
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.10679474472999573,
+      "learning_rate": 0.00019509278350515464,
+      "loss": 0.7782,
+      "step": 270
+    },
+    {
+      "epoch": 0.08672,
+      "grad_norm": 0.10793033987283707,
+      "learning_rate": 0.00019505154639175258,
+      "loss": 0.7021,
+      "step": 271
+    },
+    {
+      "epoch": 0.08704,
+      "grad_norm": 0.13048402965068817,
+      "learning_rate": 0.00019501030927835053,
+      "loss": 0.7575,
+      "step": 272
+    },
+    {
+      "epoch": 0.08736,
+      "grad_norm": 0.11437942832708359,
+      "learning_rate": 0.00019496907216494847,
+      "loss": 0.7925,
+      "step": 273
+    },
+    {
+      "epoch": 0.08768,
+      "grad_norm": 0.1390543431043625,
+      "learning_rate": 0.0001949278350515464,
+      "loss": 0.8013,
+      "step": 274
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.1289997398853302,
+      "learning_rate": 0.00019488659793814435,
+      "loss": 0.6053,
+      "step": 275
+    },
+    {
+      "epoch": 0.08832,
+      "grad_norm": 0.09682965278625488,
+      "learning_rate": 0.00019484536082474229,
+      "loss": 0.8756,
+      "step": 276
+    },
+    {
+      "epoch": 0.08864,
+      "grad_norm": 0.12752555310726166,
+      "learning_rate": 0.00019480412371134022,
+      "loss": 0.6314,
+      "step": 277
+    },
+    {
+      "epoch": 0.08896,
+      "grad_norm": 0.11673011630773544,
+      "learning_rate": 0.00019476288659793815,
+      "loss": 0.8842,
+      "step": 278
+    },
+    {
+      "epoch": 0.08928,
+      "grad_norm": 0.108588308095932,
+      "learning_rate": 0.00019472164948453608,
+      "loss": 0.6348,
+      "step": 279
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1195591390132904,
+      "learning_rate": 0.00019468041237113404,
+      "loss": 0.8097,
+      "step": 280
+    },
+    {
+      "epoch": 0.08992,
+      "grad_norm": 0.11936333775520325,
+      "learning_rate": 0.00019463917525773197,
+      "loss": 0.7858,
+      "step": 281
+    },
+    {
+      "epoch": 0.09024,
+      "grad_norm": 0.11627551168203354,
+      "learning_rate": 0.00019459793814432993,
+      "loss": 0.7322,
+      "step": 282
+    },
+    {
+      "epoch": 0.09056,
+      "grad_norm": 0.10832615196704865,
+      "learning_rate": 0.00019455670103092786,
+      "loss": 0.6215,
+      "step": 283
+    },
+    {
+      "epoch": 0.09088,
+      "grad_norm": 0.11125192046165466,
+      "learning_rate": 0.00019451546391752576,
+      "loss": 0.8058,
+      "step": 284
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11165905743837357,
+      "learning_rate": 0.00019447422680412372,
+      "loss": 0.7104,
+      "step": 285
+    },
+    {
+      "epoch": 0.09152,
+      "grad_norm": 0.10910595208406448,
+      "learning_rate": 0.00019443298969072165,
+      "loss": 0.707,
+      "step": 286
+    },
+    {
+      "epoch": 0.09184,
+      "grad_norm": 0.11235277354717255,
+      "learning_rate": 0.0001943917525773196,
+      "loss": 0.8161,
+      "step": 287
+    },
+    {
+      "epoch": 0.09216,
+      "grad_norm": 0.11668789386749268,
+      "learning_rate": 0.00019435051546391754,
+      "loss": 0.6641,
+      "step": 288
+    },
+    {
+      "epoch": 0.09248,
+      "grad_norm": 0.11274772882461548,
+      "learning_rate": 0.00019430927835051547,
+      "loss": 0.6987,
+      "step": 289
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.12356577813625336,
+      "learning_rate": 0.0001942680412371134,
+      "loss": 0.7051,
+      "step": 290
+    },
+    {
+      "epoch": 0.09312,
+      "grad_norm": 0.0987682193517685,
+      "learning_rate": 0.00019422680412371134,
+      "loss": 0.7218,
+      "step": 291
+    },
+    {
+      "epoch": 0.09344,
+      "grad_norm": 0.12354995310306549,
+      "learning_rate": 0.00019418556701030927,
+      "loss": 0.7379,
+      "step": 292
+    },
+    {
+      "epoch": 0.09376,
+      "grad_norm": 0.1022811159491539,
+      "learning_rate": 0.00019414432989690723,
+      "loss": 0.65,
+      "step": 293
+    },
+    {
+      "epoch": 0.09408,
+      "grad_norm": 0.11507881432771683,
+      "learning_rate": 0.00019410309278350516,
+      "loss": 0.8681,
+      "step": 294
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.12279893457889557,
+      "learning_rate": 0.00019406185567010312,
+      "loss": 0.6837,
+      "step": 295
+    },
+    {
+      "epoch": 0.09472,
+      "grad_norm": 0.10548900812864304,
+      "learning_rate": 0.00019402061855670105,
+      "loss": 0.669,
+      "step": 296
+    },
+    {
+      "epoch": 0.09504,
+      "grad_norm": 0.1302396059036255,
+      "learning_rate": 0.00019397938144329898,
+      "loss": 0.7969,
+      "step": 297
+    },
+    {
+      "epoch": 0.09536,
+      "grad_norm": 0.11492603272199631,
+      "learning_rate": 0.0001939381443298969,
+      "loss": 1.0184,
+      "step": 298
+    },
+    {
+      "epoch": 0.09568,
+      "grad_norm": 0.131149023771286,
+      "learning_rate": 0.00019389690721649484,
+      "loss": 0.8521,
+      "step": 299
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.1252460777759552,
+      "learning_rate": 0.0001938556701030928,
+      "loss": 0.8791,
+      "step": 300
+    },
+    {
+      "epoch": 0.09632,
+      "grad_norm": 0.11322636157274246,
+      "learning_rate": 0.00019381443298969073,
+      "loss": 0.7305,
+      "step": 301
+    },
+    {
+      "epoch": 0.09664,
+      "grad_norm": 0.10373307764530182,
+      "learning_rate": 0.00019377319587628866,
+      "loss": 0.7844,
+      "step": 302
+    },
+    {
+      "epoch": 0.09696,
+      "grad_norm": 0.13182489573955536,
+      "learning_rate": 0.00019373195876288662,
+      "loss": 0.6581,
+      "step": 303
+    },
+    {
+      "epoch": 0.09728,
+      "grad_norm": 0.1276710480451584,
+      "learning_rate": 0.00019369072164948455,
+      "loss": 0.6664,
+      "step": 304
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.10770755261182785,
+      "learning_rate": 0.00019364948453608248,
+      "loss": 0.6315,
+      "step": 305
+    },
+    {
+      "epoch": 0.09792,
+      "grad_norm": 0.11755330115556717,
+      "learning_rate": 0.00019360824742268041,
+      "loss": 0.7562,
+      "step": 306
+    },
+    {
+      "epoch": 0.09824,
+      "grad_norm": 0.1252545267343521,
+      "learning_rate": 0.00019356701030927835,
+      "loss": 0.6606,
+      "step": 307
+    },
+    {
+      "epoch": 0.09856,
+      "grad_norm": 0.12284315377473831,
+      "learning_rate": 0.0001935257731958763,
+      "loss": 0.8037,
+      "step": 308
+    },
+    {
+      "epoch": 0.09888,
+      "grad_norm": 0.106113962829113,
+      "learning_rate": 0.00019348453608247424,
+      "loss": 0.6797,
+      "step": 309
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.09285733103752136,
+      "learning_rate": 0.0001934432989690722,
+      "loss": 0.6927,
+      "step": 310
+    },
+    {
+      "epoch": 0.09952,
+      "grad_norm": 0.10903103649616241,
+      "learning_rate": 0.00019340206185567012,
+      "loss": 0.7863,
+      "step": 311
+    },
+    {
+      "epoch": 0.09984,
+      "grad_norm": 0.12015190720558167,
+      "learning_rate": 0.00019336082474226806,
+      "loss": 0.6961,
+      "step": 312
+    },
+    {
+      "epoch": 0.10016,
+      "grad_norm": 0.12260061502456665,
+      "learning_rate": 0.000193319587628866,
+      "loss": 0.7965,
+      "step": 313
+    },
+    {
+      "epoch": 0.10048,
+      "grad_norm": 0.10297476500272751,
+      "learning_rate": 0.00019327835051546392,
+      "loss": 0.6935,
+      "step": 314
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.12614290416240692,
+      "learning_rate": 0.00019323711340206188,
+      "loss": 0.8751,
+      "step": 315
+    },
+    {
+      "epoch": 0.10112,
+      "grad_norm": 0.10241006314754486,
+      "learning_rate": 0.0001931958762886598,
+      "loss": 0.5642,
+      "step": 316
+    },
+    {
+      "epoch": 0.10144,
+      "grad_norm": 0.1053333505988121,
+      "learning_rate": 0.00019315463917525774,
+      "loss": 0.6425,
+      "step": 317
+    },
+    {
+      "epoch": 0.10176,
+      "grad_norm": 0.1024412289261818,
+      "learning_rate": 0.0001931134020618557,
+      "loss": 0.6414,
+      "step": 318
+    },
+    {
+      "epoch": 0.10208,
+      "grad_norm": 0.10628654062747955,
+      "learning_rate": 0.0001930721649484536,
+      "loss": 0.8259,
+      "step": 319
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.1291695237159729,
+      "learning_rate": 0.00019303092783505153,
+      "loss": 0.6887,
+      "step": 320
+    },
+    {
+      "epoch": 0.10272,
+      "grad_norm": 0.11315542459487915,
+      "learning_rate": 0.0001929896907216495,
+      "loss": 0.6316,
+      "step": 321
+    },
+    {
+      "epoch": 0.10304,
+      "grad_norm": 0.11541493982076645,
+      "learning_rate": 0.00019294845360824742,
+      "loss": 0.8023,
+      "step": 322
+    },
+    {
+      "epoch": 0.10336,
+      "grad_norm": 0.09769251197576523,
+      "learning_rate": 0.00019290721649484538,
+      "loss": 0.674,
+      "step": 323
+    },
+    {
+      "epoch": 0.10368,
+      "grad_norm": 0.10549357533454895,
+      "learning_rate": 0.0001928659793814433,
+      "loss": 0.7102,
+      "step": 324
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1344941407442093,
+      "learning_rate": 0.00019282474226804124,
+      "loss": 0.8861,
+      "step": 325
+    },
+    {
+      "epoch": 0.10432,
+      "grad_norm": 0.13625279068946838,
+      "learning_rate": 0.00019278350515463918,
+      "loss": 0.8333,
+      "step": 326
+    },
+    {
+      "epoch": 0.10464,
+      "grad_norm": 0.1136624813079834,
+      "learning_rate": 0.0001927422680412371,
+      "loss": 0.7187,
+      "step": 327
+    },
+    {
+      "epoch": 0.10496,
+      "grad_norm": 0.12351350486278534,
+      "learning_rate": 0.00019270103092783506,
+      "loss": 0.6361,
+      "step": 328
+    },
+    {
+      "epoch": 0.10528,
+      "grad_norm": 0.12927868962287903,
+      "learning_rate": 0.000192659793814433,
+      "loss": 0.6716,
+      "step": 329
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.11619272083044052,
+      "learning_rate": 0.00019261855670103093,
+      "loss": 0.8108,
+      "step": 330
+    },
+    {
+      "epoch": 0.10592,
+      "grad_norm": 0.12765595316886902,
+      "learning_rate": 0.00019257731958762889,
+      "loss": 0.77,
+      "step": 331
+    },
+    {
+      "epoch": 0.10624,
+      "grad_norm": 0.13606730103492737,
+      "learning_rate": 0.00019253608247422682,
+      "loss": 0.7572,
+      "step": 332
+    },
+    {
+      "epoch": 0.10656,
+      "grad_norm": 0.1074293777346611,
+      "learning_rate": 0.00019249484536082475,
+      "loss": 0.8137,
+      "step": 333
+    },
+    {
+      "epoch": 0.10688,
+      "grad_norm": 0.12371189147233963,
+      "learning_rate": 0.00019245360824742268,
+      "loss": 0.7414,
+      "step": 334
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1253589391708374,
+      "learning_rate": 0.0001924123711340206,
+      "loss": 0.6796,
+      "step": 335
+    },
+    {
+      "epoch": 0.10752,
+      "grad_norm": 0.11865826696157455,
+      "learning_rate": 0.00019237113402061857,
+      "loss": 0.7051,
+      "step": 336
+    },
+    {
+      "epoch": 0.10784,
+      "grad_norm": 0.12180513888597488,
+      "learning_rate": 0.0001923298969072165,
+      "loss": 0.8126,
+      "step": 337
+    },
+    {
+      "epoch": 0.10816,
+      "grad_norm": 0.11744590848684311,
+      "learning_rate": 0.00019228865979381446,
+      "loss": 0.7553,
+      "step": 338
+    },
+    {
+      "epoch": 0.10848,
+      "grad_norm": 0.11796604096889496,
+      "learning_rate": 0.0001922474226804124,
+      "loss": 0.814,
+      "step": 339
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.125954270362854,
+      "learning_rate": 0.00019220618556701032,
+      "loss": 0.64,
+      "step": 340
+    },
+    {
+      "epoch": 0.10912,
+      "grad_norm": 0.12020507454872131,
+      "learning_rate": 0.00019216494845360825,
+      "loss": 0.6175,
+      "step": 341
+    },
+    {
+      "epoch": 0.10944,
+      "grad_norm": 0.10076280683279037,
+      "learning_rate": 0.00019212371134020618,
+      "loss": 0.6176,
+      "step": 342
+    },
+    {
+      "epoch": 0.10976,
+      "grad_norm": 0.11676807701587677,
+      "learning_rate": 0.00019208247422680414,
+      "loss": 0.677,
+      "step": 343
+    },
+    {
+      "epoch": 0.11008,
+      "grad_norm": 0.11364617198705673,
+      "learning_rate": 0.00019204123711340207,
+      "loss": 0.5619,
+      "step": 344
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.10261007398366928,
+      "learning_rate": 0.000192,
+      "loss": 0.614,
+      "step": 345
+    },
+    {
+      "epoch": 0.11072,
+      "grad_norm": 0.09645744413137436,
+      "learning_rate": 0.00019195876288659796,
+      "loss": 0.7274,
+      "step": 346
+    },
+    {
+      "epoch": 0.11104,
+      "grad_norm": 0.12451738864183426,
+      "learning_rate": 0.0001919175257731959,
+      "loss": 0.8735,
+      "step": 347
+    },
+    {
+      "epoch": 0.11136,
+      "grad_norm": 0.11062555015087128,
+      "learning_rate": 0.00019187628865979383,
+      "loss": 0.6268,
+      "step": 348
+    },
+    {
+      "epoch": 0.11168,
+      "grad_norm": 0.1201324462890625,
+      "learning_rate": 0.00019183505154639176,
+      "loss": 0.793,
+      "step": 349
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.12797708809375763,
+      "learning_rate": 0.0001917938144329897,
+      "loss": 0.6075,
+      "step": 350
+    },
+    {
+      "epoch": 0.11232,
+      "grad_norm": 0.11388446390628815,
+      "learning_rate": 0.00019175257731958765,
+      "loss": 0.7343,
+      "step": 351
+    },
+    {
+      "epoch": 0.11264,
+      "grad_norm": 0.11379895359277725,
+      "learning_rate": 0.00019171134020618558,
+      "loss": 0.6767,
+      "step": 352
+    },
+    {
+      "epoch": 0.11296,
+      "grad_norm": 0.11296231299638748,
+      "learning_rate": 0.0001916701030927835,
+      "loss": 0.7473,
+      "step": 353
+    },
+    {
+      "epoch": 0.11328,
+      "grad_norm": 0.1344781517982483,
+      "learning_rate": 0.00019162886597938147,
+      "loss": 0.6533,
+      "step": 354
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.12438254803419113,
+      "learning_rate": 0.00019158762886597937,
+      "loss": 0.7511,
+      "step": 355
+    },
+    {
+      "epoch": 0.11392,
+      "grad_norm": 0.11343064159154892,
+      "learning_rate": 0.00019154639175257733,
+      "loss": 0.6876,
+      "step": 356
+    },
+    {
+      "epoch": 0.11424,
+      "grad_norm": 0.11751387268304825,
+      "learning_rate": 0.00019150515463917526,
+      "loss": 0.7198,
+      "step": 357
+    },
+    {
+      "epoch": 0.11456,
+      "grad_norm": 0.1194872185587883,
+      "learning_rate": 0.0001914639175257732,
+      "loss": 0.6149,
+      "step": 358
+    },
+    {
+      "epoch": 0.11488,
+      "grad_norm": 0.11200791597366333,
+      "learning_rate": 0.00019142268041237115,
+      "loss": 0.8605,
+      "step": 359
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.11708471924066544,
+      "learning_rate": 0.00019138144329896908,
+      "loss": 0.6603,
+      "step": 360
+    },
+    {
+      "epoch": 0.11552,
+      "grad_norm": 0.116248220205307,
+      "learning_rate": 0.00019134020618556704,
+      "loss": 0.7141,
+      "step": 361
+    },
+    {
+      "epoch": 0.11584,
+      "grad_norm": 0.11516683548688889,
+      "learning_rate": 0.00019129896907216494,
+      "loss": 0.7316,
+      "step": 362
+    },
+    {
+      "epoch": 0.11616,
+      "grad_norm": 0.10961434990167618,
+      "learning_rate": 0.00019125773195876288,
+      "loss": 0.6173,
+      "step": 363
+    },
+    {
+      "epoch": 0.11648,
+      "grad_norm": 0.12037073075771332,
+      "learning_rate": 0.00019121649484536083,
+      "loss": 0.6659,
+      "step": 364
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.12053418904542923,
+      "learning_rate": 0.00019117525773195877,
+      "loss": 0.6544,
+      "step": 365
+    },
+    {
+      "epoch": 0.11712,
+      "grad_norm": 0.12387484312057495,
+      "learning_rate": 0.00019113402061855672,
+      "loss": 0.6406,
+      "step": 366
+    },
+    {
+      "epoch": 0.11744,
+      "grad_norm": 0.12213553488254547,
+      "learning_rate": 0.00019109278350515466,
+      "loss": 0.6592,
+      "step": 367
+    },
+    {
+      "epoch": 0.11776,
+      "grad_norm": 0.1237708106637001,
+      "learning_rate": 0.0001910515463917526,
+      "loss": 0.779,
+      "step": 368
+    },
+    {
+      "epoch": 0.11808,
+      "grad_norm": 0.10254320502281189,
+      "learning_rate": 0.00019101030927835052,
+      "loss": 0.7242,
+      "step": 369
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.11179213225841522,
+      "learning_rate": 0.00019096907216494845,
+      "loss": 0.765,
+      "step": 370
+    },
+    {
+      "epoch": 0.11872,
+      "grad_norm": 0.11824121326208115,
+      "learning_rate": 0.0001909278350515464,
+      "loss": 0.6971,
+      "step": 371
+    },
+    {
+      "epoch": 0.11904,
+      "grad_norm": 0.13179510831832886,
+      "learning_rate": 0.00019088659793814434,
+      "loss": 0.7179,
+      "step": 372
+    },
+    {
+      "epoch": 0.11936,
+      "grad_norm": 0.1161201074719429,
+      "learning_rate": 0.00019084536082474227,
+      "loss": 0.6249,
+      "step": 373
+    },
+    {
+      "epoch": 0.11968,
+      "grad_norm": 0.12930230796337128,
+      "learning_rate": 0.00019080412371134023,
+      "loss": 0.6636,
+      "step": 374
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.12602809071540833,
+      "learning_rate": 0.00019076288659793816,
+      "loss": 0.7044,
+      "step": 375
+    },
+    {
+      "epoch": 0.12032,
+      "grad_norm": 0.1393459141254425,
+      "learning_rate": 0.0001907216494845361,
+      "loss": 0.6518,
+      "step": 376
+    },
+    {
+      "epoch": 0.12064,
+      "grad_norm": 0.11460117250680923,
+      "learning_rate": 0.00019068041237113402,
+      "loss": 0.7716,
+      "step": 377
+    },
+    {
+      "epoch": 0.12096,
+      "grad_norm": 0.14296174049377441,
+      "learning_rate": 0.00019063917525773195,
+      "loss": 0.7899,
+      "step": 378
+    },
+    {
+      "epoch": 0.12128,
+      "grad_norm": 0.14445006847381592,
+      "learning_rate": 0.0001905979381443299,
+      "loss": 0.7617,
+      "step": 379
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.12754163146018982,
+      "learning_rate": 0.00019055670103092784,
+      "loss": 0.7888,
+      "step": 380
+    },
+    {
+      "epoch": 0.12192,
+      "grad_norm": 0.11828102916479111,
+      "learning_rate": 0.00019051546391752577,
+      "loss": 0.6641,
+      "step": 381
+    },
+    {
+      "epoch": 0.12224,
+      "grad_norm": 0.11433565616607666,
+      "learning_rate": 0.00019047422680412373,
+      "loss": 0.5984,
+      "step": 382
+    },
+    {
+      "epoch": 0.12256,
+      "grad_norm": 0.11538711935281754,
+      "learning_rate": 0.00019043298969072166,
+      "loss": 0.7464,
+      "step": 383
+    },
+    {
+      "epoch": 0.12288,
+      "grad_norm": 0.10493431240320206,
+      "learning_rate": 0.0001903917525773196,
+      "loss": 0.7104,
+      "step": 384
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.13114306330680847,
+      "learning_rate": 0.00019035051546391753,
+      "loss": 0.9104,
+      "step": 385
+    },
+    {
+      "epoch": 0.12352,
+      "grad_norm": 0.11584331095218658,
+      "learning_rate": 0.00019030927835051546,
+      "loss": 0.7205,
+      "step": 386
+    },
+    {
+      "epoch": 0.12384,
+      "grad_norm": 0.1123526394367218,
+      "learning_rate": 0.00019026804123711342,
+      "loss": 0.7061,
+      "step": 387
+    },
+    {
+      "epoch": 0.12416,
+      "grad_norm": 0.11471172422170639,
+      "learning_rate": 0.00019022680412371135,
+      "loss": 0.7491,
+      "step": 388
+    },
+    {
+      "epoch": 0.12448,
+      "grad_norm": 0.1361629068851471,
+      "learning_rate": 0.0001901855670103093,
+      "loss": 0.8168,
+      "step": 389
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1272808462381363,
+      "learning_rate": 0.00019014432989690724,
+      "loss": 0.6907,
+      "step": 390
+    },
+    {
+      "epoch": 0.12512,
+      "grad_norm": 0.12672847509384155,
+      "learning_rate": 0.00019010309278350514,
+      "loss": 0.7511,
+      "step": 391
+    },
+    {
+      "epoch": 0.12544,
+      "grad_norm": 0.12205267697572708,
+      "learning_rate": 0.0001900618556701031,
+      "loss": 0.8164,
+      "step": 392
+    },
+    {
+      "epoch": 0.12576,
+      "grad_norm": 0.12806954979896545,
+      "learning_rate": 0.00019002061855670103,
+      "loss": 0.8247,
+      "step": 393
+    },
+    {
+      "epoch": 0.12608,
+      "grad_norm": 0.11856722086668015,
+      "learning_rate": 0.000189979381443299,
+      "loss": 0.7324,
+      "step": 394
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.11742661893367767,
+      "learning_rate": 0.00018993814432989692,
+      "loss": 0.8151,
+      "step": 395
+    },
+    {
+      "epoch": 0.12672,
+      "grad_norm": 0.11036640405654907,
+      "learning_rate": 0.00018989690721649485,
+      "loss": 0.5336,
+      "step": 396
+    },
+    {
+      "epoch": 0.12704,
+      "grad_norm": 0.11546391993761063,
+      "learning_rate": 0.0001898556701030928,
+      "loss": 0.8078,
+      "step": 397
+    },
+    {
+      "epoch": 0.12736,
+      "grad_norm": 0.11293994635343552,
+      "learning_rate": 0.00018981443298969071,
+      "loss": 0.7818,
+      "step": 398
+    },
+    {
+      "epoch": 0.12768,
+      "grad_norm": 0.11465566605329514,
+      "learning_rate": 0.00018977319587628867,
+      "loss": 0.8291,
+      "step": 399
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1251228302717209,
+      "learning_rate": 0.0001897319587628866,
+      "loss": 0.7199,
+      "step": 400
+    },
+    {
+      "epoch": 0.12832,
+      "grad_norm": 0.11204738169908524,
+      "learning_rate": 0.00018969072164948454,
+      "loss": 0.7193,
+      "step": 401
+    },
+    {
+      "epoch": 0.12864,
+      "grad_norm": 0.1072632372379303,
+      "learning_rate": 0.0001896494845360825,
+      "loss": 0.7763,
+      "step": 402
+    },
+    {
+      "epoch": 0.12896,
+      "grad_norm": 0.11516649276018143,
+      "learning_rate": 0.00018960824742268043,
+      "loss": 0.6853,
+      "step": 403
+    },
+    {
+      "epoch": 0.12928,
+      "grad_norm": 0.1176360473036766,
+      "learning_rate": 0.00018956701030927836,
+      "loss": 0.6184,
+      "step": 404
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.11547468602657318,
+      "learning_rate": 0.0001895257731958763,
+      "loss": 0.8048,
+      "step": 405
+    },
+    {
+      "epoch": 0.12992,
+      "grad_norm": 0.11308492720127106,
+      "learning_rate": 0.00018948453608247422,
+      "loss": 0.7213,
+      "step": 406
+    },
+    {
+      "epoch": 0.13024,
+      "grad_norm": 0.11973299831151962,
+      "learning_rate": 0.00018944329896907218,
+      "loss": 0.8809,
+      "step": 407
+    },
+    {
+      "epoch": 0.13056,
+      "grad_norm": 0.1213008239865303,
+      "learning_rate": 0.0001894020618556701,
+      "loss": 0.7794,
+      "step": 408
+    },
+    {
+      "epoch": 0.13088,
+      "grad_norm": 0.13353341817855835,
+      "learning_rate": 0.00018936082474226804,
+      "loss": 0.6674,
+      "step": 409
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.1155238002538681,
+      "learning_rate": 0.000189319587628866,
+      "loss": 0.7668,
+      "step": 410
+    },
+    {
+      "epoch": 0.13152,
+      "grad_norm": 0.13428455591201782,
+      "learning_rate": 0.00018927835051546393,
+      "loss": 0.6864,
+      "step": 411
+    },
+    {
+      "epoch": 0.13184,
+      "grad_norm": 0.13777495920658112,
+      "learning_rate": 0.00018923711340206186,
+      "loss": 0.8271,
+      "step": 412
+    },
+    {
+      "epoch": 0.13216,
+      "grad_norm": 0.12846726179122925,
+      "learning_rate": 0.0001891958762886598,
+      "loss": 0.9027,
+      "step": 413
+    },
+    {
+      "epoch": 0.13248,
+      "grad_norm": 0.11394570022821426,
+      "learning_rate": 0.00018915463917525772,
+      "loss": 0.7531,
+      "step": 414
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.11797945201396942,
+      "learning_rate": 0.00018911340206185568,
+      "loss": 0.7393,
+      "step": 415
+    },
+    {
+      "epoch": 0.13312,
+      "grad_norm": 0.11778296530246735,
+      "learning_rate": 0.0001890721649484536,
+      "loss": 0.8172,
+      "step": 416
+    },
+    {
+      "epoch": 0.13344,
+      "grad_norm": 0.12857432663440704,
+      "learning_rate": 0.00018903092783505157,
+      "loss": 0.5693,
+      "step": 417
+    },
+    {
+      "epoch": 0.13376,
+      "grad_norm": 0.1395915150642395,
+      "learning_rate": 0.0001889896907216495,
+      "loss": 0.7187,
+      "step": 418
+    },
+    {
+      "epoch": 0.13408,
+      "grad_norm": 0.12571577727794647,
+      "learning_rate": 0.00018894845360824743,
+      "loss": 0.7347,
+      "step": 419
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.11062180995941162,
+      "learning_rate": 0.00018890721649484537,
+      "loss": 0.8797,
+      "step": 420
+    },
+    {
+      "epoch": 0.13472,
+      "grad_norm": 0.12011471390724182,
+      "learning_rate": 0.0001888659793814433,
+      "loss": 0.7638,
+      "step": 421
+    },
+    {
+      "epoch": 0.13504,
+      "grad_norm": 0.11402884870767593,
+      "learning_rate": 0.00018882474226804126,
+      "loss": 0.6198,
+      "step": 422
+    },
+    {
+      "epoch": 0.13536,
+      "grad_norm": 0.11654789745807648,
+      "learning_rate": 0.00018878350515463919,
+      "loss": 0.8276,
+      "step": 423
+    },
+    {
+      "epoch": 0.13568,
+      "grad_norm": 0.110030896961689,
+      "learning_rate": 0.00018874226804123712,
+      "loss": 0.7828,
+      "step": 424
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.1173168271780014,
+      "learning_rate": 0.00018870103092783508,
+      "loss": 0.6445,
+      "step": 425
+    },
+    {
+      "epoch": 0.13632,
+      "grad_norm": 0.11195468157529831,
+      "learning_rate": 0.000188659793814433,
+      "loss": 0.7336,
+      "step": 426
+    },
+    {
+      "epoch": 0.13664,
+      "grad_norm": 0.1138012707233429,
+      "learning_rate": 0.00018861855670103094,
+      "loss": 0.7085,
+      "step": 427
+    },
+    {
+      "epoch": 0.13696,
+      "grad_norm": 0.11757896840572357,
+      "learning_rate": 0.00018857731958762887,
+      "loss": 0.593,
+      "step": 428
+    },
+    {
+      "epoch": 0.13728,
+      "grad_norm": 0.13230633735656738,
+      "learning_rate": 0.0001885360824742268,
+      "loss": 0.705,
+      "step": 429
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.13503113389015198,
+      "learning_rate": 0.00018849484536082476,
+      "loss": 0.7134,
+      "step": 430
+    },
+    {
+      "epoch": 0.13792,
+      "grad_norm": 0.11426805704832077,
+      "learning_rate": 0.0001884536082474227,
+      "loss": 0.6781,
+      "step": 431
+    },
+    {
+      "epoch": 0.13824,
+      "grad_norm": 0.11934298276901245,
+      "learning_rate": 0.00018841237113402065,
+      "loss": 0.7727,
+      "step": 432
+    },
+    {
+      "epoch": 0.13856,
+      "grad_norm": 0.12700718641281128,
+      "learning_rate": 0.00018837113402061858,
+      "loss": 0.7153,
+      "step": 433
+    },
+    {
+      "epoch": 0.13888,
+      "grad_norm": 0.15089921653270721,
+      "learning_rate": 0.00018832989690721648,
+      "loss": 0.7326,
+      "step": 434
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.10799750685691833,
+      "learning_rate": 0.00018828865979381444,
+      "loss": 0.7738,
+      "step": 435
+    },
+    {
+      "epoch": 0.13952,
+      "grad_norm": 0.1076594665646553,
+      "learning_rate": 0.00018824742268041237,
+      "loss": 0.545,
+      "step": 436
+    },
+    {
+      "epoch": 0.13984,
+      "grad_norm": 0.1204753890633583,
+      "learning_rate": 0.00018820618556701033,
+      "loss": 0.73,
+      "step": 437
+    },
+    {
+      "epoch": 0.14016,
+      "grad_norm": 0.11598300188779831,
+      "learning_rate": 0.00018816494845360826,
+      "loss": 0.635,
+      "step": 438
+    },
+    {
+      "epoch": 0.14048,
+      "grad_norm": 0.11010400205850601,
+      "learning_rate": 0.0001881237113402062,
+      "loss": 0.7891,
+      "step": 439
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.12184228003025055,
+      "learning_rate": 0.00018808247422680413,
+      "loss": 0.7332,
+      "step": 440
+    },
+    {
+      "epoch": 0.14112,
+      "grad_norm": 0.11696261167526245,
+      "learning_rate": 0.00018804123711340206,
+      "loss": 0.8299,
+      "step": 441
+    },
+    {
+      "epoch": 0.14144,
+      "grad_norm": 0.11464802920818329,
+      "learning_rate": 0.000188,
+      "loss": 0.7107,
+      "step": 442
+    },
+    {
+      "epoch": 0.14176,
+      "grad_norm": 0.12120334804058075,
+      "learning_rate": 0.00018795876288659795,
+      "loss": 0.7729,
+      "step": 443
+    },
+    {
+      "epoch": 0.14208,
+      "grad_norm": 0.11571822315454483,
+      "learning_rate": 0.00018791752577319588,
+      "loss": 0.6551,
+      "step": 444
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.11872228980064392,
+      "learning_rate": 0.00018787628865979384,
+      "loss": 0.698,
+      "step": 445
+    },
+    {
+      "epoch": 0.14272,
+      "grad_norm": 0.12505149841308594,
+      "learning_rate": 0.00018783505154639177,
+      "loss": 0.751,
+      "step": 446
+    },
+    {
+      "epoch": 0.14304,
+      "grad_norm": 0.10618813335895538,
+      "learning_rate": 0.0001877938144329897,
+      "loss": 0.7299,
+      "step": 447
+    },
+    {
+      "epoch": 0.14336,
+      "grad_norm": 0.11824540048837662,
+      "learning_rate": 0.00018775257731958763,
+      "loss": 0.6796,
+      "step": 448
+    },
+    {
+      "epoch": 0.14368,
+      "grad_norm": 0.10695328563451767,
+      "learning_rate": 0.00018771134020618556,
+      "loss": 0.7424,
+      "step": 449
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.12764234840869904,
+      "learning_rate": 0.00018767010309278352,
+      "loss": 0.7301,
+      "step": 450
+    },
+    {
+      "epoch": 0.14432,
+      "grad_norm": 0.12354765832424164,
+      "learning_rate": 0.00018762886597938145,
+      "loss": 0.7945,
+      "step": 451
+    },
+    {
+      "epoch": 0.14464,
+      "grad_norm": 0.12733551859855652,
+      "learning_rate": 0.00018758762886597938,
+      "loss": 0.7565,
+      "step": 452
+    },
+    {
+      "epoch": 0.14496,
+      "grad_norm": 0.1533960998058319,
+      "learning_rate": 0.00018754639175257734,
+      "loss": 0.7964,
+      "step": 453
+    },
+    {
+      "epoch": 0.14528,
+      "grad_norm": 0.11748246848583221,
+      "learning_rate": 0.00018750515463917527,
+      "loss": 0.7715,
+      "step": 454
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.12309489399194717,
+      "learning_rate": 0.0001874639175257732,
+      "loss": 0.7758,
+      "step": 455
+    },
+    {
+      "epoch": 0.14592,
+      "grad_norm": 0.10743677616119385,
+      "learning_rate": 0.00018742268041237114,
+      "loss": 0.8084,
+      "step": 456
+    },
+    {
+      "epoch": 0.14624,
+      "grad_norm": 0.15399467945098877,
+      "learning_rate": 0.00018738144329896907,
+      "loss": 0.7689,
+      "step": 457
+    },
+    {
+      "epoch": 0.14656,
+      "grad_norm": 0.1381152868270874,
+      "learning_rate": 0.00018734020618556702,
+      "loss": 0.7902,
+      "step": 458
+    },
+    {
+      "epoch": 0.14688,
+      "grad_norm": 0.12086453288793564,
+      "learning_rate": 0.00018729896907216496,
+      "loss": 0.739,
+      "step": 459
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.14193426072597504,
+      "learning_rate": 0.00018725773195876291,
+      "loss": 0.8245,
+      "step": 460
+    },
+    {
+      "epoch": 0.14752,
+      "grad_norm": 0.1121336966753006,
+      "learning_rate": 0.00018721649484536085,
+      "loss": 0.6877,
+      "step": 461
+    },
+    {
+      "epoch": 0.14784,
+      "grad_norm": 0.12167156487703323,
+      "learning_rate": 0.00018717525773195878,
+      "loss": 0.7017,
+      "step": 462
+    },
+    {
+      "epoch": 0.14816,
+      "grad_norm": 0.1287817507982254,
+      "learning_rate": 0.0001871340206185567,
+      "loss": 0.6229,
+      "step": 463
+    },
+    {
+      "epoch": 0.14848,
+      "grad_norm": 0.128828227519989,
+      "learning_rate": 0.00018709278350515464,
+      "loss": 0.7917,
+      "step": 464
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.13771773874759674,
+      "learning_rate": 0.0001870515463917526,
+      "loss": 0.6883,
+      "step": 465
+    },
+    {
+      "epoch": 0.14912,
+      "grad_norm": 0.10570216923952103,
+      "learning_rate": 0.00018701030927835053,
+      "loss": 0.7791,
+      "step": 466
+    },
+    {
+      "epoch": 0.14944,
+      "grad_norm": 0.1250445395708084,
+      "learning_rate": 0.00018696907216494846,
+      "loss": 0.7435,
+      "step": 467
+    },
+    {
+      "epoch": 0.14976,
+      "grad_norm": 0.1064952090382576,
+      "learning_rate": 0.00018692783505154642,
+      "loss": 0.7621,
+      "step": 468
+    },
+    {
+      "epoch": 0.15008,
+      "grad_norm": 0.10514791309833527,
+      "learning_rate": 0.00018688659793814432,
+      "loss": 0.6403,
+      "step": 469
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.10278081893920898,
+      "learning_rate": 0.00018684536082474225,
+      "loss": 0.8246,
+      "step": 470
+    },
+    {
+      "epoch": 0.15072,
+      "grad_norm": 0.11813073605298996,
+      "learning_rate": 0.0001868041237113402,
+      "loss": 0.8403,
+      "step": 471
+    },
+    {
+      "epoch": 0.15104,
+      "grad_norm": 0.10654576867818832,
+      "learning_rate": 0.00018676288659793814,
+      "loss": 0.7483,
+      "step": 472
+    },
+    {
+      "epoch": 0.15136,
+      "grad_norm": 0.11642228811979294,
+      "learning_rate": 0.0001867216494845361,
+      "loss": 0.6114,
+      "step": 473
+    },
+    {
+      "epoch": 0.15168,
+      "grad_norm": 0.12933100759983063,
+      "learning_rate": 0.00018668041237113403,
+      "loss": 0.7902,
+      "step": 474
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.1200195848941803,
+      "learning_rate": 0.00018663917525773196,
+      "loss": 0.7298,
+      "step": 475
+    },
+    {
+      "epoch": 0.15232,
+      "grad_norm": 0.10747142881155014,
+      "learning_rate": 0.0001865979381443299,
+      "loss": 0.7376,
+      "step": 476
+    },
+    {
+      "epoch": 0.15264,
+      "grad_norm": 0.10868332535028458,
+      "learning_rate": 0.00018655670103092783,
+      "loss": 0.7013,
+      "step": 477
+    },
+    {
+      "epoch": 0.15296,
+      "grad_norm": 0.12750379741191864,
+      "learning_rate": 0.00018651546391752579,
+      "loss": 0.7718,
+      "step": 478
+    },
+    {
+      "epoch": 0.15328,
+      "grad_norm": 0.10860800743103027,
+      "learning_rate": 0.00018647422680412372,
+      "loss": 0.5887,
+      "step": 479
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.11445030570030212,
+      "learning_rate": 0.00018643298969072165,
+      "loss": 0.7955,
+      "step": 480
+    },
+    {
+      "epoch": 0.15392,
+      "grad_norm": 0.11774427443742752,
+      "learning_rate": 0.0001863917525773196,
+      "loss": 0.7095,
+      "step": 481
+    },
+    {
+      "epoch": 0.15424,
+      "grad_norm": 0.13853546977043152,
+      "learning_rate": 0.00018635051546391754,
+      "loss": 0.9395,
+      "step": 482
+    },
+    {
+      "epoch": 0.15456,
+      "grad_norm": 0.120772585272789,
+      "learning_rate": 0.00018630927835051547,
+      "loss": 0.8,
+      "step": 483
+    },
+    {
+      "epoch": 0.15488,
+      "grad_norm": 0.13263949751853943,
+      "learning_rate": 0.0001862680412371134,
+      "loss": 0.9118,
+      "step": 484
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.12179119139909744,
+      "learning_rate": 0.00018622680412371133,
+      "loss": 0.6567,
+      "step": 485
+    },
+    {
+      "epoch": 0.15552,
+      "grad_norm": 0.1244128867983818,
+      "learning_rate": 0.0001861855670103093,
+      "loss": 0.7702,
+      "step": 486
+    },
+    {
+      "epoch": 0.15584,
+      "grad_norm": 0.12456309050321579,
+      "learning_rate": 0.00018614432989690722,
+      "loss": 0.7687,
+      "step": 487
+    },
+    {
+      "epoch": 0.15616,
+      "grad_norm": 0.12738095223903656,
+      "learning_rate": 0.00018610309278350518,
+      "loss": 0.7489,
+      "step": 488
+    },
+    {
+      "epoch": 0.15648,
+      "grad_norm": 0.11148428916931152,
+      "learning_rate": 0.0001860618556701031,
+      "loss": 0.7362,
+      "step": 489
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.1029975637793541,
+      "learning_rate": 0.00018602061855670104,
+      "loss": 0.5808,
+      "step": 490
+    },
+    {
+      "epoch": 0.15712,
+      "grad_norm": 0.12028798460960388,
+      "learning_rate": 0.00018597938144329897,
+      "loss": 0.8541,
+      "step": 491
+    },
+    {
+      "epoch": 0.15744,
+      "grad_norm": 0.12013960629701614,
+      "learning_rate": 0.0001859381443298969,
+      "loss": 0.6814,
+      "step": 492
+    },
+    {
+      "epoch": 0.15776,
+      "grad_norm": 0.12026184052228928,
+      "learning_rate": 0.00018589690721649486,
+      "loss": 0.8444,
+      "step": 493
+    },
+    {
+      "epoch": 0.15808,
+      "grad_norm": 0.11788841336965561,
+      "learning_rate": 0.0001858556701030928,
+      "loss": 0.5718,
+      "step": 494
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.11122492700815201,
+      "learning_rate": 0.00018581443298969073,
+      "loss": 0.5172,
+      "step": 495
+    },
+    {
+      "epoch": 0.15872,
+      "grad_norm": 0.12228988111019135,
+      "learning_rate": 0.00018577319587628868,
+      "loss": 0.724,
+      "step": 496
+    },
+    {
+      "epoch": 0.15904,
+      "grad_norm": 0.12170293927192688,
+      "learning_rate": 0.00018573195876288662,
+      "loss": 0.6863,
+      "step": 497
+    },
+    {
+      "epoch": 0.15936,
+      "grad_norm": 0.12947559356689453,
+      "learning_rate": 0.00018569072164948455,
+      "loss": 0.897,
+      "step": 498
+    },
+    {
+      "epoch": 0.15968,
+      "grad_norm": 0.13217338919639587,
+      "learning_rate": 0.00018564948453608248,
+      "loss": 0.7,
+      "step": 499
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12307247519493103,
+      "learning_rate": 0.0001856082474226804,
+      "loss": 0.6686,
+      "step": 500
+    },
+    {
+      "epoch": 0.16032,
+      "grad_norm": 0.12643639743328094,
+      "learning_rate": 0.00018556701030927837,
+      "loss": 0.6166,
+      "step": 501
+    },
+    {
+      "epoch": 0.16064,
+      "grad_norm": 0.10645804554224014,
+      "learning_rate": 0.0001855257731958763,
+      "loss": 0.785,
+      "step": 502
+    },
+    {
+      "epoch": 0.16096,
+      "grad_norm": 0.12195251882076263,
+      "learning_rate": 0.00018548453608247423,
+      "loss": 0.6931,
+      "step": 503
+    },
+    {
+      "epoch": 0.16128,
+      "grad_norm": 0.10586483776569366,
+      "learning_rate": 0.0001854432989690722,
+      "loss": 0.6172,
+      "step": 504
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.12337654829025269,
+      "learning_rate": 0.0001854020618556701,
+      "loss": 0.7811,
+      "step": 505
+    },
+    {
+      "epoch": 0.16192,
+      "grad_norm": 0.11356624215841293,
+      "learning_rate": 0.00018536082474226805,
+      "loss": 0.7117,
+      "step": 506
+    },
+    {
+      "epoch": 0.16224,
+      "grad_norm": 0.11793960630893707,
+      "learning_rate": 0.00018531958762886598,
+      "loss": 0.7778,
+      "step": 507
+    },
+    {
+      "epoch": 0.16256,
+      "grad_norm": 0.12677061557769775,
+      "learning_rate": 0.00018527835051546391,
+      "loss": 0.6465,
+      "step": 508
+    },
+    {
+      "epoch": 0.16288,
+      "grad_norm": 0.12064032256603241,
+      "learning_rate": 0.00018523711340206187,
+      "loss": 0.7658,
+      "step": 509
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.12942875921726227,
+      "learning_rate": 0.0001851958762886598,
+      "loss": 0.7735,
+      "step": 510
+    },
+    {
+      "epoch": 0.16352,
+      "grad_norm": 0.1350182294845581,
+      "learning_rate": 0.00018515463917525776,
+      "loss": 0.7512,
+      "step": 511
+    },
+    {
+      "epoch": 0.16384,
+      "grad_norm": 0.12585826218128204,
+      "learning_rate": 0.00018511340206185567,
+      "loss": 0.5842,
+      "step": 512
+    },
+    {
+      "epoch": 0.16416,
+      "grad_norm": 0.1233866736292839,
+      "learning_rate": 0.0001850721649484536,
+      "loss": 0.6707,
+      "step": 513
+    },
+    {
+      "epoch": 0.16448,
+      "grad_norm": 0.13590243458747864,
+      "learning_rate": 0.00018503092783505156,
+      "loss": 0.645,
+      "step": 514
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.11508861184120178,
+      "learning_rate": 0.0001849896907216495,
+      "loss": 0.698,
+      "step": 515
+    },
+    {
+      "epoch": 0.16512,
+      "grad_norm": 0.11506541818380356,
+      "learning_rate": 0.00018494845360824745,
+      "loss": 0.6834,
+      "step": 516
+    },
+    {
+      "epoch": 0.16544,
+      "grad_norm": 0.1177835762500763,
+      "learning_rate": 0.00018490721649484538,
+      "loss": 0.8327,
+      "step": 517
+    },
+    {
+      "epoch": 0.16576,
+      "grad_norm": 0.12268577516078949,
+      "learning_rate": 0.0001848659793814433,
+      "loss": 0.7925,
+      "step": 518
+    },
+    {
+      "epoch": 0.16608,
+      "grad_norm": 0.11970438808202744,
+      "learning_rate": 0.00018482474226804124,
+      "loss": 0.9855,
+      "step": 519
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.11444945633411407,
+      "learning_rate": 0.00018478350515463917,
+      "loss": 0.6759,
+      "step": 520
+    },
+    {
+      "epoch": 0.16672,
+      "grad_norm": 0.1250583827495575,
+      "learning_rate": 0.00018474226804123713,
+      "loss": 0.7223,
+      "step": 521
+    },
+    {
+      "epoch": 0.16704,
+      "grad_norm": 0.128667950630188,
+      "learning_rate": 0.00018470103092783506,
+      "loss": 0.7038,
+      "step": 522
+    },
+    {
+      "epoch": 0.16736,
+      "grad_norm": 0.1111631840467453,
+      "learning_rate": 0.000184659793814433,
+      "loss": 0.8037,
+      "step": 523
+    },
+    {
+      "epoch": 0.16768,
+      "grad_norm": 0.10735559463500977,
+      "learning_rate": 0.00018461855670103095,
+      "loss": 0.7687,
+      "step": 524
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.12646177411079407,
+      "learning_rate": 0.00018457731958762888,
+      "loss": 0.7467,
+      "step": 525
+    },
+    {
+      "epoch": 0.16832,
+      "grad_norm": 0.11589492112398148,
+      "learning_rate": 0.0001845360824742268,
+      "loss": 0.5764,
+      "step": 526
+    },
+    {
+      "epoch": 0.16864,
+      "grad_norm": 0.11384910345077515,
+      "learning_rate": 0.00018449484536082474,
+      "loss": 0.7205,
+      "step": 527
+    },
+    {
+      "epoch": 0.16896,
+      "grad_norm": 0.11103667318820953,
+      "learning_rate": 0.00018445360824742267,
+      "loss": 0.7596,
+      "step": 528
+    },
+    {
+      "epoch": 0.16928,
+      "grad_norm": 0.11777044087648392,
+      "learning_rate": 0.00018441237113402063,
+      "loss": 0.8103,
+      "step": 529
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.13625359535217285,
+      "learning_rate": 0.00018437113402061856,
+      "loss": 0.8057,
+      "step": 530
+    },
+    {
+      "epoch": 0.16992,
+      "grad_norm": 0.12384586781263351,
+      "learning_rate": 0.0001843298969072165,
+      "loss": 0.6888,
+      "step": 531
+    },
+    {
+      "epoch": 0.17024,
+      "grad_norm": 0.12003818899393082,
+      "learning_rate": 0.00018428865979381445,
+      "loss": 0.6938,
+      "step": 532
+    },
+    {
+      "epoch": 0.17056,
+      "grad_norm": 0.11006835103034973,
+      "learning_rate": 0.00018424742268041239,
+      "loss": 0.8389,
+      "step": 533
+    },
+    {
+      "epoch": 0.17088,
+      "grad_norm": 0.1308056265115738,
+      "learning_rate": 0.00018420618556701032,
+      "loss": 0.6838,
+      "step": 534
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.13694193959236145,
+      "learning_rate": 0.00018416494845360825,
+      "loss": 0.7372,
+      "step": 535
+    },
+    {
+      "epoch": 0.17152,
+      "grad_norm": 0.12474847584962845,
+      "learning_rate": 0.00018412371134020618,
+      "loss": 0.7316,
+      "step": 536
+    },
+    {
+      "epoch": 0.17184,
+      "grad_norm": 0.12851357460021973,
+      "learning_rate": 0.00018408247422680414,
+      "loss": 0.8742,
+      "step": 537
+    },
+    {
+      "epoch": 0.17216,
+      "grad_norm": 0.12235444784164429,
+      "learning_rate": 0.00018404123711340207,
+      "loss": 0.798,
+      "step": 538
+    },
+    {
+      "epoch": 0.17248,
+      "grad_norm": 0.13520556688308716,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.8435,
+      "step": 539
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.10702179372310638,
+      "learning_rate": 0.00018395876288659796,
+      "loss": 0.5605,
+      "step": 540
+    },
+    {
+      "epoch": 0.17312,
+      "grad_norm": 0.13347023725509644,
+      "learning_rate": 0.00018391752577319586,
+      "loss": 0.7539,
+      "step": 541
+    },
+    {
+      "epoch": 0.17344,
+      "grad_norm": 0.12964722514152527,
+      "learning_rate": 0.00018387628865979382,
+      "loss": 0.7302,
+      "step": 542
+    },
+    {
+      "epoch": 0.17376,
+      "grad_norm": 0.1171535775065422,
+      "learning_rate": 0.00018383505154639175,
+      "loss": 0.6735,
+      "step": 543
+    },
+    {
+      "epoch": 0.17408,
+      "grad_norm": 0.11912510544061661,
+      "learning_rate": 0.0001837938144329897,
+      "loss": 0.7924,
+      "step": 544
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.13499023020267487,
+      "learning_rate": 0.00018375257731958764,
+      "loss": 0.6884,
+      "step": 545
+    },
+    {
+      "epoch": 0.17472,
+      "grad_norm": 0.11606519669294357,
+      "learning_rate": 0.00018371134020618557,
+      "loss": 0.6533,
+      "step": 546
+    },
+    {
+      "epoch": 0.17504,
+      "grad_norm": 0.11444912850856781,
+      "learning_rate": 0.00018367010309278353,
+      "loss": 0.6173,
+      "step": 547
+    },
+    {
+      "epoch": 0.17536,
+      "grad_norm": 0.12102066725492477,
+      "learning_rate": 0.00018362886597938144,
+      "loss": 0.7732,
+      "step": 548
+    },
+    {
+      "epoch": 0.17568,
+      "grad_norm": 0.12623833119869232,
+      "learning_rate": 0.0001835876288659794,
+      "loss": 0.8299,
+      "step": 549
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.13104446232318878,
+      "learning_rate": 0.00018354639175257733,
+      "loss": 0.6679,
+      "step": 550
+    },
+    {
+      "epoch": 0.17632,
+      "grad_norm": 0.12092466652393341,
+      "learning_rate": 0.00018350515463917526,
+      "loss": 0.7737,
+      "step": 551
+    },
+    {
+      "epoch": 0.17664,
+      "grad_norm": 0.11604610830545425,
+      "learning_rate": 0.00018346391752577322,
+      "loss": 0.8411,
+      "step": 552
+    },
+    {
+      "epoch": 0.17696,
+      "grad_norm": 0.11258059740066528,
+      "learning_rate": 0.00018342268041237115,
+      "loss": 0.7016,
+      "step": 553
+    },
+    {
+      "epoch": 0.17728,
+      "grad_norm": 0.11493982374668121,
+      "learning_rate": 0.00018338144329896908,
+      "loss": 0.7129,
+      "step": 554
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.13074208796024323,
+      "learning_rate": 0.000183340206185567,
+      "loss": 0.7522,
+      "step": 555
+    },
+    {
+      "epoch": 0.17792,
+      "grad_norm": 0.1120564267039299,
+      "learning_rate": 0.00018329896907216494,
+      "loss": 0.6846,
+      "step": 556
+    },
+    {
+      "epoch": 0.17824,
+      "grad_norm": 0.1323213428258896,
+      "learning_rate": 0.0001832577319587629,
+      "loss": 0.8367,
+      "step": 557
+    },
+    {
+      "epoch": 0.17856,
+      "grad_norm": 0.14067520201206207,
+      "learning_rate": 0.00018321649484536083,
+      "loss": 0.9111,
+      "step": 558
+    },
+    {
+      "epoch": 0.17888,
+      "grad_norm": 0.12386526167392731,
+      "learning_rate": 0.0001831752577319588,
+      "loss": 0.7513,
+      "step": 559
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.11267178505659103,
+      "learning_rate": 0.00018313402061855672,
+      "loss": 0.7561,
+      "step": 560
+    },
+    {
+      "epoch": 0.17952,
+      "grad_norm": 0.12407040596008301,
+      "learning_rate": 0.00018309278350515465,
+      "loss": 0.9464,
+      "step": 561
+    },
+    {
+      "epoch": 0.17984,
+      "grad_norm": 0.12141172587871552,
+      "learning_rate": 0.00018305154639175258,
+      "loss": 0.8564,
+      "step": 562
+    },
+    {
+      "epoch": 0.18016,
+      "grad_norm": 0.11525171250104904,
+      "learning_rate": 0.0001830103092783505,
+      "loss": 0.7085,
+      "step": 563
+    },
+    {
+      "epoch": 0.18048,
+      "grad_norm": 0.1141604334115982,
+      "learning_rate": 0.00018296907216494844,
+      "loss": 0.89,
+      "step": 564
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.117414191365242,
+      "learning_rate": 0.0001829278350515464,
+      "loss": 0.6754,
+      "step": 565
+    },
+    {
+      "epoch": 0.18112,
+      "grad_norm": 0.13193415105342865,
+      "learning_rate": 0.00018288659793814433,
+      "loss": 0.8465,
+      "step": 566
+    },
+    {
+      "epoch": 0.18144,
+      "grad_norm": 0.12107145041227341,
+      "learning_rate": 0.0001828453608247423,
+      "loss": 0.7464,
+      "step": 567
+    },
+    {
+      "epoch": 0.18176,
+      "grad_norm": 0.11851692944765091,
+      "learning_rate": 0.00018280412371134022,
+      "loss": 0.6608,
+      "step": 568
+    },
+    {
+      "epoch": 0.18208,
+      "grad_norm": 0.11890149861574173,
+      "learning_rate": 0.00018276288659793816,
+      "loss": 0.9179,
+      "step": 569
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.11981701850891113,
+      "learning_rate": 0.0001827216494845361,
+      "loss": 0.7348,
+      "step": 570
+    },
+    {
+      "epoch": 0.18272,
+      "grad_norm": 0.13188837468624115,
+      "learning_rate": 0.00018268041237113402,
+      "loss": 0.8143,
+      "step": 571
+    },
+    {
+      "epoch": 0.18304,
+      "grad_norm": 0.12455637753009796,
+      "learning_rate": 0.00018263917525773198,
+      "loss": 0.5797,
+      "step": 572
+    },
+    {
+      "epoch": 0.18336,
+      "grad_norm": 0.12442684173583984,
+      "learning_rate": 0.0001825979381443299,
+      "loss": 0.8271,
+      "step": 573
+    },
+    {
+      "epoch": 0.18368,
+      "grad_norm": 0.12401217222213745,
+      "learning_rate": 0.00018255670103092784,
+      "loss": 0.6963,
+      "step": 574
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.12561100721359253,
+      "learning_rate": 0.0001825154639175258,
+      "loss": 0.7436,
+      "step": 575
+    },
+    {
+      "epoch": 0.18432,
+      "grad_norm": 0.1348034292459488,
+      "learning_rate": 0.00018247422680412373,
+      "loss": 0.677,
+      "step": 576
+    },
+    {
+      "epoch": 0.18464,
+      "grad_norm": 0.12489503622055054,
+      "learning_rate": 0.00018243298969072166,
+      "loss": 0.7475,
+      "step": 577
+    },
+    {
+      "epoch": 0.18496,
+      "grad_norm": 0.11799902468919754,
+      "learning_rate": 0.0001823917525773196,
+      "loss": 0.6301,
+      "step": 578
+    },
+    {
+      "epoch": 0.18528,
+      "grad_norm": 0.12443645298480988,
+      "learning_rate": 0.00018235051546391752,
+      "loss": 0.8342,
+      "step": 579
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1418801099061966,
+      "learning_rate": 0.00018230927835051548,
+      "loss": 0.7459,
+      "step": 580
+    },
+    {
+      "epoch": 0.18592,
+      "grad_norm": 0.13329613208770752,
+      "learning_rate": 0.0001822680412371134,
+      "loss": 0.8183,
+      "step": 581
+    },
+    {
+      "epoch": 0.18624,
+      "grad_norm": 0.11498089879751205,
+      "learning_rate": 0.00018222680412371137,
+      "loss": 0.586,
+      "step": 582
+    },
+    {
+      "epoch": 0.18656,
+      "grad_norm": 0.11994834244251251,
+      "learning_rate": 0.0001821855670103093,
+      "loss": 0.6292,
+      "step": 583
+    },
+    {
+      "epoch": 0.18688,
+      "grad_norm": 0.11894108355045319,
+      "learning_rate": 0.0001821443298969072,
+      "loss": 0.79,
+      "step": 584
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.13187873363494873,
+      "learning_rate": 0.00018210309278350516,
+      "loss": 0.6799,
+      "step": 585
+    },
+    {
+      "epoch": 0.18752,
+      "grad_norm": 0.12553033232688904,
+      "learning_rate": 0.0001820618556701031,
+      "loss": 0.7286,
+      "step": 586
+    },
+    {
+      "epoch": 0.18784,
+      "grad_norm": 0.1164742261171341,
+      "learning_rate": 0.00018202061855670105,
+      "loss": 0.6721,
+      "step": 587
+    },
+    {
+      "epoch": 0.18816,
+      "grad_norm": 0.10284867882728577,
+      "learning_rate": 0.00018197938144329898,
+      "loss": 0.5752,
+      "step": 588
+    },
+    {
+      "epoch": 0.18848,
+      "grad_norm": 0.14761145412921906,
+      "learning_rate": 0.00018193814432989692,
+      "loss": 0.8899,
+      "step": 589
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.11473073065280914,
+      "learning_rate": 0.00018189690721649485,
+      "loss": 0.856,
+      "step": 590
+    },
+    {
+      "epoch": 0.18912,
+      "grad_norm": 0.12335352599620819,
+      "learning_rate": 0.00018185567010309278,
+      "loss": 0.7842,
+      "step": 591
+    },
+    {
+      "epoch": 0.18944,
+      "grad_norm": 0.10649198293685913,
+      "learning_rate": 0.0001818144329896907,
+      "loss": 0.7395,
+      "step": 592
+    },
+    {
+      "epoch": 0.18976,
+      "grad_norm": 0.11945925652980804,
+      "learning_rate": 0.00018177319587628867,
+      "loss": 0.6553,
+      "step": 593
+    },
+    {
+      "epoch": 0.19008,
+      "grad_norm": 0.1164558157324791,
+      "learning_rate": 0.0001817319587628866,
+      "loss": 0.6454,
+      "step": 594
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.12806792557239532,
+      "learning_rate": 0.00018169072164948456,
+      "loss": 0.7041,
+      "step": 595
+    },
+    {
+      "epoch": 0.19072,
+      "grad_norm": 0.12284845858812332,
+      "learning_rate": 0.0001816494845360825,
+      "loss": 0.8651,
+      "step": 596
+    },
+    {
+      "epoch": 0.19104,
+      "grad_norm": 0.12949353456497192,
+      "learning_rate": 0.00018160824742268042,
+      "loss": 0.761,
+      "step": 597
+    },
+    {
+      "epoch": 0.19136,
+      "grad_norm": 0.11860032379627228,
+      "learning_rate": 0.00018156701030927835,
+      "loss": 0.7484,
+      "step": 598
+    },
+    {
+      "epoch": 0.19168,
+      "grad_norm": 0.12866483628749847,
+      "learning_rate": 0.00018152577319587628,
+      "loss": 0.7962,
+      "step": 599
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.1290092021226883,
+      "learning_rate": 0.00018148453608247424,
+      "loss": 0.6357,
+      "step": 600
+    },
+    {
+      "epoch": 0.19232,
+      "grad_norm": 0.12456108629703522,
+      "learning_rate": 0.00018144329896907217,
+      "loss": 0.8116,
+      "step": 601
+    },
+    {
+      "epoch": 0.19264,
+      "grad_norm": 0.11560390144586563,
+      "learning_rate": 0.0001814020618556701,
+      "loss": 0.6152,
+      "step": 602
+    },
+    {
+      "epoch": 0.19296,
+      "grad_norm": 0.1171826645731926,
+      "learning_rate": 0.00018136082474226806,
+      "loss": 0.7314,
+      "step": 603
+    },
+    {
+      "epoch": 0.19328,
+      "grad_norm": 0.11141692847013474,
+      "learning_rate": 0.000181319587628866,
+      "loss": 0.831,
+      "step": 604
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.12226671725511551,
+      "learning_rate": 0.00018127835051546393,
+      "loss": 0.6946,
+      "step": 605
+    },
+    {
+      "epoch": 0.19392,
+      "grad_norm": 0.12684857845306396,
+      "learning_rate": 0.00018123711340206186,
+      "loss": 0.7299,
+      "step": 606
+    },
+    {
+      "epoch": 0.19424,
+      "grad_norm": 0.12957026064395905,
+      "learning_rate": 0.0001811958762886598,
+      "loss": 0.7208,
+      "step": 607
+    },
+    {
+      "epoch": 0.19456,
+      "grad_norm": 0.11711089313030243,
+      "learning_rate": 0.00018115463917525775,
+      "loss": 0.6967,
+      "step": 608
+    },
+    {
+      "epoch": 0.19488,
+      "grad_norm": 0.12728916108608246,
+      "learning_rate": 0.00018111340206185568,
+      "loss": 0.6866,
+      "step": 609
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.11292556673288345,
+      "learning_rate": 0.00018107216494845364,
+      "loss": 0.6493,
+      "step": 610
+    },
+    {
+      "epoch": 0.19552,
+      "grad_norm": 0.11602400243282318,
+      "learning_rate": 0.00018103092783505157,
+      "loss": 0.7802,
+      "step": 611
+    },
+    {
+      "epoch": 0.19584,
+      "grad_norm": 0.12061775475740433,
+      "learning_rate": 0.0001809896907216495,
+      "loss": 0.7336,
+      "step": 612
+    },
+    {
+      "epoch": 0.19616,
+      "grad_norm": 0.13388070464134216,
+      "learning_rate": 0.00018094845360824743,
+      "loss": 0.6978,
+      "step": 613
+    },
+    {
+      "epoch": 0.19648,
+      "grad_norm": 0.11619807779788971,
+      "learning_rate": 0.00018090721649484536,
+      "loss": 0.7327,
+      "step": 614
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.12670999765396118,
+      "learning_rate": 0.00018086597938144332,
+      "loss": 0.7561,
+      "step": 615
+    },
+    {
+      "epoch": 0.19712,
+      "grad_norm": 0.11523107439279556,
+      "learning_rate": 0.00018082474226804125,
+      "loss": 0.7343,
+      "step": 616
+    },
+    {
+      "epoch": 0.19744,
+      "grad_norm": 0.13320648670196533,
+      "learning_rate": 0.00018078350515463918,
+      "loss": 0.8011,
+      "step": 617
+    },
+    {
+      "epoch": 0.19776,
+      "grad_norm": 0.1562347114086151,
+      "learning_rate": 0.00018074226804123714,
+      "loss": 0.8,
+      "step": 618
+    },
+    {
+      "epoch": 0.19808,
+      "grad_norm": 0.13111111521720886,
+      "learning_rate": 0.00018070103092783507,
+      "loss": 0.7242,
+      "step": 619
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.13651488721370697,
+      "learning_rate": 0.000180659793814433,
+      "loss": 0.7653,
+      "step": 620
+    },
+    {
+      "epoch": 0.19872,
+      "grad_norm": 0.12414918839931488,
+      "learning_rate": 0.00018061855670103093,
+      "loss": 0.803,
+      "step": 621
+    },
+    {
+      "epoch": 0.19904,
+      "grad_norm": 0.11437831819057465,
+      "learning_rate": 0.00018057731958762887,
+      "loss": 0.6214,
+      "step": 622
+    },
+    {
+      "epoch": 0.19936,
+      "grad_norm": 0.10947408527135849,
+      "learning_rate": 0.00018053608247422682,
+      "loss": 0.6069,
+      "step": 623
+    },
+    {
+      "epoch": 0.19968,
+      "grad_norm": 0.11896886676549911,
+      "learning_rate": 0.00018049484536082475,
+      "loss": 0.8071,
+      "step": 624
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10557591170072556,
+      "learning_rate": 0.00018045360824742269,
+      "loss": 0.6428,
+      "step": 625
+    },
+    {
+      "epoch": 0.20032,
+      "grad_norm": 0.11610836535692215,
+      "learning_rate": 0.00018041237113402062,
+      "loss": 0.6034,
+      "step": 626
+    },
+    {
+      "epoch": 0.20064,
+      "grad_norm": 0.11742173135280609,
+      "learning_rate": 0.00018037113402061855,
+      "loss": 0.7368,
+      "step": 627
+    },
+    {
+      "epoch": 0.20096,
+      "grad_norm": 0.11513591557741165,
+      "learning_rate": 0.0001803298969072165,
+      "loss": 0.5753,
+      "step": 628
+    },
+    {
+      "epoch": 0.20128,
+      "grad_norm": 0.12790197134017944,
+      "learning_rate": 0.00018028865979381444,
+      "loss": 0.6109,
+      "step": 629
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.12305860221385956,
+      "learning_rate": 0.00018024742268041237,
+      "loss": 0.6894,
+      "step": 630
+    },
+    {
+      "epoch": 0.20192,
+      "grad_norm": 0.1050051599740982,
+      "learning_rate": 0.00018020618556701033,
+      "loss": 0.7507,
+      "step": 631
+    },
+    {
+      "epoch": 0.20224,
+      "grad_norm": 0.12011159211397171,
+      "learning_rate": 0.00018016494845360826,
+      "loss": 0.6234,
+      "step": 632
+    },
+    {
+      "epoch": 0.20256,
+      "grad_norm": 0.13481462001800537,
+      "learning_rate": 0.0001801237113402062,
+      "loss": 0.7408,
+      "step": 633
+    },
+    {
+      "epoch": 0.20288,
+      "grad_norm": 0.11771773546934128,
+      "learning_rate": 0.00018008247422680412,
+      "loss": 0.7283,
+      "step": 634
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.12805534899234772,
+      "learning_rate": 0.00018004123711340205,
+      "loss": 0.6377,
+      "step": 635
+    },
+    {
+      "epoch": 0.20352,
+      "grad_norm": 0.1451328545808792,
+      "learning_rate": 0.00018,
+      "loss": 0.6929,
+      "step": 636
+    },
+    {
+      "epoch": 0.20384,
+      "grad_norm": 0.116309255361557,
+      "learning_rate": 0.00017995876288659794,
+      "loss": 0.8133,
+      "step": 637
+    },
+    {
+      "epoch": 0.20416,
+      "grad_norm": 0.11883087456226349,
+      "learning_rate": 0.0001799175257731959,
+      "loss": 0.7036,
+      "step": 638
+    },
+    {
+      "epoch": 0.20448,
+      "grad_norm": 0.11787105351686478,
+      "learning_rate": 0.00017987628865979383,
+      "loss": 0.7313,
+      "step": 639
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.10839900374412537,
+      "learning_rate": 0.00017983505154639176,
+      "loss": 0.5725,
+      "step": 640
+    },
+    {
+      "epoch": 0.20512,
+      "grad_norm": 0.12540479004383087,
+      "learning_rate": 0.0001797938144329897,
+      "loss": 0.6349,
+      "step": 641
+    },
+    {
+      "epoch": 0.20544,
+      "grad_norm": 0.1231970563530922,
+      "learning_rate": 0.00017975257731958763,
+      "loss": 0.7068,
+      "step": 642
+    },
+    {
+      "epoch": 0.20576,
+      "grad_norm": 0.1263824850320816,
+      "learning_rate": 0.00017971134020618558,
+      "loss": 0.861,
+      "step": 643
+    },
+    {
+      "epoch": 0.20608,
+      "grad_norm": 0.134733647108078,
+      "learning_rate": 0.00017967010309278352,
+      "loss": 0.67,
+      "step": 644
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.14626337587833405,
+      "learning_rate": 0.00017962886597938145,
+      "loss": 0.6905,
+      "step": 645
+    },
+    {
+      "epoch": 0.20672,
+      "grad_norm": 0.11077899485826492,
+      "learning_rate": 0.0001795876288659794,
+      "loss": 0.6866,
+      "step": 646
+    },
+    {
+      "epoch": 0.20704,
+      "grad_norm": 0.12598447501659393,
+      "learning_rate": 0.00017954639175257734,
+      "loss": 0.7583,
+      "step": 647
+    },
+    {
+      "epoch": 0.20736,
+      "grad_norm": 0.12656626105308533,
+      "learning_rate": 0.00017950515463917527,
+      "loss": 0.679,
+      "step": 648
+    },
+    {
+      "epoch": 0.20768,
+      "grad_norm": 0.1403527706861496,
+      "learning_rate": 0.0001794639175257732,
+      "loss": 0.8081,
+      "step": 649
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.11713801324367523,
+      "learning_rate": 0.00017942268041237113,
+      "loss": 0.7322,
+      "step": 650
+    },
+    {
+      "epoch": 0.20832,
+      "grad_norm": 0.1281721144914627,
+      "learning_rate": 0.0001793814432989691,
+      "loss": 0.8457,
+      "step": 651
+    },
+    {
+      "epoch": 0.20864,
+      "grad_norm": 0.11331729590892792,
+      "learning_rate": 0.00017934020618556702,
+      "loss": 0.6479,
+      "step": 652
+    },
+    {
+      "epoch": 0.20896,
+      "grad_norm": 0.11600172519683838,
+      "learning_rate": 0.00017929896907216495,
+      "loss": 0.7144,
+      "step": 653
+    },
+    {
+      "epoch": 0.20928,
+      "grad_norm": 0.13482129573822021,
+      "learning_rate": 0.0001792577319587629,
+      "loss": 0.7639,
+      "step": 654
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.12466888129711151,
+      "learning_rate": 0.00017921649484536081,
+      "loss": 0.6413,
+      "step": 655
+    },
+    {
+      "epoch": 0.20992,
+      "grad_norm": 0.11935215443372726,
+      "learning_rate": 0.00017917525773195877,
+      "loss": 0.7551,
+      "step": 656
+    },
+    {
+      "epoch": 0.21024,
+      "grad_norm": 0.12447444349527359,
+      "learning_rate": 0.0001791340206185567,
+      "loss": 0.7171,
+      "step": 657
+    },
+    {
+      "epoch": 0.21056,
+      "grad_norm": 0.1334541141986847,
+      "learning_rate": 0.00017909278350515463,
+      "loss": 0.9159,
+      "step": 658
+    },
+    {
+      "epoch": 0.21088,
+      "grad_norm": 0.111027792096138,
+      "learning_rate": 0.0001790515463917526,
+      "loss": 0.6894,
+      "step": 659
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.12192652374505997,
+      "learning_rate": 0.00017901030927835052,
+      "loss": 0.7354,
+      "step": 660
+    },
+    {
+      "epoch": 0.21152,
+      "grad_norm": 0.12837322056293488,
+      "learning_rate": 0.00017896907216494848,
+      "loss": 0.7535,
+      "step": 661
+    },
+    {
+      "epoch": 0.21184,
+      "grad_norm": 0.1143856793642044,
+      "learning_rate": 0.0001789278350515464,
+      "loss": 0.6189,
+      "step": 662
+    },
+    {
+      "epoch": 0.21216,
+      "grad_norm": 0.12996436655521393,
+      "learning_rate": 0.00017888659793814432,
+      "loss": 0.7194,
+      "step": 663
+    },
+    {
+      "epoch": 0.21248,
+      "grad_norm": 0.13088072836399078,
+      "learning_rate": 0.00017884536082474228,
+      "loss": 0.6356,
+      "step": 664
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.11904153227806091,
+      "learning_rate": 0.0001788041237113402,
+      "loss": 0.5998,
+      "step": 665
+    },
+    {
+      "epoch": 0.21312,
+      "grad_norm": 0.11976154148578644,
+      "learning_rate": 0.00017876288659793817,
+      "loss": 0.7417,
+      "step": 666
+    },
+    {
+      "epoch": 0.21344,
+      "grad_norm": 0.1175193041563034,
+      "learning_rate": 0.0001787216494845361,
+      "loss": 0.741,
+      "step": 667
+    },
+    {
+      "epoch": 0.21376,
+      "grad_norm": 0.1151885837316513,
+      "learning_rate": 0.00017868041237113403,
+      "loss": 0.7779,
+      "step": 668
+    },
+    {
+      "epoch": 0.21408,
+      "grad_norm": 0.1385083645582199,
+      "learning_rate": 0.00017863917525773196,
+      "loss": 0.7296,
+      "step": 669
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1304243803024292,
+      "learning_rate": 0.0001785979381443299,
+      "loss": 0.7085,
+      "step": 670
+    },
+    {
+      "epoch": 0.21472,
+      "grad_norm": 0.11901744455099106,
+      "learning_rate": 0.00017855670103092785,
+      "loss": 0.7005,
+      "step": 671
+    },
+    {
+      "epoch": 0.21504,
+      "grad_norm": 0.11635670065879822,
+      "learning_rate": 0.00017851546391752578,
+      "loss": 0.7824,
+      "step": 672
+    },
+    {
+      "epoch": 0.21536,
+      "grad_norm": 0.14305877685546875,
+      "learning_rate": 0.0001784742268041237,
+      "loss": 0.6263,
+      "step": 673
+    },
+    {
+      "epoch": 0.21568,
+      "grad_norm": 0.11984144151210785,
+      "learning_rate": 0.00017843298969072167,
+      "loss": 0.6518,
+      "step": 674
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.10570491850376129,
+      "learning_rate": 0.0001783917525773196,
+      "loss": 0.6311,
+      "step": 675
+    },
+    {
+      "epoch": 0.21632,
+      "grad_norm": 0.13266734778881073,
+      "learning_rate": 0.00017835051546391753,
+      "loss": 0.7975,
+      "step": 676
+    },
+    {
+      "epoch": 0.21664,
+      "grad_norm": 0.12022270262241364,
+      "learning_rate": 0.00017830927835051546,
+      "loss": 0.6691,
+      "step": 677
+    },
+    {
+      "epoch": 0.21696,
+      "grad_norm": 0.1266927272081375,
+      "learning_rate": 0.0001782680412371134,
+      "loss": 0.7211,
+      "step": 678
+    },
+    {
+      "epoch": 0.21728,
+      "grad_norm": 0.13459637761116028,
+      "learning_rate": 0.00017822680412371135,
+      "loss": 0.7911,
+      "step": 679
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.14075347781181335,
+      "learning_rate": 0.00017818556701030929,
+      "loss": 0.8012,
+      "step": 680
+    },
+    {
+      "epoch": 0.21792,
+      "grad_norm": 0.12798281013965607,
+      "learning_rate": 0.00017814432989690724,
+      "loss": 0.6135,
+      "step": 681
+    },
+    {
+      "epoch": 0.21824,
+      "grad_norm": 0.14813600480556488,
+      "learning_rate": 0.00017810309278350518,
+      "loss": 0.7988,
+      "step": 682
+    },
+    {
+      "epoch": 0.21856,
+      "grad_norm": 0.14158020913600922,
+      "learning_rate": 0.0001780618556701031,
+      "loss": 0.7725,
+      "step": 683
+    },
+    {
+      "epoch": 0.21888,
+      "grad_norm": 0.10573446750640869,
+      "learning_rate": 0.00017802061855670104,
+      "loss": 0.7956,
+      "step": 684
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.14443108439445496,
+      "learning_rate": 0.00017797938144329897,
+      "loss": 0.8615,
+      "step": 685
+    },
+    {
+      "epoch": 0.21952,
+      "grad_norm": 0.1228407770395279,
+      "learning_rate": 0.0001779381443298969,
+      "loss": 0.706,
+      "step": 686
+    },
+    {
+      "epoch": 0.21984,
+      "grad_norm": 0.1218361034989357,
+      "learning_rate": 0.00017789690721649486,
+      "loss": 0.6525,
+      "step": 687
+    },
+    {
+      "epoch": 0.22016,
+      "grad_norm": 0.12609677016735077,
+      "learning_rate": 0.0001778556701030928,
+      "loss": 0.5638,
+      "step": 688
+    },
+    {
+      "epoch": 0.22048,
+      "grad_norm": 0.1489102840423584,
+      "learning_rate": 0.00017781443298969075,
+      "loss": 0.641,
+      "step": 689
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.11327382177114487,
+      "learning_rate": 0.00017777319587628868,
+      "loss": 0.7372,
+      "step": 690
+    },
+    {
+      "epoch": 0.22112,
+      "grad_norm": 0.12614741921424866,
+      "learning_rate": 0.00017773195876288658,
+      "loss": 0.7819,
+      "step": 691
+    },
+    {
+      "epoch": 0.22144,
+      "grad_norm": 0.11078184098005295,
+      "learning_rate": 0.00017769072164948454,
+      "loss": 0.8376,
+      "step": 692
+    },
+    {
+      "epoch": 0.22176,
+      "grad_norm": 0.13330969214439392,
+      "learning_rate": 0.00017764948453608247,
+      "loss": 0.771,
+      "step": 693
+    },
+    {
+      "epoch": 0.22208,
+      "grad_norm": 0.14074283838272095,
+      "learning_rate": 0.00017760824742268043,
+      "loss": 0.7675,
+      "step": 694
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.12653911113739014,
+      "learning_rate": 0.00017756701030927836,
+      "loss": 0.7578,
+      "step": 695
+    },
+    {
+      "epoch": 0.22272,
+      "grad_norm": 0.13229238986968994,
+      "learning_rate": 0.0001775257731958763,
+      "loss": 0.5843,
+      "step": 696
+    },
+    {
+      "epoch": 0.22304,
+      "grad_norm": 0.14447958767414093,
+      "learning_rate": 0.00017748453608247425,
+      "loss": 0.8251,
+      "step": 697
+    },
+    {
+      "epoch": 0.22336,
+      "grad_norm": 0.12437491863965988,
+      "learning_rate": 0.00017744329896907216,
+      "loss": 0.7624,
+      "step": 698
+    },
+    {
+      "epoch": 0.22368,
+      "grad_norm": 0.13665013015270233,
+      "learning_rate": 0.00017740206185567012,
+      "loss": 0.6546,
+      "step": 699
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.12415925413370132,
+      "learning_rate": 0.00017736082474226805,
+      "loss": 0.5989,
+      "step": 700
+    },
+    {
+      "epoch": 0.22432,
+      "grad_norm": 0.13429298996925354,
+      "learning_rate": 0.00017731958762886598,
+      "loss": 0.7283,
+      "step": 701
+    },
+    {
+      "epoch": 0.22464,
+      "grad_norm": 0.11220762878656387,
+      "learning_rate": 0.00017727835051546394,
+      "loss": 0.844,
+      "step": 702
+    },
+    {
+      "epoch": 0.22496,
+      "grad_norm": 0.15322859585285187,
+      "learning_rate": 0.00017723711340206187,
+      "loss": 0.744,
+      "step": 703
+    },
+    {
+      "epoch": 0.22528,
+      "grad_norm": 0.10617350786924362,
+      "learning_rate": 0.0001771958762886598,
+      "loss": 0.7462,
+      "step": 704
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.12569446861743927,
+      "learning_rate": 0.00017715463917525773,
+      "loss": 0.7465,
+      "step": 705
+    },
+    {
+      "epoch": 0.22592,
+      "grad_norm": 0.13956156373023987,
+      "learning_rate": 0.00017711340206185566,
+      "loss": 0.7465,
+      "step": 706
+    },
+    {
+      "epoch": 0.22624,
+      "grad_norm": 0.13465113937854767,
+      "learning_rate": 0.00017707216494845362,
+      "loss": 0.8502,
+      "step": 707
+    },
+    {
+      "epoch": 0.22656,
+      "grad_norm": 0.11806920915842056,
+      "learning_rate": 0.00017703092783505155,
+      "loss": 0.6246,
+      "step": 708
+    },
+    {
+      "epoch": 0.22688,
+      "grad_norm": 0.1319315880537033,
+      "learning_rate": 0.0001769896907216495,
+      "loss": 0.6639,
+      "step": 709
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.14387674629688263,
+      "learning_rate": 0.00017694845360824744,
+      "loss": 0.8698,
+      "step": 710
+    },
+    {
+      "epoch": 0.22752,
+      "grad_norm": 0.12522293627262115,
+      "learning_rate": 0.00017690721649484537,
+      "loss": 0.7884,
+      "step": 711
+    },
+    {
+      "epoch": 0.22784,
+      "grad_norm": 0.11482946574687958,
+      "learning_rate": 0.0001768659793814433,
+      "loss": 0.7562,
+      "step": 712
+    },
+    {
+      "epoch": 0.22816,
+      "grad_norm": 0.11533328890800476,
+      "learning_rate": 0.00017682474226804123,
+      "loss": 0.6228,
+      "step": 713
+    },
+    {
+      "epoch": 0.22848,
+      "grad_norm": 0.12137199193239212,
+      "learning_rate": 0.00017678350515463917,
+      "loss": 0.8835,
+      "step": 714
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.11885856837034225,
+      "learning_rate": 0.00017674226804123712,
+      "loss": 0.6241,
+      "step": 715
+    },
+    {
+      "epoch": 0.22912,
+      "grad_norm": 0.1262994110584259,
+      "learning_rate": 0.00017670103092783506,
+      "loss": 0.715,
+      "step": 716
+    },
+    {
+      "epoch": 0.22944,
+      "grad_norm": 0.1313248574733734,
+      "learning_rate": 0.00017665979381443301,
+      "loss": 0.8368,
+      "step": 717
+    },
+    {
+      "epoch": 0.22976,
+      "grad_norm": 0.15882809460163116,
+      "learning_rate": 0.00017661855670103095,
+      "loss": 0.8688,
+      "step": 718
+    },
+    {
+      "epoch": 0.23008,
+      "grad_norm": 0.13610002398490906,
+      "learning_rate": 0.00017657731958762888,
+      "loss": 0.7367,
+      "step": 719
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.11154987663030624,
+      "learning_rate": 0.0001765360824742268,
+      "loss": 0.8026,
+      "step": 720
+    },
+    {
+      "epoch": 0.23072,
+      "grad_norm": 0.14375466108322144,
+      "learning_rate": 0.00017649484536082474,
+      "loss": 0.8038,
+      "step": 721
+    },
+    {
+      "epoch": 0.23104,
+      "grad_norm": 0.13212132453918457,
+      "learning_rate": 0.0001764536082474227,
+      "loss": 0.9008,
+      "step": 722
+    },
+    {
+      "epoch": 0.23136,
+      "grad_norm": 0.1395123153924942,
+      "learning_rate": 0.00017641237113402063,
+      "loss": 0.7491,
+      "step": 723
+    },
+    {
+      "epoch": 0.23168,
+      "grad_norm": 0.11533952504396439,
+      "learning_rate": 0.00017637113402061856,
+      "loss": 0.6714,
+      "step": 724
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.10333327949047089,
+      "learning_rate": 0.00017632989690721652,
+      "loss": 0.6643,
+      "step": 725
+    },
+    {
+      "epoch": 0.23232,
+      "grad_norm": 0.12025081366300583,
+      "learning_rate": 0.00017628865979381445,
+      "loss": 0.6639,
+      "step": 726
+    },
+    {
+      "epoch": 0.23264,
+      "grad_norm": 0.13901305198669434,
+      "learning_rate": 0.00017624742268041238,
+      "loss": 0.7138,
+      "step": 727
+    },
+    {
+      "epoch": 0.23296,
+      "grad_norm": 0.14389030635356903,
+      "learning_rate": 0.0001762061855670103,
+      "loss": 0.7785,
+      "step": 728
+    },
+    {
+      "epoch": 0.23328,
+      "grad_norm": 0.11535743623971939,
+      "learning_rate": 0.00017616494845360824,
+      "loss": 0.715,
+      "step": 729
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.11448106914758682,
+      "learning_rate": 0.0001761237113402062,
+      "loss": 0.7739,
+      "step": 730
+    },
+    {
+      "epoch": 0.23392,
+      "grad_norm": 0.1345527321100235,
+      "learning_rate": 0.00017608247422680413,
+      "loss": 0.6493,
+      "step": 731
+    },
+    {
+      "epoch": 0.23424,
+      "grad_norm": 0.12693467736244202,
+      "learning_rate": 0.0001760412371134021,
+      "loss": 0.7164,
+      "step": 732
+    },
+    {
+      "epoch": 0.23456,
+      "grad_norm": 0.12556055188179016,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.7162,
+      "step": 733
+    },
+    {
+      "epoch": 0.23488,
+      "grad_norm": 0.13427414000034332,
+      "learning_rate": 0.00017595876288659793,
+      "loss": 0.7169,
+      "step": 734
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.1384212076663971,
+      "learning_rate": 0.00017591752577319589,
+      "loss": 0.8806,
+      "step": 735
+    },
+    {
+      "epoch": 0.23552,
+      "grad_norm": 0.1685527116060257,
+      "learning_rate": 0.00017587628865979382,
+      "loss": 0.8755,
+      "step": 736
+    },
+    {
+      "epoch": 0.23584,
+      "grad_norm": 0.12363595515489578,
+      "learning_rate": 0.00017583505154639177,
+      "loss": 0.7556,
+      "step": 737
+    },
+    {
+      "epoch": 0.23616,
+      "grad_norm": 0.121871218085289,
+      "learning_rate": 0.0001757938144329897,
+      "loss": 0.7021,
+      "step": 738
+    },
+    {
+      "epoch": 0.23648,
+      "grad_norm": 0.133682981133461,
+      "learning_rate": 0.00017575257731958764,
+      "loss": 0.8063,
+      "step": 739
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.10897425562143326,
+      "learning_rate": 0.00017571134020618557,
+      "loss": 0.7153,
+      "step": 740
+    },
+    {
+      "epoch": 0.23712,
+      "grad_norm": 0.1330791562795639,
+      "learning_rate": 0.0001756701030927835,
+      "loss": 0.7005,
+      "step": 741
+    },
+    {
+      "epoch": 0.23744,
+      "grad_norm": 0.13002124428749084,
+      "learning_rate": 0.00017562886597938146,
+      "loss": 0.7188,
+      "step": 742
+    },
+    {
+      "epoch": 0.23776,
+      "grad_norm": 0.11166704446077347,
+      "learning_rate": 0.0001755876288659794,
+      "loss": 0.6922,
+      "step": 743
+    },
+    {
+      "epoch": 0.23808,
+      "grad_norm": 0.1195744052529335,
+      "learning_rate": 0.00017554639175257732,
+      "loss": 0.6846,
+      "step": 744
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.12304572016000748,
+      "learning_rate": 0.00017550515463917528,
+      "loss": 0.7575,
+      "step": 745
+    },
+    {
+      "epoch": 0.23872,
+      "grad_norm": 0.15100136399269104,
+      "learning_rate": 0.0001754639175257732,
+      "loss": 0.9129,
+      "step": 746
+    },
+    {
+      "epoch": 0.23904,
+      "grad_norm": 0.11369301378726959,
+      "learning_rate": 0.00017542268041237114,
+      "loss": 0.8557,
+      "step": 747
+    },
+    {
+      "epoch": 0.23936,
+      "grad_norm": 0.14192970097064972,
+      "learning_rate": 0.00017538144329896907,
+      "loss": 0.7254,
+      "step": 748
+    },
+    {
+      "epoch": 0.23968,
+      "grad_norm": 0.11873924732208252,
+      "learning_rate": 0.000175340206185567,
+      "loss": 0.7028,
+      "step": 749
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1388227790594101,
+      "learning_rate": 0.00017529896907216496,
+      "loss": 0.8156,
+      "step": 750
+    },
+    {
+      "epoch": 0.24032,
+      "grad_norm": 0.1263178288936615,
+      "learning_rate": 0.0001752577319587629,
+      "loss": 0.7294,
+      "step": 751
+    },
+    {
+      "epoch": 0.24064,
+      "grad_norm": 0.1199144572019577,
+      "learning_rate": 0.00017521649484536083,
+      "loss": 0.6154,
+      "step": 752
+    },
+    {
+      "epoch": 0.24096,
+      "grad_norm": 0.1196129247546196,
+      "learning_rate": 0.00017517525773195878,
+      "loss": 0.6223,
+      "step": 753
+    },
+    {
+      "epoch": 0.24128,
+      "grad_norm": 0.1296255737543106,
+      "learning_rate": 0.00017513402061855671,
+      "loss": 0.7155,
+      "step": 754
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.13744045794010162,
+      "learning_rate": 0.00017509278350515465,
+      "loss": 0.7899,
+      "step": 755
+    },
+    {
+      "epoch": 0.24192,
+      "grad_norm": 0.136393204331398,
+      "learning_rate": 0.00017505154639175258,
+      "loss": 0.7931,
+      "step": 756
+    },
+    {
+      "epoch": 0.24224,
+      "grad_norm": 0.12587614357471466,
+      "learning_rate": 0.0001750103092783505,
+      "loss": 0.7043,
+      "step": 757
+    },
+    {
+      "epoch": 0.24256,
+      "grad_norm": 0.12651780247688293,
+      "learning_rate": 0.00017496907216494847,
+      "loss": 0.711,
+      "step": 758
+    },
+    {
+      "epoch": 0.24288,
+      "grad_norm": 0.12235745042562485,
+      "learning_rate": 0.0001749278350515464,
+      "loss": 0.8063,
+      "step": 759
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.14242300391197205,
+      "learning_rate": 0.00017488659793814436,
+      "loss": 0.6157,
+      "step": 760
+    },
+    {
+      "epoch": 0.24352,
+      "grad_norm": 0.12730719149112701,
+      "learning_rate": 0.0001748453608247423,
+      "loss": 0.8741,
+      "step": 761
+    },
+    {
+      "epoch": 0.24384,
+      "grad_norm": 0.12228512763977051,
+      "learning_rate": 0.00017480412371134022,
+      "loss": 0.6424,
+      "step": 762
+    },
+    {
+      "epoch": 0.24416,
+      "grad_norm": 0.12545928359031677,
+      "learning_rate": 0.00017476288659793815,
+      "loss": 0.7231,
+      "step": 763
+    },
+    {
+      "epoch": 0.24448,
+      "grad_norm": 0.14945553243160248,
+      "learning_rate": 0.00017472164948453608,
+      "loss": 0.7147,
+      "step": 764
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.12359154969453812,
+      "learning_rate": 0.00017468041237113404,
+      "loss": 0.689,
+      "step": 765
+    },
+    {
+      "epoch": 0.24512,
+      "grad_norm": 0.1245129331946373,
+      "learning_rate": 0.00017463917525773197,
+      "loss": 0.7739,
+      "step": 766
+    },
+    {
+      "epoch": 0.24544,
+      "grad_norm": 0.12101677060127258,
+      "learning_rate": 0.0001745979381443299,
+      "loss": 0.6389,
+      "step": 767
+    },
+    {
+      "epoch": 0.24576,
+      "grad_norm": 0.12827037274837494,
+      "learning_rate": 0.00017455670103092786,
+      "loss": 0.7213,
+      "step": 768
+    },
+    {
+      "epoch": 0.24608,
+      "grad_norm": 0.11057371646165848,
+      "learning_rate": 0.0001745154639175258,
+      "loss": 0.5549,
+      "step": 769
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.10602620244026184,
+      "learning_rate": 0.00017447422680412372,
+      "loss": 0.663,
+      "step": 770
+    },
+    {
+      "epoch": 0.24672,
+      "grad_norm": 0.1476941555738449,
+      "learning_rate": 0.00017443298969072165,
+      "loss": 0.6113,
+      "step": 771
+    },
+    {
+      "epoch": 0.24704,
+      "grad_norm": 0.1098361536860466,
+      "learning_rate": 0.00017439175257731959,
+      "loss": 0.6189,
+      "step": 772
+    },
+    {
+      "epoch": 0.24736,
+      "grad_norm": 0.12340857088565826,
+      "learning_rate": 0.00017435051546391754,
+      "loss": 0.6615,
+      "step": 773
+    },
+    {
+      "epoch": 0.24768,
+      "grad_norm": 0.1320737898349762,
+      "learning_rate": 0.00017430927835051548,
+      "loss": 0.6293,
+      "step": 774
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.12213713675737381,
+      "learning_rate": 0.0001742680412371134,
+      "loss": 0.6703,
+      "step": 775
+    },
+    {
+      "epoch": 0.24832,
+      "grad_norm": 0.13890238106250763,
+      "learning_rate": 0.00017422680412371134,
+      "loss": 0.7381,
+      "step": 776
+    },
+    {
+      "epoch": 0.24864,
+      "grad_norm": 0.13525603711605072,
+      "learning_rate": 0.00017418556701030927,
+      "loss": 0.8244,
+      "step": 777
+    },
+    {
+      "epoch": 0.24896,
+      "grad_norm": 0.1357133686542511,
+      "learning_rate": 0.00017414432989690723,
+      "loss": 0.6428,
+      "step": 778
+    },
+    {
+      "epoch": 0.24928,
+      "grad_norm": 0.13172458112239838,
+      "learning_rate": 0.00017410309278350516,
+      "loss": 0.7103,
+      "step": 779
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.14605680108070374,
+      "learning_rate": 0.0001740618556701031,
+      "loss": 0.7102,
+      "step": 780
+    },
+    {
+      "epoch": 0.24992,
+      "grad_norm": 0.14995813369750977,
+      "learning_rate": 0.00017402061855670105,
+      "loss": 0.9,
+      "step": 781
+    },
+    {
+      "epoch": 0.25024,
+      "grad_norm": 0.13214722275733948,
+      "learning_rate": 0.00017397938144329898,
+      "loss": 0.8016,
+      "step": 782
+    },
+    {
+      "epoch": 0.25056,
+      "grad_norm": 0.12392943352460861,
+      "learning_rate": 0.0001739381443298969,
+      "loss": 0.5207,
+      "step": 783
+    },
+    {
+      "epoch": 0.25088,
+      "grad_norm": 0.13737192749977112,
+      "learning_rate": 0.00017389690721649484,
+      "loss": 0.7711,
+      "step": 784
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.1214001253247261,
+      "learning_rate": 0.00017385567010309277,
+      "loss": 0.6856,
+      "step": 785
+    },
+    {
+      "epoch": 0.25152,
+      "grad_norm": 0.11796026676893234,
+      "learning_rate": 0.00017381443298969073,
+      "loss": 0.6661,
+      "step": 786
+    },
+    {
+      "epoch": 0.25184,
+      "grad_norm": 0.11840114742517471,
+      "learning_rate": 0.00017377319587628866,
+      "loss": 0.6285,
+      "step": 787
+    },
+    {
+      "epoch": 0.25216,
+      "grad_norm": 0.12398465722799301,
+      "learning_rate": 0.00017373195876288662,
+      "loss": 0.6363,
+      "step": 788
+    },
+    {
+      "epoch": 0.25248,
+      "grad_norm": 0.12396920472383499,
+      "learning_rate": 0.00017369072164948455,
+      "loss": 0.8027,
+      "step": 789
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.10319101810455322,
+      "learning_rate": 0.00017364948453608248,
+      "loss": 0.5851,
+      "step": 790
+    },
+    {
+      "epoch": 0.25312,
+      "grad_norm": 0.14459432661533356,
+      "learning_rate": 0.00017360824742268042,
+      "loss": 0.9323,
+      "step": 791
+    },
+    {
+      "epoch": 0.25344,
+      "grad_norm": 0.1221664696931839,
+      "learning_rate": 0.00017356701030927835,
+      "loss": 0.6041,
+      "step": 792
+    },
+    {
+      "epoch": 0.25376,
+      "grad_norm": 0.118996761739254,
+      "learning_rate": 0.0001735257731958763,
+      "loss": 0.8632,
+      "step": 793
+    },
+    {
+      "epoch": 0.25408,
+      "grad_norm": 0.13408184051513672,
+      "learning_rate": 0.00017348453608247424,
+      "loss": 0.7143,
+      "step": 794
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.13923995196819305,
+      "learning_rate": 0.00017344329896907217,
+      "loss": 0.7232,
+      "step": 795
+    },
+    {
+      "epoch": 0.25472,
+      "grad_norm": 0.12153901159763336,
+      "learning_rate": 0.00017340206185567013,
+      "loss": 0.6634,
+      "step": 796
+    },
+    {
+      "epoch": 0.25504,
+      "grad_norm": 0.14275431632995605,
+      "learning_rate": 0.00017336082474226806,
+      "loss": 0.6741,
+      "step": 797
+    },
+    {
+      "epoch": 0.25536,
+      "grad_norm": 0.12189670652151108,
+      "learning_rate": 0.000173319587628866,
+      "loss": 0.776,
+      "step": 798
+    },
+    {
+      "epoch": 0.25568,
+      "grad_norm": 0.12858492136001587,
+      "learning_rate": 0.00017327835051546392,
+      "loss": 0.7927,
+      "step": 799
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.13049420714378357,
+      "learning_rate": 0.00017323711340206185,
+      "loss": 0.717,
+      "step": 800
+    },
+    {
+      "epoch": 0.25632,
+      "grad_norm": 0.13621200621128082,
+      "learning_rate": 0.0001731958762886598,
+      "loss": 0.6823,
+      "step": 801
+    },
+    {
+      "epoch": 0.25664,
+      "grad_norm": 0.13778063654899597,
+      "learning_rate": 0.00017315463917525774,
+      "loss": 0.7343,
+      "step": 802
+    },
+    {
+      "epoch": 0.25696,
+      "grad_norm": 0.11168331652879715,
+      "learning_rate": 0.00017311340206185567,
+      "loss": 0.6899,
+      "step": 803
+    },
+    {
+      "epoch": 0.25728,
+      "grad_norm": 0.11649805307388306,
+      "learning_rate": 0.00017307216494845363,
+      "loss": 0.7278,
+      "step": 804
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.13680049777030945,
+      "learning_rate": 0.00017303092783505154,
+      "loss": 0.59,
+      "step": 805
+    },
+    {
+      "epoch": 0.25792,
+      "grad_norm": 0.1319751888513565,
+      "learning_rate": 0.0001729896907216495,
+      "loss": 0.7789,
+      "step": 806
+    },
+    {
+      "epoch": 0.25824,
+      "grad_norm": 0.15065547823905945,
+      "learning_rate": 0.00017294845360824742,
+      "loss": 0.7411,
+      "step": 807
+    },
+    {
+      "epoch": 0.25856,
+      "grad_norm": 0.13833020627498627,
+      "learning_rate": 0.00017290721649484536,
+      "loss": 0.7369,
+      "step": 808
+    },
+    {
+      "epoch": 0.25888,
+      "grad_norm": 0.12196467071771622,
+      "learning_rate": 0.00017286597938144331,
+      "loss": 0.9695,
+      "step": 809
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.117258720099926,
+      "learning_rate": 0.00017282474226804125,
+      "loss": 0.6831,
+      "step": 810
+    },
+    {
+      "epoch": 0.25952,
+      "grad_norm": 0.13261094689369202,
+      "learning_rate": 0.0001727835051546392,
+      "loss": 0.6081,
+      "step": 811
+    },
+    {
+      "epoch": 0.25984,
+      "grad_norm": 0.12902230024337769,
+      "learning_rate": 0.0001727422680412371,
+      "loss": 0.6816,
+      "step": 812
+    },
+    {
+      "epoch": 0.26016,
+      "grad_norm": 0.12592282891273499,
+      "learning_rate": 0.00017270103092783504,
+      "loss": 0.7926,
+      "step": 813
+    },
+    {
+      "epoch": 0.26048,
+      "grad_norm": 0.11538536846637726,
+      "learning_rate": 0.000172659793814433,
+      "loss": 0.6386,
+      "step": 814
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.14629842340946198,
+      "learning_rate": 0.00017261855670103093,
+      "loss": 0.6989,
+      "step": 815
+    },
+    {
+      "epoch": 0.26112,
+      "grad_norm": 0.12829239666461945,
+      "learning_rate": 0.0001725773195876289,
+      "loss": 0.5876,
+      "step": 816
+    },
+    {
+      "epoch": 0.26144,
+      "grad_norm": 0.13584017753601074,
+      "learning_rate": 0.00017253608247422682,
+      "loss": 0.8126,
+      "step": 817
+    },
+    {
+      "epoch": 0.26176,
+      "grad_norm": 0.13379888236522675,
+      "learning_rate": 0.00017249484536082475,
+      "loss": 0.661,
+      "step": 818
+    },
+    {
+      "epoch": 0.26208,
+      "grad_norm": 0.14031822979450226,
+      "learning_rate": 0.00017245360824742268,
+      "loss": 0.8392,
+      "step": 819
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.12089665979146957,
+      "learning_rate": 0.0001724123711340206,
+      "loss": 0.7065,
+      "step": 820
+    },
+    {
+      "epoch": 0.26272,
+      "grad_norm": 0.11851520836353302,
+      "learning_rate": 0.00017237113402061857,
+      "loss": 0.5666,
+      "step": 821
+    },
+    {
+      "epoch": 0.26304,
+      "grad_norm": 0.11357667297124863,
+      "learning_rate": 0.0001723298969072165,
+      "loss": 0.9064,
+      "step": 822
+    },
+    {
+      "epoch": 0.26336,
+      "grad_norm": 0.12621043622493744,
+      "learning_rate": 0.00017228865979381443,
+      "loss": 0.6377,
+      "step": 823
+    },
+    {
+      "epoch": 0.26368,
+      "grad_norm": 0.12150184065103531,
+      "learning_rate": 0.0001722474226804124,
+      "loss": 0.5876,
+      "step": 824
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.14796634018421173,
+      "learning_rate": 0.00017220618556701032,
+      "loss": 0.7244,
+      "step": 825
+    },
+    {
+      "epoch": 0.26432,
+      "grad_norm": 0.1227513924241066,
+      "learning_rate": 0.00017216494845360825,
+      "loss": 0.5743,
+      "step": 826
+    },
+    {
+      "epoch": 0.26464,
+      "grad_norm": 0.13068315386772156,
+      "learning_rate": 0.00017212371134020619,
+      "loss": 0.6706,
+      "step": 827
+    },
+    {
+      "epoch": 0.26496,
+      "grad_norm": 0.136739581823349,
+      "learning_rate": 0.00017208247422680412,
+      "loss": 0.6284,
+      "step": 828
+    },
+    {
+      "epoch": 0.26528,
+      "grad_norm": 0.11144915968179703,
+      "learning_rate": 0.00017204123711340208,
+      "loss": 0.8231,
+      "step": 829
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.11677107214927673,
+      "learning_rate": 0.000172,
+      "loss": 0.7092,
+      "step": 830
+    },
+    {
+      "epoch": 0.26592,
+      "grad_norm": 0.10681087523698807,
+      "learning_rate": 0.00017195876288659796,
+      "loss": 0.6751,
+      "step": 831
+    },
+    {
+      "epoch": 0.26624,
+      "grad_norm": 0.1254301369190216,
+      "learning_rate": 0.0001719175257731959,
+      "loss": 0.8069,
+      "step": 832
+    },
+    {
+      "epoch": 0.26656,
+      "grad_norm": 0.12878485023975372,
+      "learning_rate": 0.00017187628865979383,
+      "loss": 0.7752,
+      "step": 833
+    },
+    {
+      "epoch": 0.26688,
+      "grad_norm": 0.1316794753074646,
+      "learning_rate": 0.00017183505154639176,
+      "loss": 0.6108,
+      "step": 834
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1294477880001068,
+      "learning_rate": 0.0001717938144329897,
+      "loss": 0.8802,
+      "step": 835
+    },
+    {
+      "epoch": 0.26752,
+      "grad_norm": 0.1193527951836586,
+      "learning_rate": 0.00017175257731958762,
+      "loss": 0.7253,
+      "step": 836
+    },
+    {
+      "epoch": 0.26784,
+      "grad_norm": 0.13821986317634583,
+      "learning_rate": 0.00017171134020618558,
+      "loss": 0.8121,
+      "step": 837
+    },
+    {
+      "epoch": 0.26816,
+      "grad_norm": 0.12366246432065964,
+      "learning_rate": 0.0001716701030927835,
+      "loss": 0.7343,
+      "step": 838
+    },
+    {
+      "epoch": 0.26848,
+      "grad_norm": 0.12942105531692505,
+      "learning_rate": 0.00017162886597938147,
+      "loss": 0.7198,
+      "step": 839
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.1249607726931572,
+      "learning_rate": 0.0001715876288659794,
+      "loss": 0.7917,
+      "step": 840
+    },
+    {
+      "epoch": 0.26912,
+      "grad_norm": 0.13555577397346497,
+      "learning_rate": 0.0001715463917525773,
+      "loss": 0.7047,
+      "step": 841
+    },
+    {
+      "epoch": 0.26944,
+      "grad_norm": 0.10971818119287491,
+      "learning_rate": 0.00017150515463917526,
+      "loss": 0.68,
+      "step": 842
+    },
+    {
+      "epoch": 0.26976,
+      "grad_norm": 0.1281420886516571,
+      "learning_rate": 0.0001714639175257732,
+      "loss": 0.765,
+      "step": 843
+    },
+    {
+      "epoch": 0.27008,
+      "grad_norm": 0.13114331662654877,
+      "learning_rate": 0.00017142268041237115,
+      "loss": 0.7864,
+      "step": 844
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.13292206823825836,
+      "learning_rate": 0.00017138144329896908,
+      "loss": 0.8031,
+      "step": 845
+    },
+    {
+      "epoch": 0.27072,
+      "grad_norm": 0.11316345632076263,
+      "learning_rate": 0.00017134020618556702,
+      "loss": 0.7657,
+      "step": 846
+    },
+    {
+      "epoch": 0.27104,
+      "grad_norm": 0.12936796247959137,
+      "learning_rate": 0.00017129896907216497,
+      "loss": 0.6948,
+      "step": 847
+    },
+    {
+      "epoch": 0.27136,
+      "grad_norm": 0.14884187281131744,
+      "learning_rate": 0.00017125773195876288,
+      "loss": 0.5843,
+      "step": 848
+    },
+    {
+      "epoch": 0.27168,
+      "grad_norm": 0.1351289451122284,
+      "learning_rate": 0.00017121649484536084,
+      "loss": 0.6618,
+      "step": 849
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.12176259607076645,
+      "learning_rate": 0.00017117525773195877,
+      "loss": 0.5761,
+      "step": 850
+    },
+    {
+      "epoch": 0.27232,
+      "grad_norm": 0.15906400978565216,
+      "learning_rate": 0.0001711340206185567,
+      "loss": 0.8627,
+      "step": 851
+    },
+    {
+      "epoch": 0.27264,
+      "grad_norm": 0.1319553554058075,
+      "learning_rate": 0.00017109278350515466,
+      "loss": 0.6524,
+      "step": 852
+    },
+    {
+      "epoch": 0.27296,
+      "grad_norm": 0.13609087467193604,
+      "learning_rate": 0.0001710515463917526,
+      "loss": 0.666,
+      "step": 853
+    },
+    {
+      "epoch": 0.27328,
+      "grad_norm": 0.13189756870269775,
+      "learning_rate": 0.00017101030927835055,
+      "loss": 0.7922,
+      "step": 854
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.13593921065330505,
+      "learning_rate": 0.00017096907216494845,
+      "loss": 0.7556,
+      "step": 855
+    },
+    {
+      "epoch": 0.27392,
+      "grad_norm": 0.13250242173671722,
+      "learning_rate": 0.00017092783505154638,
+      "loss": 0.732,
+      "step": 856
+    },
+    {
+      "epoch": 0.27424,
+      "grad_norm": 0.12137633562088013,
+      "learning_rate": 0.00017088659793814434,
+      "loss": 0.7496,
+      "step": 857
+    },
+    {
+      "epoch": 0.27456,
+      "grad_norm": 0.1237613707780838,
+      "learning_rate": 0.00017084536082474227,
+      "loss": 0.5882,
+      "step": 858
+    },
+    {
+      "epoch": 0.27488,
+      "grad_norm": 0.13498494029045105,
+      "learning_rate": 0.00017080412371134023,
+      "loss": 0.7405,
+      "step": 859
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.11016131192445755,
+      "learning_rate": 0.00017076288659793816,
+      "loss": 0.9321,
+      "step": 860
+    },
+    {
+      "epoch": 0.27552,
+      "grad_norm": 0.14061477780342102,
+      "learning_rate": 0.0001707216494845361,
+      "loss": 0.7053,
+      "step": 861
+    },
+    {
+      "epoch": 0.27584,
+      "grad_norm": 0.14012236893177032,
+      "learning_rate": 0.00017068041237113402,
+      "loss": 0.7159,
+      "step": 862
+    },
+    {
+      "epoch": 0.27616,
+      "grad_norm": 0.13183481991291046,
+      "learning_rate": 0.00017063917525773196,
+      "loss": 0.7325,
+      "step": 863
+    },
+    {
+      "epoch": 0.27648,
+      "grad_norm": 0.11476800590753555,
+      "learning_rate": 0.0001705979381443299,
+      "loss": 0.7055,
+      "step": 864
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.10209871083498001,
+      "learning_rate": 0.00017055670103092785,
+      "loss": 0.7001,
+      "step": 865
+    },
+    {
+      "epoch": 0.27712,
+      "grad_norm": 0.11558199673891068,
+      "learning_rate": 0.00017051546391752578,
+      "loss": 0.6917,
+      "step": 866
+    },
+    {
+      "epoch": 0.27744,
+      "grad_norm": 0.12456093728542328,
+      "learning_rate": 0.00017047422680412373,
+      "loss": 0.7678,
+      "step": 867
+    },
+    {
+      "epoch": 0.27776,
+      "grad_norm": 0.1426069140434265,
+      "learning_rate": 0.00017043298969072167,
+      "loss": 0.6215,
+      "step": 868
+    },
+    {
+      "epoch": 0.27808,
+      "grad_norm": 0.12650014460086823,
+      "learning_rate": 0.0001703917525773196,
+      "loss": 0.7265,
+      "step": 869
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.12991954386234283,
+      "learning_rate": 0.00017035051546391753,
+      "loss": 0.7828,
+      "step": 870
+    },
+    {
+      "epoch": 0.27872,
+      "grad_norm": 0.1391548216342926,
+      "learning_rate": 0.00017030927835051546,
+      "loss": 0.6953,
+      "step": 871
+    },
+    {
+      "epoch": 0.27904,
+      "grad_norm": 0.12694767117500305,
+      "learning_rate": 0.00017026804123711342,
+      "loss": 0.6607,
+      "step": 872
+    },
+    {
+      "epoch": 0.27936,
+      "grad_norm": 0.10944822430610657,
+      "learning_rate": 0.00017022680412371135,
+      "loss": 0.6494,
+      "step": 873
+    },
+    {
+      "epoch": 0.27968,
+      "grad_norm": 0.12811000645160675,
+      "learning_rate": 0.00017018556701030928,
+      "loss": 0.5901,
+      "step": 874
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.12633275985717773,
+      "learning_rate": 0.00017014432989690724,
+      "loss": 0.7171,
+      "step": 875
+    },
+    {
+      "epoch": 0.28032,
+      "grad_norm": 0.13156813383102417,
+      "learning_rate": 0.00017010309278350517,
+      "loss": 0.7358,
+      "step": 876
+    },
+    {
+      "epoch": 0.28064,
+      "grad_norm": 0.11998021602630615,
+      "learning_rate": 0.0001700618556701031,
+      "loss": 0.7273,
+      "step": 877
+    },
+    {
+      "epoch": 0.28096,
+      "grad_norm": 0.11613285541534424,
+      "learning_rate": 0.00017002061855670103,
+      "loss": 0.7788,
+      "step": 878
+    },
+    {
+      "epoch": 0.28128,
+      "grad_norm": 0.14196628332138062,
+      "learning_rate": 0.00016997938144329896,
+      "loss": 0.8546,
+      "step": 879
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.11939578503370285,
+      "learning_rate": 0.00016993814432989692,
+      "loss": 0.7764,
+      "step": 880
+    },
+    {
+      "epoch": 0.28192,
+      "grad_norm": 0.13980519771575928,
+      "learning_rate": 0.00016989690721649485,
+      "loss": 0.6063,
+      "step": 881
+    },
+    {
+      "epoch": 0.28224,
+      "grad_norm": 0.12722016870975494,
+      "learning_rate": 0.0001698556701030928,
+      "loss": 0.6469,
+      "step": 882
+    },
+    {
+      "epoch": 0.28256,
+      "grad_norm": 0.12645535171031952,
+      "learning_rate": 0.00016981443298969074,
+      "loss": 0.9321,
+      "step": 883
+    },
+    {
+      "epoch": 0.28288,
+      "grad_norm": 0.13195961713790894,
+      "learning_rate": 0.00016977319587628865,
+      "loss": 0.7791,
+      "step": 884
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.1446891874074936,
+      "learning_rate": 0.0001697319587628866,
+      "loss": 0.651,
+      "step": 885
+    },
+    {
+      "epoch": 0.28352,
+      "grad_norm": 0.1256592720746994,
+      "learning_rate": 0.00016969072164948454,
+      "loss": 0.7208,
+      "step": 886
+    },
+    {
+      "epoch": 0.28384,
+      "grad_norm": 0.11931982636451721,
+      "learning_rate": 0.0001696494845360825,
+      "loss": 0.6168,
+      "step": 887
+    },
+    {
+      "epoch": 0.28416,
+      "grad_norm": 0.11525401473045349,
+      "learning_rate": 0.00016960824742268043,
+      "loss": 0.6315,
+      "step": 888
+    },
+    {
+      "epoch": 0.28448,
+      "grad_norm": 0.12549898028373718,
+      "learning_rate": 0.00016956701030927836,
+      "loss": 0.636,
+      "step": 889
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1158369705080986,
+      "learning_rate": 0.0001695257731958763,
+      "loss": 0.7766,
+      "step": 890
+    },
+    {
+      "epoch": 0.28512,
+      "grad_norm": 0.12144404649734497,
+      "learning_rate": 0.00016948453608247422,
+      "loss": 0.6679,
+      "step": 891
+    },
+    {
+      "epoch": 0.28544,
+      "grad_norm": 0.13141587376594543,
+      "learning_rate": 0.00016944329896907218,
+      "loss": 0.7929,
+      "step": 892
+    },
+    {
+      "epoch": 0.28576,
+      "grad_norm": 0.11869249492883682,
+      "learning_rate": 0.0001694020618556701,
+      "loss": 0.7054,
+      "step": 893
+    },
+    {
+      "epoch": 0.28608,
+      "grad_norm": 0.1343916803598404,
+      "learning_rate": 0.00016936082474226804,
+      "loss": 0.7307,
+      "step": 894
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.12864436209201813,
+      "learning_rate": 0.000169319587628866,
+      "loss": 0.7858,
+      "step": 895
+    },
+    {
+      "epoch": 0.28672,
+      "grad_norm": 0.11874640733003616,
+      "learning_rate": 0.00016927835051546393,
+      "loss": 0.7739,
+      "step": 896
+    },
+    {
+      "epoch": 0.28704,
+      "grad_norm": 0.1262228637933731,
+      "learning_rate": 0.00016923711340206186,
+      "loss": 0.8163,
+      "step": 897
+    },
+    {
+      "epoch": 0.28736,
+      "grad_norm": 0.1348988115787506,
+      "learning_rate": 0.0001691958762886598,
+      "loss": 0.8096,
+      "step": 898
+    },
+    {
+      "epoch": 0.28768,
+      "grad_norm": 0.11836285144090652,
+      "learning_rate": 0.00016915463917525773,
+      "loss": 0.8158,
+      "step": 899
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.12167097628116608,
+      "learning_rate": 0.00016911340206185568,
+      "loss": 0.7138,
+      "step": 900
+    },
+    {
+      "epoch": 0.28832,
+      "grad_norm": 0.13505558669567108,
+      "learning_rate": 0.00016907216494845361,
+      "loss": 0.9069,
+      "step": 901
+    },
+    {
+      "epoch": 0.28864,
+      "grad_norm": 0.13912267982959747,
+      "learning_rate": 0.00016903092783505155,
+      "loss": 0.7116,
+      "step": 902
+    },
+    {
+      "epoch": 0.28896,
+      "grad_norm": 0.15731939673423767,
+      "learning_rate": 0.0001689896907216495,
+      "loss": 0.6876,
+      "step": 903
+    },
+    {
+      "epoch": 0.28928,
+      "grad_norm": 0.12537138164043427,
+      "learning_rate": 0.00016894845360824744,
+      "loss": 0.6171,
+      "step": 904
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.13225343823432922,
+      "learning_rate": 0.00016890721649484537,
+      "loss": 0.6375,
+      "step": 905
+    },
+    {
+      "epoch": 0.28992,
+      "grad_norm": 0.12211292237043381,
+      "learning_rate": 0.0001688659793814433,
+      "loss": 0.732,
+      "step": 906
+    },
+    {
+      "epoch": 0.29024,
+      "grad_norm": 0.12123337388038635,
+      "learning_rate": 0.00016882474226804123,
+      "loss": 0.8403,
+      "step": 907
+    },
+    {
+      "epoch": 0.29056,
+      "grad_norm": 0.11805325746536255,
+      "learning_rate": 0.0001687835051546392,
+      "loss": 0.7806,
+      "step": 908
+    },
+    {
+      "epoch": 0.29088,
+      "grad_norm": 0.12412726879119873,
+      "learning_rate": 0.00016874226804123712,
+      "loss": 0.5854,
+      "step": 909
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.12389811873435974,
+      "learning_rate": 0.00016870103092783508,
+      "loss": 0.6711,
+      "step": 910
+    },
+    {
+      "epoch": 0.29152,
+      "grad_norm": 0.12261383980512619,
+      "learning_rate": 0.000168659793814433,
+      "loss": 0.7261,
+      "step": 911
+    },
+    {
+      "epoch": 0.29184,
+      "grad_norm": 0.14904655516147614,
+      "learning_rate": 0.00016861855670103094,
+      "loss": 0.6976,
+      "step": 912
+    },
+    {
+      "epoch": 0.29216,
+      "grad_norm": 0.11271169781684875,
+      "learning_rate": 0.00016857731958762887,
+      "loss": 0.7185,
+      "step": 913
+    },
+    {
+      "epoch": 0.29248,
+      "grad_norm": 0.11681025475263596,
+      "learning_rate": 0.0001685360824742268,
+      "loss": 0.7155,
+      "step": 914
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.12385556101799011,
+      "learning_rate": 0.00016849484536082476,
+      "loss": 0.7431,
+      "step": 915
+    },
+    {
+      "epoch": 0.29312,
+      "grad_norm": 0.13570012152194977,
+      "learning_rate": 0.0001684536082474227,
+      "loss": 0.8334,
+      "step": 916
+    },
+    {
+      "epoch": 0.29344,
+      "grad_norm": 0.14303314685821533,
+      "learning_rate": 0.00016841237113402062,
+      "loss": 0.7757,
+      "step": 917
+    },
+    {
+      "epoch": 0.29376,
+      "grad_norm": 0.13650397956371307,
+      "learning_rate": 0.00016837113402061858,
+      "loss": 0.6664,
+      "step": 918
+    },
+    {
+      "epoch": 0.29408,
+      "grad_norm": 0.14622308313846588,
+      "learning_rate": 0.0001683298969072165,
+      "loss": 0.6503,
+      "step": 919
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.1545690894126892,
+      "learning_rate": 0.00016828865979381444,
+      "loss": 0.6802,
+      "step": 920
+    },
+    {
+      "epoch": 0.29472,
+      "grad_norm": 0.12770286202430725,
+      "learning_rate": 0.00016824742268041238,
+      "loss": 0.5676,
+      "step": 921
+    },
+    {
+      "epoch": 0.29504,
+      "grad_norm": 0.1302863508462906,
+      "learning_rate": 0.0001682061855670103,
+      "loss": 0.7108,
+      "step": 922
+    },
+    {
+      "epoch": 0.29536,
+      "grad_norm": 0.12539339065551758,
+      "learning_rate": 0.00016816494845360827,
+      "loss": 0.7109,
+      "step": 923
+    },
+    {
+      "epoch": 0.29568,
+      "grad_norm": 0.13586951792240143,
+      "learning_rate": 0.0001681237113402062,
+      "loss": 0.7819,
+      "step": 924
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.13007107377052307,
+      "learning_rate": 0.00016808247422680413,
+      "loss": 0.7138,
+      "step": 925
+    },
+    {
+      "epoch": 0.29632,
+      "grad_norm": 0.11121190339326859,
+      "learning_rate": 0.00016804123711340206,
+      "loss": 0.6099,
+      "step": 926
+    },
+    {
+      "epoch": 0.29664,
+      "grad_norm": 0.1328391581773758,
+      "learning_rate": 0.000168,
+      "loss": 0.8115,
+      "step": 927
+    },
+    {
+      "epoch": 0.29696,
+      "grad_norm": 0.12316712737083435,
+      "learning_rate": 0.00016795876288659795,
+      "loss": 0.6743,
+      "step": 928
+    },
+    {
+      "epoch": 0.29728,
+      "grad_norm": 0.13188757002353668,
+      "learning_rate": 0.00016791752577319588,
+      "loss": 0.7733,
+      "step": 929
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.1288207471370697,
+      "learning_rate": 0.0001678762886597938,
+      "loss": 0.6478,
+      "step": 930
+    },
+    {
+      "epoch": 0.29792,
+      "grad_norm": 0.13399000465869904,
+      "learning_rate": 0.00016783505154639177,
+      "loss": 0.7069,
+      "step": 931
+    },
+    {
+      "epoch": 0.29824,
+      "grad_norm": 0.14570803940296173,
+      "learning_rate": 0.0001677938144329897,
+      "loss": 0.7701,
+      "step": 932
+    },
+    {
+      "epoch": 0.29856,
+      "grad_norm": 0.12334295362234116,
+      "learning_rate": 0.00016775257731958763,
+      "loss": 0.7331,
+      "step": 933
+    },
+    {
+      "epoch": 0.29888,
+      "grad_norm": 0.12136422842741013,
+      "learning_rate": 0.00016771134020618556,
+      "loss": 0.6297,
+      "step": 934
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.12953773140907288,
+      "learning_rate": 0.0001676701030927835,
+      "loss": 0.6675,
+      "step": 935
+    },
+    {
+      "epoch": 0.29952,
+      "grad_norm": 0.13175757229328156,
+      "learning_rate": 0.00016762886597938145,
+      "loss": 0.8289,
+      "step": 936
+    },
+    {
+      "epoch": 0.29984,
+      "grad_norm": 0.12782573699951172,
+      "learning_rate": 0.00016758762886597938,
+      "loss": 0.7401,
+      "step": 937
+    },
+    {
+      "epoch": 0.30016,
+      "grad_norm": 0.13902167975902557,
+      "learning_rate": 0.00016754639175257734,
+      "loss": 0.6949,
+      "step": 938
+    },
+    {
+      "epoch": 0.30048,
+      "grad_norm": 0.1445343792438507,
+      "learning_rate": 0.00016750515463917527,
+      "loss": 0.6779,
+      "step": 939
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.12114536762237549,
+      "learning_rate": 0.0001674639175257732,
+      "loss": 0.6658,
+      "step": 940
+    },
+    {
+      "epoch": 0.30112,
+      "grad_norm": 0.13642756640911102,
+      "learning_rate": 0.00016742268041237114,
+      "loss": 0.6336,
+      "step": 941
+    },
+    {
+      "epoch": 0.30144,
+      "grad_norm": 0.12124267965555191,
+      "learning_rate": 0.00016738144329896907,
+      "loss": 0.6917,
+      "step": 942
+    },
+    {
+      "epoch": 0.30176,
+      "grad_norm": 0.1574261337518692,
+      "learning_rate": 0.00016734020618556703,
+      "loss": 0.6723,
+      "step": 943
+    },
+    {
+      "epoch": 0.30208,
+      "grad_norm": 0.1327444165945053,
+      "learning_rate": 0.00016729896907216496,
+      "loss": 0.7765,
+      "step": 944
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.12544023990631104,
+      "learning_rate": 0.0001672577319587629,
+      "loss": 0.7213,
+      "step": 945
+    },
+    {
+      "epoch": 0.30272,
+      "grad_norm": 0.12831197679042816,
+      "learning_rate": 0.00016721649484536085,
+      "loss": 0.624,
+      "step": 946
+    },
+    {
+      "epoch": 0.30304,
+      "grad_norm": 0.13576100766658783,
+      "learning_rate": 0.00016717525773195878,
+      "loss": 0.6694,
+      "step": 947
+    },
+    {
+      "epoch": 0.30336,
+      "grad_norm": 0.14174100756645203,
+      "learning_rate": 0.0001671340206185567,
+      "loss": 0.7847,
+      "step": 948
+    },
+    {
+      "epoch": 0.30368,
+      "grad_norm": 0.14193561673164368,
+      "learning_rate": 0.00016709278350515464,
+      "loss": 0.638,
+      "step": 949
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.12588457763195038,
+      "learning_rate": 0.00016705154639175257,
+      "loss": 0.7496,
+      "step": 950
+    },
+    {
+      "epoch": 0.30432,
+      "grad_norm": 0.14540141820907593,
+      "learning_rate": 0.00016701030927835053,
+      "loss": 0.7611,
+      "step": 951
+    },
+    {
+      "epoch": 0.30464,
+      "grad_norm": 0.1433524489402771,
+      "learning_rate": 0.00016696907216494846,
+      "loss": 0.7602,
+      "step": 952
+    },
+    {
+      "epoch": 0.30496,
+      "grad_norm": 0.12800182402133942,
+      "learning_rate": 0.00016692783505154642,
+      "loss": 0.7136,
+      "step": 953
+    },
+    {
+      "epoch": 0.30528,
+      "grad_norm": 0.14627426862716675,
+      "learning_rate": 0.00016688659793814435,
+      "loss": 0.7954,
+      "step": 954
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.11950758844614029,
+      "learning_rate": 0.00016684536082474226,
+      "loss": 0.6012,
+      "step": 955
+    },
+    {
+      "epoch": 0.30592,
+      "grad_norm": 0.14980708062648773,
+      "learning_rate": 0.00016680412371134021,
+      "loss": 0.6755,
+      "step": 956
+    },
+    {
+      "epoch": 0.30624,
+      "grad_norm": 0.1280095875263214,
+      "learning_rate": 0.00016676288659793815,
+      "loss": 0.691,
+      "step": 957
+    },
+    {
+      "epoch": 0.30656,
+      "grad_norm": 0.12384190410375595,
+      "learning_rate": 0.00016672164948453608,
+      "loss": 0.7097,
+      "step": 958
+    },
+    {
+      "epoch": 0.30688,
+      "grad_norm": 0.11612387746572495,
+      "learning_rate": 0.00016668041237113404,
+      "loss": 0.6998,
+      "step": 959
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.11533758789300919,
+      "learning_rate": 0.00016663917525773197,
+      "loss": 0.5825,
+      "step": 960
+    },
+    {
+      "epoch": 0.30752,
+      "grad_norm": 0.13979220390319824,
+      "learning_rate": 0.00016659793814432993,
+      "loss": 0.887,
+      "step": 961
+    },
+    {
+      "epoch": 0.30784,
+      "grad_norm": 0.12515367567539215,
+      "learning_rate": 0.00016655670103092783,
+      "loss": 0.7497,
+      "step": 962
+    },
+    {
+      "epoch": 0.30816,
+      "grad_norm": 0.11801932007074356,
+      "learning_rate": 0.00016651546391752576,
+      "loss": 0.6204,
+      "step": 963
+    },
+    {
+      "epoch": 0.30848,
+      "grad_norm": 0.1284782588481903,
+      "learning_rate": 0.00016647422680412372,
+      "loss": 0.8,
+      "step": 964
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.11822401732206345,
+      "learning_rate": 0.00016643298969072165,
+      "loss": 0.6733,
+      "step": 965
+    },
+    {
+      "epoch": 0.30912,
+      "grad_norm": 0.1323615163564682,
+      "learning_rate": 0.0001663917525773196,
+      "loss": 0.6603,
+      "step": 966
+    },
+    {
+      "epoch": 0.30944,
+      "grad_norm": 0.1403515785932541,
+      "learning_rate": 0.00016635051546391754,
+      "loss": 0.7228,
+      "step": 967
+    },
+    {
+      "epoch": 0.30976,
+      "grad_norm": 0.13995344936847687,
+      "learning_rate": 0.00016630927835051547,
+      "loss": 0.7326,
+      "step": 968
+    },
+    {
+      "epoch": 0.31008,
+      "grad_norm": 0.12779420614242554,
+      "learning_rate": 0.0001662680412371134,
+      "loss": 0.7643,
+      "step": 969
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.13067853450775146,
+      "learning_rate": 0.00016622680412371133,
+      "loss": 0.584,
+      "step": 970
+    },
+    {
+      "epoch": 0.31072,
+      "grad_norm": 0.1376170516014099,
+      "learning_rate": 0.0001661855670103093,
+      "loss": 0.7033,
+      "step": 971
+    },
+    {
+      "epoch": 0.31104,
+      "grad_norm": 0.13981421291828156,
+      "learning_rate": 0.00016614432989690722,
+      "loss": 0.6585,
+      "step": 972
+    },
+    {
+      "epoch": 0.31136,
+      "grad_norm": 0.13018488883972168,
+      "learning_rate": 0.00016610309278350515,
+      "loss": 0.7505,
+      "step": 973
+    },
+    {
+      "epoch": 0.31168,
+      "grad_norm": 0.13596870005130768,
+      "learning_rate": 0.0001660618556701031,
+      "loss": 0.6731,
+      "step": 974
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.12909188866615295,
+      "learning_rate": 0.00016602061855670104,
+      "loss": 0.7431,
+      "step": 975
+    },
+    {
+      "epoch": 0.31232,
+      "grad_norm": 0.12600624561309814,
+      "learning_rate": 0.00016597938144329898,
+      "loss": 0.8064,
+      "step": 976
+    },
+    {
+      "epoch": 0.31264,
+      "grad_norm": 0.12609797716140747,
+      "learning_rate": 0.0001659381443298969,
+      "loss": 0.856,
+      "step": 977
+    },
+    {
+      "epoch": 0.31296,
+      "grad_norm": 0.1194164827466011,
+      "learning_rate": 0.00016589690721649484,
+      "loss": 0.7126,
+      "step": 978
+    },
+    {
+      "epoch": 0.31328,
+      "grad_norm": 0.12827666103839874,
+      "learning_rate": 0.0001658556701030928,
+      "loss": 0.6233,
+      "step": 979
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.12019889056682587,
+      "learning_rate": 0.00016581443298969073,
+      "loss": 0.5571,
+      "step": 980
+    },
+    {
+      "epoch": 0.31392,
+      "grad_norm": 0.12774929404258728,
+      "learning_rate": 0.00016577319587628869,
+      "loss": 0.6896,
+      "step": 981
+    },
+    {
+      "epoch": 0.31424,
+      "grad_norm": 0.13669483363628387,
+      "learning_rate": 0.00016573195876288662,
+      "loss": 0.6872,
+      "step": 982
+    },
+    {
+      "epoch": 0.31456,
+      "grad_norm": 0.12400145828723907,
+      "learning_rate": 0.00016569072164948455,
+      "loss": 0.8399,
+      "step": 983
+    },
+    {
+      "epoch": 0.31488,
+      "grad_norm": 0.14067703485488892,
+      "learning_rate": 0.00016564948453608248,
+      "loss": 0.7675,
+      "step": 984
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.13461582362651825,
+      "learning_rate": 0.0001656082474226804,
+      "loss": 0.7943,
+      "step": 985
+    },
+    {
+      "epoch": 0.31552,
+      "grad_norm": 0.13829675316810608,
+      "learning_rate": 0.00016556701030927834,
+      "loss": 0.8666,
+      "step": 986
+    },
+    {
+      "epoch": 0.31584,
+      "grad_norm": 0.13706457614898682,
+      "learning_rate": 0.0001655257731958763,
+      "loss": 0.757,
+      "step": 987
+    },
+    {
+      "epoch": 0.31616,
+      "grad_norm": 0.1262834221124649,
+      "learning_rate": 0.00016548453608247423,
+      "loss": 0.6941,
+      "step": 988
+    },
+    {
+      "epoch": 0.31648,
+      "grad_norm": 0.15362948179244995,
+      "learning_rate": 0.0001654432989690722,
+      "loss": 0.8656,
+      "step": 989
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.11257731169462204,
+      "learning_rate": 0.00016540206185567012,
+      "loss": 0.6426,
+      "step": 990
+    },
+    {
+      "epoch": 0.31712,
+      "grad_norm": 0.14325182139873505,
+      "learning_rate": 0.00016536082474226803,
+      "loss": 0.7389,
+      "step": 991
+    },
+    {
+      "epoch": 0.31744,
+      "grad_norm": 0.12336672842502594,
+      "learning_rate": 0.00016531958762886598,
+      "loss": 0.7493,
+      "step": 992
+    },
+    {
+      "epoch": 0.31776,
+      "grad_norm": 0.14396019279956818,
+      "learning_rate": 0.00016527835051546392,
+      "loss": 0.7598,
+      "step": 993
+    },
+    {
+      "epoch": 0.31808,
+      "grad_norm": 0.1333467960357666,
+      "learning_rate": 0.00016523711340206187,
+      "loss": 0.7863,
+      "step": 994
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.1304144263267517,
+      "learning_rate": 0.0001651958762886598,
+      "loss": 0.7304,
+      "step": 995
+    },
+    {
+      "epoch": 0.31872,
+      "grad_norm": 0.12357225269079208,
+      "learning_rate": 0.00016515463917525774,
+      "loss": 0.6275,
+      "step": 996
+    },
+    {
+      "epoch": 0.31904,
+      "grad_norm": 0.11936910450458527,
+      "learning_rate": 0.0001651134020618557,
+      "loss": 0.7266,
+      "step": 997
+    },
+    {
+      "epoch": 0.31936,
+      "grad_norm": 0.12361844629049301,
+      "learning_rate": 0.0001650721649484536,
+      "loss": 0.7156,
+      "step": 998
+    },
+    {
+      "epoch": 0.31968,
+      "grad_norm": 0.12392809987068176,
+      "learning_rate": 0.00016503092783505156,
+      "loss": 0.7804,
+      "step": 999
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.13606688380241394,
+      "learning_rate": 0.0001649896907216495,
+      "loss": 0.9582,
+      "step": 1000
+    },
+    {
+      "epoch": 0.32032,
+      "grad_norm": 0.1291663497686386,
+      "learning_rate": 0.00016494845360824742,
+      "loss": 0.6795,
+      "step": 1001
+    },
+    {
+      "epoch": 0.32064,
+      "grad_norm": 0.13042348623275757,
+      "learning_rate": 0.00016490721649484538,
+      "loss": 0.729,
+      "step": 1002
+    },
+    {
+      "epoch": 0.32096,
+      "grad_norm": 0.12246596813201904,
+      "learning_rate": 0.0001648659793814433,
+      "loss": 0.7758,
+      "step": 1003
+    },
+    {
+      "epoch": 0.32128,
+      "grad_norm": 0.12324332445859909,
+      "learning_rate": 0.00016482474226804127,
+      "loss": 0.6839,
+      "step": 1004
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.13313184678554535,
+      "learning_rate": 0.00016478350515463917,
+      "loss": 0.7324,
+      "step": 1005
+    },
+    {
+      "epoch": 0.32192,
+      "grad_norm": 0.1304756999015808,
+      "learning_rate": 0.0001647422680412371,
+      "loss": 0.6432,
+      "step": 1006
+    },
+    {
+      "epoch": 0.32224,
+      "grad_norm": 0.12304502725601196,
+      "learning_rate": 0.00016470103092783506,
+      "loss": 0.722,
+      "step": 1007
+    },
+    {
+      "epoch": 0.32256,
+      "grad_norm": 0.13414014875888824,
+      "learning_rate": 0.000164659793814433,
+      "loss": 0.6226,
+      "step": 1008
+    },
+    {
+      "epoch": 0.32288,
+      "grad_norm": 0.1390489637851715,
+      "learning_rate": 0.00016461855670103095,
+      "loss": 0.7368,
+      "step": 1009
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.13376174867153168,
+      "learning_rate": 0.00016457731958762888,
+      "loss": 0.9653,
+      "step": 1010
+    },
+    {
+      "epoch": 0.32352,
+      "grad_norm": 0.13524703681468964,
+      "learning_rate": 0.00016453608247422681,
+      "loss": 0.7106,
+      "step": 1011
+    },
+    {
+      "epoch": 0.32384,
+      "grad_norm": 0.1215464323759079,
+      "learning_rate": 0.00016449484536082475,
+      "loss": 0.6942,
+      "step": 1012
+    },
+    {
+      "epoch": 0.32416,
+      "grad_norm": 0.13397525250911713,
+      "learning_rate": 0.00016445360824742268,
+      "loss": 0.684,
+      "step": 1013
+    },
+    {
+      "epoch": 0.32448,
+      "grad_norm": 0.1190156638622284,
+      "learning_rate": 0.00016441237113402063,
+      "loss": 0.7029,
+      "step": 1014
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.13238269090652466,
+      "learning_rate": 0.00016437113402061857,
+      "loss": 0.7614,
+      "step": 1015
+    },
+    {
+      "epoch": 0.32512,
+      "grad_norm": 0.1246226504445076,
+      "learning_rate": 0.0001643298969072165,
+      "loss": 0.7804,
+      "step": 1016
+    },
+    {
+      "epoch": 0.32544,
+      "grad_norm": 0.13241590559482574,
+      "learning_rate": 0.00016428865979381446,
+      "loss": 0.7454,
+      "step": 1017
+    },
+    {
+      "epoch": 0.32576,
+      "grad_norm": 0.12619127333164215,
+      "learning_rate": 0.0001642474226804124,
+      "loss": 0.6793,
+      "step": 1018
+    },
+    {
+      "epoch": 0.32608,
+      "grad_norm": 0.128580242395401,
+      "learning_rate": 0.00016420618556701032,
+      "loss": 0.7866,
+      "step": 1019
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.14162757992744446,
+      "learning_rate": 0.00016416494845360825,
+      "loss": 0.8144,
+      "step": 1020
+    },
+    {
+      "epoch": 0.32672,
+      "grad_norm": 0.15262079238891602,
+      "learning_rate": 0.00016412371134020618,
+      "loss": 0.7865,
+      "step": 1021
+    },
+    {
+      "epoch": 0.32704,
+      "grad_norm": 0.1325531303882599,
+      "learning_rate": 0.00016408247422680414,
+      "loss": 0.6776,
+      "step": 1022
+    },
+    {
+      "epoch": 0.32736,
+      "grad_norm": 0.12302196770906448,
+      "learning_rate": 0.00016404123711340207,
+      "loss": 0.8476,
+      "step": 1023
+    },
+    {
+      "epoch": 0.32768,
+      "grad_norm": 0.12806765735149384,
+      "learning_rate": 0.000164,
+      "loss": 0.8215,
+      "step": 1024
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.11527955532073975,
+      "learning_rate": 0.00016395876288659796,
+      "loss": 0.8466,
+      "step": 1025
+    },
+    {
+      "epoch": 0.32832,
+      "grad_norm": 0.11988075077533722,
+      "learning_rate": 0.0001639175257731959,
+      "loss": 0.7193,
+      "step": 1026
+    },
+    {
+      "epoch": 0.32864,
+      "grad_norm": 0.11877843737602234,
+      "learning_rate": 0.00016387628865979382,
+      "loss": 0.6518,
+      "step": 1027
+    },
+    {
+      "epoch": 0.32896,
+      "grad_norm": 0.11726564168930054,
+      "learning_rate": 0.00016383505154639175,
+      "loss": 0.6005,
+      "step": 1028
+    },
+    {
+      "epoch": 0.32928,
+      "grad_norm": 0.1332593411207199,
+      "learning_rate": 0.00016379381443298969,
+      "loss": 0.7817,
+      "step": 1029
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.13856275379657745,
+      "learning_rate": 0.00016375257731958764,
+      "loss": 0.8255,
+      "step": 1030
+    },
+    {
+      "epoch": 0.32992,
+      "grad_norm": 0.10795895755290985,
+      "learning_rate": 0.00016371134020618558,
+      "loss": 0.5919,
+      "step": 1031
+    },
+    {
+      "epoch": 0.33024,
+      "grad_norm": 0.12465250492095947,
+      "learning_rate": 0.00016367010309278353,
+      "loss": 0.6625,
+      "step": 1032
+    },
+    {
+      "epoch": 0.33056,
+      "grad_norm": 0.12707003951072693,
+      "learning_rate": 0.00016362886597938146,
+      "loss": 0.8995,
+      "step": 1033
+    },
+    {
+      "epoch": 0.33088,
+      "grad_norm": 0.137173593044281,
+      "learning_rate": 0.00016358762886597937,
+      "loss": 0.726,
+      "step": 1034
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.12696483731269836,
+      "learning_rate": 0.00016354639175257733,
+      "loss": 0.5944,
+      "step": 1035
+    },
+    {
+      "epoch": 0.33152,
+      "grad_norm": 0.1236334815621376,
+      "learning_rate": 0.00016350515463917526,
+      "loss": 0.6641,
+      "step": 1036
+    },
+    {
+      "epoch": 0.33184,
+      "grad_norm": 0.143971785902977,
+      "learning_rate": 0.00016346391752577322,
+      "loss": 0.7842,
+      "step": 1037
+    },
+    {
+      "epoch": 0.33216,
+      "grad_norm": 0.12057577818632126,
+      "learning_rate": 0.00016342268041237115,
+      "loss": 0.6724,
+      "step": 1038
+    },
+    {
+      "epoch": 0.33248,
+      "grad_norm": 0.15648114681243896,
+      "learning_rate": 0.00016338144329896908,
+      "loss": 0.9973,
+      "step": 1039
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.1502295732498169,
+      "learning_rate": 0.000163340206185567,
+      "loss": 0.6742,
+      "step": 1040
+    },
+    {
+      "epoch": 0.33312,
+      "grad_norm": 0.1341315358877182,
+      "learning_rate": 0.00016329896907216494,
+      "loss": 0.7678,
+      "step": 1041
+    },
+    {
+      "epoch": 0.33344,
+      "grad_norm": 0.10982177406549454,
+      "learning_rate": 0.0001632577319587629,
+      "loss": 0.7545,
+      "step": 1042
+    },
+    {
+      "epoch": 0.33376,
+      "grad_norm": 0.14167900383472443,
+      "learning_rate": 0.00016321649484536083,
+      "loss": 0.7139,
+      "step": 1043
+    },
+    {
+      "epoch": 0.33408,
+      "grad_norm": 0.11281700432300568,
+      "learning_rate": 0.00016317525773195876,
+      "loss": 0.7498,
+      "step": 1044
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.13683807849884033,
+      "learning_rate": 0.00016313402061855672,
+      "loss": 0.6963,
+      "step": 1045
+    },
+    {
+      "epoch": 0.33472,
+      "grad_norm": 0.1305110901594162,
+      "learning_rate": 0.00016309278350515465,
+      "loss": 0.6053,
+      "step": 1046
+    },
+    {
+      "epoch": 0.33504,
+      "grad_norm": 0.136047825217247,
+      "learning_rate": 0.00016305154639175258,
+      "loss": 0.7007,
+      "step": 1047
+    },
+    {
+      "epoch": 0.33536,
+      "grad_norm": 0.1397719383239746,
+      "learning_rate": 0.00016301030927835052,
+      "loss": 0.8724,
+      "step": 1048
+    },
+    {
+      "epoch": 0.33568,
+      "grad_norm": 0.13805080950260162,
+      "learning_rate": 0.00016296907216494845,
+      "loss": 0.7124,
+      "step": 1049
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.14922751486301422,
+      "learning_rate": 0.0001629278350515464,
+      "loss": 0.7945,
+      "step": 1050
+    },
+    {
+      "epoch": 0.33632,
+      "grad_norm": 0.1439272165298462,
+      "learning_rate": 0.00016288659793814434,
+      "loss": 0.7118,
+      "step": 1051
+    },
+    {
+      "epoch": 0.33664,
+      "grad_norm": 0.11644583195447922,
+      "learning_rate": 0.00016284536082474227,
+      "loss": 0.8287,
+      "step": 1052
+    },
+    {
+      "epoch": 0.33696,
+      "grad_norm": 0.13248111307621002,
+      "learning_rate": 0.00016280412371134023,
+      "loss": 0.7215,
+      "step": 1053
+    },
+    {
+      "epoch": 0.33728,
+      "grad_norm": 0.13736796379089355,
+      "learning_rate": 0.00016276288659793816,
+      "loss": 0.7327,
+      "step": 1054
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.13475042581558228,
+      "learning_rate": 0.0001627216494845361,
+      "loss": 0.8121,
+      "step": 1055
+    },
+    {
+      "epoch": 0.33792,
+      "grad_norm": 0.12516193091869354,
+      "learning_rate": 0.00016268041237113402,
+      "loss": 0.7947,
+      "step": 1056
+    },
+    {
+      "epoch": 0.33824,
+      "grad_norm": 0.1351221799850464,
+      "learning_rate": 0.00016263917525773195,
+      "loss": 0.5659,
+      "step": 1057
+    },
+    {
+      "epoch": 0.33856,
+      "grad_norm": 0.12663671374320984,
+      "learning_rate": 0.0001625979381443299,
+      "loss": 0.6308,
+      "step": 1058
+    },
+    {
+      "epoch": 0.33888,
+      "grad_norm": 0.13254615664482117,
+      "learning_rate": 0.00016255670103092784,
+      "loss": 0.6656,
+      "step": 1059
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.1433473527431488,
+      "learning_rate": 0.0001625154639175258,
+      "loss": 0.808,
+      "step": 1060
+    },
+    {
+      "epoch": 0.33952,
+      "grad_norm": 0.12944042682647705,
+      "learning_rate": 0.00016247422680412373,
+      "loss": 0.6709,
+      "step": 1061
+    },
+    {
+      "epoch": 0.33984,
+      "grad_norm": 0.13818645477294922,
+      "learning_rate": 0.00016243298969072166,
+      "loss": 0.7565,
+      "step": 1062
+    },
+    {
+      "epoch": 0.34016,
+      "grad_norm": 0.12289980053901672,
+      "learning_rate": 0.0001623917525773196,
+      "loss": 0.7579,
+      "step": 1063
+    },
+    {
+      "epoch": 0.34048,
+      "grad_norm": 0.1363401859998703,
+      "learning_rate": 0.00016235051546391752,
+      "loss": 0.7677,
+      "step": 1064
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.12801024317741394,
+      "learning_rate": 0.00016230927835051548,
+      "loss": 0.5706,
+      "step": 1065
+    },
+    {
+      "epoch": 0.34112,
+      "grad_norm": 0.1203409880399704,
+      "learning_rate": 0.00016226804123711341,
+      "loss": 0.5443,
+      "step": 1066
+    },
+    {
+      "epoch": 0.34144,
+      "grad_norm": 0.14376769959926605,
+      "learning_rate": 0.00016222680412371134,
+      "loss": 0.7202,
+      "step": 1067
+    },
+    {
+      "epoch": 0.34176,
+      "grad_norm": 0.11748041212558746,
+      "learning_rate": 0.0001621855670103093,
+      "loss": 0.8342,
+      "step": 1068
+    },
+    {
+      "epoch": 0.34208,
+      "grad_norm": 0.13433270156383514,
+      "learning_rate": 0.00016214432989690723,
+      "loss": 0.7167,
+      "step": 1069
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.12405699491500854,
+      "learning_rate": 0.00016210309278350517,
+      "loss": 0.6997,
+      "step": 1070
+    },
+    {
+      "epoch": 0.34272,
+      "grad_norm": 0.13520316779613495,
+      "learning_rate": 0.0001620618556701031,
+      "loss": 0.8303,
+      "step": 1071
+    },
+    {
+      "epoch": 0.34304,
+      "grad_norm": 0.12949883937835693,
+      "learning_rate": 0.00016202061855670103,
+      "loss": 0.5729,
+      "step": 1072
+    },
+    {
+      "epoch": 0.34336,
+      "grad_norm": 0.11795380711555481,
+      "learning_rate": 0.000161979381443299,
+      "loss": 0.7703,
+      "step": 1073
+    },
+    {
+      "epoch": 0.34368,
+      "grad_norm": 0.13385391235351562,
+      "learning_rate": 0.00016193814432989692,
+      "loss": 0.5315,
+      "step": 1074
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.11290988326072693,
+      "learning_rate": 0.00016189690721649488,
+      "loss": 0.6147,
+      "step": 1075
+    },
+    {
+      "epoch": 0.34432,
+      "grad_norm": 0.14130225777626038,
+      "learning_rate": 0.00016185567010309278,
+      "loss": 0.7681,
+      "step": 1076
+    },
+    {
+      "epoch": 0.34464,
+      "grad_norm": 0.15834380686283112,
+      "learning_rate": 0.0001618144329896907,
+      "loss": 0.7883,
+      "step": 1077
+    },
+    {
+      "epoch": 0.34496,
+      "grad_norm": 0.13480958342552185,
+      "learning_rate": 0.00016177319587628867,
+      "loss": 0.7518,
+      "step": 1078
+    },
+    {
+      "epoch": 0.34528,
+      "grad_norm": 0.12487983703613281,
+      "learning_rate": 0.0001617319587628866,
+      "loss": 0.7012,
+      "step": 1079
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.12848694622516632,
+      "learning_rate": 0.00016169072164948453,
+      "loss": 0.862,
+      "step": 1080
+    },
+    {
+      "epoch": 0.34592,
+      "grad_norm": 0.13103242218494415,
+      "learning_rate": 0.0001616494845360825,
+      "loss": 0.7836,
+      "step": 1081
+    },
+    {
+      "epoch": 0.34624,
+      "grad_norm": 0.12427806109189987,
+      "learning_rate": 0.00016160824742268042,
+      "loss": 0.6948,
+      "step": 1082
+    },
+    {
+      "epoch": 0.34656,
+      "grad_norm": 0.1163354143500328,
+      "learning_rate": 0.00016156701030927835,
+      "loss": 0.638,
+      "step": 1083
+    },
+    {
+      "epoch": 0.34688,
+      "grad_norm": 0.11864982545375824,
+      "learning_rate": 0.00016152577319587628,
+      "loss": 0.7002,
+      "step": 1084
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.12704510986804962,
+      "learning_rate": 0.00016148453608247422,
+      "loss": 0.6058,
+      "step": 1085
+    },
+    {
+      "epoch": 0.34752,
+      "grad_norm": 0.1170935109257698,
+      "learning_rate": 0.00016144329896907217,
+      "loss": 0.7592,
+      "step": 1086
+    },
+    {
+      "epoch": 0.34784,
+      "grad_norm": 0.1298816204071045,
+      "learning_rate": 0.0001614020618556701,
+      "loss": 0.7716,
+      "step": 1087
+    },
+    {
+      "epoch": 0.34816,
+      "grad_norm": 0.12749038636684418,
+      "learning_rate": 0.00016136082474226806,
+      "loss": 0.6785,
+      "step": 1088
+    },
+    {
+      "epoch": 0.34848,
+      "grad_norm": 0.13424167037010193,
+      "learning_rate": 0.000161319587628866,
+      "loss": 0.6733,
+      "step": 1089
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.14593367278575897,
+      "learning_rate": 0.00016127835051546393,
+      "loss": 0.8175,
+      "step": 1090
+    },
+    {
+      "epoch": 0.34912,
+      "grad_norm": 0.13538609445095062,
+      "learning_rate": 0.00016123711340206186,
+      "loss": 0.8064,
+      "step": 1091
+    },
+    {
+      "epoch": 0.34944,
+      "grad_norm": 0.1283688247203827,
+      "learning_rate": 0.0001611958762886598,
+      "loss": 0.7415,
+      "step": 1092
+    },
+    {
+      "epoch": 0.34976,
+      "grad_norm": 0.11879759281873703,
+      "learning_rate": 0.00016115463917525775,
+      "loss": 0.6461,
+      "step": 1093
+    },
+    {
+      "epoch": 0.35008,
+      "grad_norm": 0.15270709991455078,
+      "learning_rate": 0.00016111340206185568,
+      "loss": 0.6833,
+      "step": 1094
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.14082631468772888,
+      "learning_rate": 0.0001610721649484536,
+      "loss": 0.8527,
+      "step": 1095
+    },
+    {
+      "epoch": 0.35072,
+      "grad_norm": 0.13917644321918488,
+      "learning_rate": 0.00016103092783505157,
+      "loss": 0.6855,
+      "step": 1096
+    },
+    {
+      "epoch": 0.35104,
+      "grad_norm": 0.12534797191619873,
+      "learning_rate": 0.0001609896907216495,
+      "loss": 0.7396,
+      "step": 1097
+    },
+    {
+      "epoch": 0.35136,
+      "grad_norm": 0.1246245950460434,
+      "learning_rate": 0.00016094845360824743,
+      "loss": 0.7373,
+      "step": 1098
+    },
+    {
+      "epoch": 0.35168,
+      "grad_norm": 0.1447897106409073,
+      "learning_rate": 0.00016090721649484536,
+      "loss": 0.7252,
+      "step": 1099
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.1323324739933014,
+      "learning_rate": 0.0001608659793814433,
+      "loss": 0.6699,
+      "step": 1100
+    },
+    {
+      "epoch": 0.35232,
+      "grad_norm": 0.12323680520057678,
+      "learning_rate": 0.00016082474226804125,
+      "loss": 0.7758,
+      "step": 1101
+    },
+    {
+      "epoch": 0.35264,
+      "grad_norm": 0.1507241576910019,
+      "learning_rate": 0.00016078350515463918,
+      "loss": 0.7029,
+      "step": 1102
+    },
+    {
+      "epoch": 0.35296,
+      "grad_norm": 0.1255369633436203,
+      "learning_rate": 0.00016074226804123714,
+      "loss": 0.7446,
+      "step": 1103
+    },
+    {
+      "epoch": 0.35328,
+      "grad_norm": 0.1623598039150238,
+      "learning_rate": 0.00016070103092783507,
+      "loss": 0.8059,
+      "step": 1104
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.14316579699516296,
+      "learning_rate": 0.00016065979381443298,
+      "loss": 0.7224,
+      "step": 1105
+    },
+    {
+      "epoch": 0.35392,
+      "grad_norm": 0.12849082052707672,
+      "learning_rate": 0.00016061855670103094,
+      "loss": 0.5966,
+      "step": 1106
+    },
+    {
+      "epoch": 0.35424,
+      "grad_norm": 0.11844538897275925,
+      "learning_rate": 0.00016057731958762887,
+      "loss": 0.6075,
+      "step": 1107
+    },
+    {
+      "epoch": 0.35456,
+      "grad_norm": 0.14459797739982605,
+      "learning_rate": 0.0001605360824742268,
+      "loss": 0.641,
+      "step": 1108
+    },
+    {
+      "epoch": 0.35488,
+      "grad_norm": 0.12330617755651474,
+      "learning_rate": 0.00016049484536082476,
+      "loss": 0.6642,
+      "step": 1109
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.12356940656900406,
+      "learning_rate": 0.0001604536082474227,
+      "loss": 0.6615,
+      "step": 1110
+    },
+    {
+      "epoch": 0.35552,
+      "grad_norm": 0.1263560801744461,
+      "learning_rate": 0.00016041237113402065,
+      "loss": 0.832,
+      "step": 1111
+    },
+    {
+      "epoch": 0.35584,
+      "grad_norm": 0.13208667933940887,
+      "learning_rate": 0.00016037113402061855,
+      "loss": 0.8078,
+      "step": 1112
+    },
+    {
+      "epoch": 0.35616,
+      "grad_norm": 0.11938857287168503,
+      "learning_rate": 0.00016032989690721648,
+      "loss": 0.6706,
+      "step": 1113
+    },
+    {
+      "epoch": 0.35648,
+      "grad_norm": 0.15087862312793732,
+      "learning_rate": 0.00016028865979381444,
+      "loss": 0.6342,
+      "step": 1114
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.13852839171886444,
+      "learning_rate": 0.00016024742268041237,
+      "loss": 0.7312,
+      "step": 1115
+    },
+    {
+      "epoch": 0.35712,
+      "grad_norm": 0.123874731361866,
+      "learning_rate": 0.00016020618556701033,
+      "loss": 0.5679,
+      "step": 1116
+    },
+    {
+      "epoch": 0.35744,
+      "grad_norm": 0.1398034542798996,
+      "learning_rate": 0.00016016494845360826,
+      "loss": 0.7158,
+      "step": 1117
+    },
+    {
+      "epoch": 0.35776,
+      "grad_norm": 0.15118446946144104,
+      "learning_rate": 0.0001601237113402062,
+      "loss": 0.9009,
+      "step": 1118
+    },
+    {
+      "epoch": 0.35808,
+      "grad_norm": 0.13998092710971832,
+      "learning_rate": 0.00016008247422680412,
+      "loss": 0.7229,
+      "step": 1119
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.13464230298995972,
+      "learning_rate": 0.00016004123711340205,
+      "loss": 0.6686,
+      "step": 1120
+    },
+    {
+      "epoch": 0.35872,
+      "grad_norm": 0.1247800886631012,
+      "learning_rate": 0.00016,
+      "loss": 0.6007,
+      "step": 1121
+    },
+    {
+      "epoch": 0.35904,
+      "grad_norm": 0.15494264662265778,
+      "learning_rate": 0.00015995876288659794,
+      "loss": 0.7049,
+      "step": 1122
+    },
+    {
+      "epoch": 0.35936,
+      "grad_norm": 0.13575033843517303,
+      "learning_rate": 0.00015991752577319588,
+      "loss": 0.6467,
+      "step": 1123
+    },
+    {
+      "epoch": 0.35968,
+      "grad_norm": 0.13395622372627258,
+      "learning_rate": 0.00015987628865979383,
+      "loss": 0.8035,
+      "step": 1124
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.12352868914604187,
+      "learning_rate": 0.00015983505154639177,
+      "loss": 0.5654,
+      "step": 1125
+    },
+    {
+      "epoch": 0.36032,
+      "grad_norm": 0.1200244277715683,
+      "learning_rate": 0.0001597938144329897,
+      "loss": 0.6702,
+      "step": 1126
+    },
+    {
+      "epoch": 0.36064,
+      "grad_norm": 0.1397937685251236,
+      "learning_rate": 0.00015975257731958763,
+      "loss": 0.6206,
+      "step": 1127
+    },
+    {
+      "epoch": 0.36096,
+      "grad_norm": 0.14540423452854156,
+      "learning_rate": 0.00015971134020618556,
+      "loss": 0.7432,
+      "step": 1128
+    },
+    {
+      "epoch": 0.36128,
+      "grad_norm": 0.136407732963562,
+      "learning_rate": 0.00015967010309278352,
+      "loss": 0.7838,
+      "step": 1129
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.12719576060771942,
+      "learning_rate": 0.00015962886597938145,
+      "loss": 0.7853,
+      "step": 1130
+    },
+    {
+      "epoch": 0.36192,
+      "grad_norm": 0.12858954071998596,
+      "learning_rate": 0.0001595876288659794,
+      "loss": 0.6824,
+      "step": 1131
+    },
+    {
+      "epoch": 0.36224,
+      "grad_norm": 0.1230321004986763,
+      "learning_rate": 0.00015954639175257734,
+      "loss": 0.7019,
+      "step": 1132
+    },
+    {
+      "epoch": 0.36256,
+      "grad_norm": 0.15037384629249573,
+      "learning_rate": 0.00015950515463917527,
+      "loss": 0.8284,
+      "step": 1133
+    },
+    {
+      "epoch": 0.36288,
+      "grad_norm": 0.12637773156166077,
+      "learning_rate": 0.0001594639175257732,
+      "loss": 0.5629,
+      "step": 1134
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.1314142346382141,
+      "learning_rate": 0.00015942268041237113,
+      "loss": 0.8029,
+      "step": 1135
+    },
+    {
+      "epoch": 0.36352,
+      "grad_norm": 0.1253247708082199,
+      "learning_rate": 0.0001593814432989691,
+      "loss": 0.8017,
+      "step": 1136
+    },
+    {
+      "epoch": 0.36384,
+      "grad_norm": 0.14196865260601044,
+      "learning_rate": 0.00015934020618556702,
+      "loss": 0.7405,
+      "step": 1137
+    },
+    {
+      "epoch": 0.36416,
+      "grad_norm": 0.13544997572898865,
+      "learning_rate": 0.00015929896907216495,
+      "loss": 0.8474,
+      "step": 1138
+    },
+    {
+      "epoch": 0.36448,
+      "grad_norm": 0.14127661287784576,
+      "learning_rate": 0.0001592577319587629,
+      "loss": 0.7043,
+      "step": 1139
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.12157486379146576,
+      "learning_rate": 0.00015921649484536084,
+      "loss": 0.822,
+      "step": 1140
+    },
+    {
+      "epoch": 0.36512,
+      "grad_norm": 0.1410183608531952,
+      "learning_rate": 0.00015917525773195875,
+      "loss": 0.5824,
+      "step": 1141
+    },
+    {
+      "epoch": 0.36544,
+      "grad_norm": 0.12342025339603424,
+      "learning_rate": 0.0001591340206185567,
+      "loss": 0.577,
+      "step": 1142
+    },
+    {
+      "epoch": 0.36576,
+      "grad_norm": 0.15177765488624573,
+      "learning_rate": 0.00015909278350515464,
+      "loss": 0.5881,
+      "step": 1143
+    },
+    {
+      "epoch": 0.36608,
+      "grad_norm": 0.140254408121109,
+      "learning_rate": 0.0001590515463917526,
+      "loss": 0.6406,
+      "step": 1144
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.11794767528772354,
+      "learning_rate": 0.00015901030927835053,
+      "loss": 0.7347,
+      "step": 1145
+    },
+    {
+      "epoch": 0.36672,
+      "grad_norm": 0.13113269209861755,
+      "learning_rate": 0.00015896907216494846,
+      "loss": 0.8625,
+      "step": 1146
+    },
+    {
+      "epoch": 0.36704,
+      "grad_norm": 0.13670428097248077,
+      "learning_rate": 0.00015892783505154642,
+      "loss": 0.6342,
+      "step": 1147
+    },
+    {
+      "epoch": 0.36736,
+      "grad_norm": 0.12990567088127136,
+      "learning_rate": 0.00015888659793814432,
+      "loss": 0.7242,
+      "step": 1148
+    },
+    {
+      "epoch": 0.36768,
+      "grad_norm": 0.14626851677894592,
+      "learning_rate": 0.00015884536082474228,
+      "loss": 0.7929,
+      "step": 1149
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.1317480057477951,
+      "learning_rate": 0.0001588041237113402,
+      "loss": 0.7436,
+      "step": 1150
+    },
+    {
+      "epoch": 0.36832,
+      "grad_norm": 0.1666252166032791,
+      "learning_rate": 0.00015876288659793814,
+      "loss": 0.6675,
+      "step": 1151
+    },
+    {
+      "epoch": 0.36864,
+      "grad_norm": 0.12659478187561035,
+      "learning_rate": 0.0001587216494845361,
+      "loss": 0.7032,
+      "step": 1152
+    },
+    {
+      "epoch": 0.36896,
+      "grad_norm": 0.12916727364063263,
+      "learning_rate": 0.00015868041237113403,
+      "loss": 0.7782,
+      "step": 1153
+    },
+    {
+      "epoch": 0.36928,
+      "grad_norm": 0.11833591759204865,
+      "learning_rate": 0.000158639175257732,
+      "loss": 0.5818,
+      "step": 1154
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.1341896802186966,
+      "learning_rate": 0.0001585979381443299,
+      "loss": 0.5695,
+      "step": 1155
+    },
+    {
+      "epoch": 0.36992,
+      "grad_norm": 0.13139894604682922,
+      "learning_rate": 0.00015855670103092782,
+      "loss": 0.7195,
+      "step": 1156
+    },
+    {
+      "epoch": 0.37024,
+      "grad_norm": 0.12866564095020294,
+      "learning_rate": 0.00015851546391752578,
+      "loss": 0.5747,
+      "step": 1157
+    },
+    {
+      "epoch": 0.37056,
+      "grad_norm": 0.13645455241203308,
+      "learning_rate": 0.00015847422680412371,
+      "loss": 0.632,
+      "step": 1158
+    },
+    {
+      "epoch": 0.37088,
+      "grad_norm": 0.12615318596363068,
+      "learning_rate": 0.00015843298969072167,
+      "loss": 0.6667,
+      "step": 1159
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.13311846554279327,
+      "learning_rate": 0.0001583917525773196,
+      "loss": 0.5789,
+      "step": 1160
+    },
+    {
+      "epoch": 0.37152,
+      "grad_norm": 0.14214034378528595,
+      "learning_rate": 0.00015835051546391754,
+      "loss": 0.7526,
+      "step": 1161
+    },
+    {
+      "epoch": 0.37184,
+      "grad_norm": 0.14578349888324738,
+      "learning_rate": 0.00015830927835051547,
+      "loss": 0.8063,
+      "step": 1162
+    },
+    {
+      "epoch": 0.37216,
+      "grad_norm": 0.14859075844287872,
+      "learning_rate": 0.0001582680412371134,
+      "loss": 0.6875,
+      "step": 1163
+    },
+    {
+      "epoch": 0.37248,
+      "grad_norm": 0.1303577870130539,
+      "learning_rate": 0.00015822680412371136,
+      "loss": 0.6363,
+      "step": 1164
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.13182777166366577,
+      "learning_rate": 0.0001581855670103093,
+      "loss": 0.7822,
+      "step": 1165
+    },
+    {
+      "epoch": 0.37312,
+      "grad_norm": 0.11544372886419296,
+      "learning_rate": 0.00015814432989690722,
+      "loss": 0.7469,
+      "step": 1166
+    },
+    {
+      "epoch": 0.37344,
+      "grad_norm": 0.12924250960350037,
+      "learning_rate": 0.00015810309278350518,
+      "loss": 0.7625,
+      "step": 1167
+    },
+    {
+      "epoch": 0.37376,
+      "grad_norm": 0.126427561044693,
+      "learning_rate": 0.0001580618556701031,
+      "loss": 0.651,
+      "step": 1168
+    },
+    {
+      "epoch": 0.37408,
+      "grad_norm": 0.13961675763130188,
+      "learning_rate": 0.00015802061855670104,
+      "loss": 0.7959,
+      "step": 1169
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.14169563353061676,
+      "learning_rate": 0.00015797938144329897,
+      "loss": 0.8699,
+      "step": 1170
+    },
+    {
+      "epoch": 0.37472,
+      "grad_norm": 0.1441062092781067,
+      "learning_rate": 0.0001579381443298969,
+      "loss": 0.7856,
+      "step": 1171
+    },
+    {
+      "epoch": 0.37504,
+      "grad_norm": 0.15582816302776337,
+      "learning_rate": 0.00015789690721649486,
+      "loss": 0.7262,
+      "step": 1172
+    },
+    {
+      "epoch": 0.37536,
+      "grad_norm": 0.12826552987098694,
+      "learning_rate": 0.0001578556701030928,
+      "loss": 0.7309,
+      "step": 1173
+    },
+    {
+      "epoch": 0.37568,
+      "grad_norm": 0.15205571055412292,
+      "learning_rate": 0.00015781443298969072,
+      "loss": 0.6164,
+      "step": 1174
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.14039137959480286,
+      "learning_rate": 0.00015777319587628868,
+      "loss": 0.6441,
+      "step": 1175
+    },
+    {
+      "epoch": 0.37632,
+      "grad_norm": 0.1307058483362198,
+      "learning_rate": 0.0001577319587628866,
+      "loss": 0.7143,
+      "step": 1176
+    },
+    {
+      "epoch": 0.37664,
+      "grad_norm": 0.14123345911502838,
+      "learning_rate": 0.00015769072164948454,
+      "loss": 0.7048,
+      "step": 1177
+    },
+    {
+      "epoch": 0.37696,
+      "grad_norm": 0.13117215037345886,
+      "learning_rate": 0.00015764948453608248,
+      "loss": 0.6066,
+      "step": 1178
+    },
+    {
+      "epoch": 0.37728,
+      "grad_norm": 0.119295634329319,
+      "learning_rate": 0.0001576082474226804,
+      "loss": 0.8624,
+      "step": 1179
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.15351873636245728,
+      "learning_rate": 0.00015756701030927836,
+      "loss": 0.6634,
+      "step": 1180
+    },
+    {
+      "epoch": 0.37792,
+      "grad_norm": 0.12913480401039124,
+      "learning_rate": 0.0001575257731958763,
+      "loss": 0.7033,
+      "step": 1181
+    },
+    {
+      "epoch": 0.37824,
+      "grad_norm": 0.11295458674430847,
+      "learning_rate": 0.00015748453608247425,
+      "loss": 0.5714,
+      "step": 1182
+    },
+    {
+      "epoch": 0.37856,
+      "grad_norm": 0.16187618672847748,
+      "learning_rate": 0.00015744329896907219,
+      "loss": 0.7198,
+      "step": 1183
+    },
+    {
+      "epoch": 0.37888,
+      "grad_norm": 0.1341710090637207,
+      "learning_rate": 0.0001574020618556701,
+      "loss": 0.7831,
+      "step": 1184
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.14047707617282867,
+      "learning_rate": 0.00015736082474226805,
+      "loss": 0.8205,
+      "step": 1185
+    },
+    {
+      "epoch": 0.37952,
+      "grad_norm": 0.12923875451087952,
+      "learning_rate": 0.00015731958762886598,
+      "loss": 0.6956,
+      "step": 1186
+    },
+    {
+      "epoch": 0.37984,
+      "grad_norm": 0.13387133181095123,
+      "learning_rate": 0.00015727835051546394,
+      "loss": 0.659,
+      "step": 1187
+    },
+    {
+      "epoch": 0.38016,
+      "grad_norm": 0.11878004670143127,
+      "learning_rate": 0.00015723711340206187,
+      "loss": 0.7992,
+      "step": 1188
+    },
+    {
+      "epoch": 0.38048,
+      "grad_norm": 0.13417847454547882,
+      "learning_rate": 0.0001571958762886598,
+      "loss": 0.5762,
+      "step": 1189
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.15006715059280396,
+      "learning_rate": 0.00015715463917525773,
+      "loss": 0.6173,
+      "step": 1190
+    },
+    {
+      "epoch": 0.38112,
+      "grad_norm": 0.14036807417869568,
+      "learning_rate": 0.00015711340206185566,
+      "loss": 0.7551,
+      "step": 1191
+    },
+    {
+      "epoch": 0.38144,
+      "grad_norm": 0.11697541177272797,
+      "learning_rate": 0.00015707216494845362,
+      "loss": 0.5426,
+      "step": 1192
+    },
+    {
+      "epoch": 0.38176,
+      "grad_norm": 0.11747653037309647,
+      "learning_rate": 0.00015703092783505155,
+      "loss": 0.7528,
+      "step": 1193
+    },
+    {
+      "epoch": 0.38208,
+      "grad_norm": 0.13758531212806702,
+      "learning_rate": 0.00015698969072164948,
+      "loss": 0.6126,
+      "step": 1194
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.13478727638721466,
+      "learning_rate": 0.00015694845360824744,
+      "loss": 0.6984,
+      "step": 1195
+    },
+    {
+      "epoch": 0.38272,
+      "grad_norm": 0.12369264662265778,
+      "learning_rate": 0.00015690721649484537,
+      "loss": 0.7509,
+      "step": 1196
+    },
+    {
+      "epoch": 0.38304,
+      "grad_norm": 0.13458921015262604,
+      "learning_rate": 0.0001568659793814433,
+      "loss": 0.7056,
+      "step": 1197
+    },
+    {
+      "epoch": 0.38336,
+      "grad_norm": 0.1378922164440155,
+      "learning_rate": 0.00015682474226804124,
+      "loss": 0.7409,
+      "step": 1198
+    },
+    {
+      "epoch": 0.38368,
+      "grad_norm": 0.1378144472837448,
+      "learning_rate": 0.00015678350515463917,
+      "loss": 0.5798,
+      "step": 1199
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.12796853482723236,
+      "learning_rate": 0.00015674226804123713,
+      "loss": 0.6913,
+      "step": 1200
+    },
+    {
+      "epoch": 0.38432,
+      "grad_norm": 0.14590856432914734,
+      "learning_rate": 0.00015670103092783506,
+      "loss": 0.7123,
+      "step": 1201
+    },
+    {
+      "epoch": 0.38464,
+      "grad_norm": 0.1079840138554573,
+      "learning_rate": 0.000156659793814433,
+      "loss": 0.5882,
+      "step": 1202
+    },
+    {
+      "epoch": 0.38496,
+      "grad_norm": 0.13286516070365906,
+      "learning_rate": 0.00015661855670103095,
+      "loss": 0.7692,
+      "step": 1203
+    },
+    {
+      "epoch": 0.38528,
+      "grad_norm": 0.1428433209657669,
+      "learning_rate": 0.00015657731958762888,
+      "loss": 0.838,
+      "step": 1204
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.12319666147232056,
+      "learning_rate": 0.0001565360824742268,
+      "loss": 0.846,
+      "step": 1205
+    },
+    {
+      "epoch": 0.38592,
+      "grad_norm": 0.14117276668548584,
+      "learning_rate": 0.00015649484536082474,
+      "loss": 0.7958,
+      "step": 1206
+    },
+    {
+      "epoch": 0.38624,
+      "grad_norm": 0.11911432445049286,
+      "learning_rate": 0.00015645360824742267,
+      "loss": 0.6158,
+      "step": 1207
+    },
+    {
+      "epoch": 0.38656,
+      "grad_norm": 0.13786646723747253,
+      "learning_rate": 0.00015641237113402063,
+      "loss": 0.7601,
+      "step": 1208
+    },
+    {
+      "epoch": 0.38688,
+      "grad_norm": 0.11726567149162292,
+      "learning_rate": 0.00015637113402061856,
+      "loss": 0.6906,
+      "step": 1209
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.12047921866178513,
+      "learning_rate": 0.00015632989690721652,
+      "loss": 0.638,
+      "step": 1210
+    },
+    {
+      "epoch": 0.38752,
+      "grad_norm": 0.1304006725549698,
+      "learning_rate": 0.00015628865979381445,
+      "loss": 0.7369,
+      "step": 1211
+    },
+    {
+      "epoch": 0.38784,
+      "grad_norm": 0.12691730260849,
+      "learning_rate": 0.00015624742268041238,
+      "loss": 0.747,
+      "step": 1212
+    },
+    {
+      "epoch": 0.38816,
+      "grad_norm": 0.13422106206417084,
+      "learning_rate": 0.00015620618556701031,
+      "loss": 0.7903,
+      "step": 1213
+    },
+    {
+      "epoch": 0.38848,
+      "grad_norm": 0.11382526159286499,
+      "learning_rate": 0.00015616494845360824,
+      "loss": 0.5815,
+      "step": 1214
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.14089763164520264,
+      "learning_rate": 0.0001561237113402062,
+      "loss": 0.7882,
+      "step": 1215
+    },
+    {
+      "epoch": 0.38912,
+      "grad_norm": 0.12979736924171448,
+      "learning_rate": 0.00015608247422680413,
+      "loss": 0.7013,
+      "step": 1216
+    },
+    {
+      "epoch": 0.38944,
+      "grad_norm": 0.1168995201587677,
+      "learning_rate": 0.00015604123711340207,
+      "loss": 0.6702,
+      "step": 1217
+    },
+    {
+      "epoch": 0.38976,
+      "grad_norm": 0.1315268576145172,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 0.6036,
+      "step": 1218
+    },
+    {
+      "epoch": 0.39008,
+      "grad_norm": 0.130551278591156,
+      "learning_rate": 0.00015595876288659796,
+      "loss": 0.7314,
+      "step": 1219
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.11757661402225494,
+      "learning_rate": 0.0001559175257731959,
+      "loss": 0.6066,
+      "step": 1220
+    },
+    {
+      "epoch": 0.39072,
+      "grad_norm": 0.12399845570325851,
+      "learning_rate": 0.00015587628865979382,
+      "loss": 0.695,
+      "step": 1221
+    },
+    {
+      "epoch": 0.39104,
+      "grad_norm": 0.1578100472688675,
+      "learning_rate": 0.00015583505154639175,
+      "loss": 0.6838,
+      "step": 1222
+    },
+    {
+      "epoch": 0.39136,
+      "grad_norm": 0.13307176530361176,
+      "learning_rate": 0.0001557938144329897,
+      "loss": 0.7501,
+      "step": 1223
+    },
+    {
+      "epoch": 0.39168,
+      "grad_norm": 0.13976946473121643,
+      "learning_rate": 0.00015575257731958764,
+      "loss": 0.7296,
+      "step": 1224
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.1478758454322815,
+      "learning_rate": 0.0001557113402061856,
+      "loss": 0.6399,
+      "step": 1225
+    },
+    {
+      "epoch": 0.39232,
+      "grad_norm": 0.1445852369070053,
+      "learning_rate": 0.0001556701030927835,
+      "loss": 0.691,
+      "step": 1226
+    },
+    {
+      "epoch": 0.39264,
+      "grad_norm": 0.1240420863032341,
+      "learning_rate": 0.00015562886597938143,
+      "loss": 0.8225,
+      "step": 1227
+    },
+    {
+      "epoch": 0.39296,
+      "grad_norm": 0.1172756627202034,
+      "learning_rate": 0.0001555876288659794,
+      "loss": 0.7198,
+      "step": 1228
+    },
+    {
+      "epoch": 0.39328,
+      "grad_norm": 0.11416526883840561,
+      "learning_rate": 0.00015554639175257732,
+      "loss": 0.6399,
+      "step": 1229
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.1195296198129654,
+      "learning_rate": 0.00015550515463917525,
+      "loss": 0.7431,
+      "step": 1230
+    },
+    {
+      "epoch": 0.39392,
+      "grad_norm": 0.13839997351169586,
+      "learning_rate": 0.0001554639175257732,
+      "loss": 0.7989,
+      "step": 1231
+    },
+    {
+      "epoch": 0.39424,
+      "grad_norm": 0.15744848549365997,
+      "learning_rate": 0.00015542268041237114,
+      "loss": 0.7423,
+      "step": 1232
+    },
+    {
+      "epoch": 0.39456,
+      "grad_norm": 0.12856073677539825,
+      "learning_rate": 0.00015538144329896907,
+      "loss": 0.5825,
+      "step": 1233
+    },
+    {
+      "epoch": 0.39488,
+      "grad_norm": 0.1339053362607956,
+      "learning_rate": 0.000155340206185567,
+      "loss": 0.7938,
+      "step": 1234
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.13555975258350372,
+      "learning_rate": 0.00015529896907216494,
+      "loss": 0.6591,
+      "step": 1235
+    },
+    {
+      "epoch": 0.39552,
+      "grad_norm": 0.14164508879184723,
+      "learning_rate": 0.0001552577319587629,
+      "loss": 0.6992,
+      "step": 1236
+    },
+    {
+      "epoch": 0.39584,
+      "grad_norm": 0.11711747199296951,
+      "learning_rate": 0.00015521649484536083,
+      "loss": 0.6513,
+      "step": 1237
+    },
+    {
+      "epoch": 0.39616,
+      "grad_norm": 0.1403762400150299,
+      "learning_rate": 0.00015517525773195879,
+      "loss": 0.7089,
+      "step": 1238
+    },
+    {
+      "epoch": 0.39648,
+      "grad_norm": 0.125601127743721,
+      "learning_rate": 0.00015513402061855672,
+      "loss": 0.5521,
+      "step": 1239
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.12626031041145325,
+      "learning_rate": 0.00015509278350515465,
+      "loss": 0.6345,
+      "step": 1240
+    },
+    {
+      "epoch": 0.39712,
+      "grad_norm": 0.1359778344631195,
+      "learning_rate": 0.00015505154639175258,
+      "loss": 0.7158,
+      "step": 1241
+    },
+    {
+      "epoch": 0.39744,
+      "grad_norm": 0.15178370475769043,
+      "learning_rate": 0.0001550103092783505,
+      "loss": 0.7438,
+      "step": 1242
+    },
+    {
+      "epoch": 0.39776,
+      "grad_norm": 0.12930850684642792,
+      "learning_rate": 0.00015496907216494847,
+      "loss": 0.8341,
+      "step": 1243
+    },
+    {
+      "epoch": 0.39808,
+      "grad_norm": 0.12345188111066818,
+      "learning_rate": 0.0001549278350515464,
+      "loss": 0.709,
+      "step": 1244
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.14064045250415802,
+      "learning_rate": 0.00015488659793814433,
+      "loss": 0.7319,
+      "step": 1245
+    },
+    {
+      "epoch": 0.39872,
+      "grad_norm": 0.1376401036977768,
+      "learning_rate": 0.0001548453608247423,
+      "loss": 0.6587,
+      "step": 1246
+    },
+    {
+      "epoch": 0.39904,
+      "grad_norm": 0.13576684892177582,
+      "learning_rate": 0.00015480412371134022,
+      "loss": 0.8445,
+      "step": 1247
+    },
+    {
+      "epoch": 0.39936,
+      "grad_norm": 0.1358659416437149,
+      "learning_rate": 0.00015476288659793815,
+      "loss": 0.8681,
+      "step": 1248
+    },
+    {
+      "epoch": 0.39968,
+      "grad_norm": 0.14173083007335663,
+      "learning_rate": 0.00015472164948453608,
+      "loss": 0.6538,
+      "step": 1249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.12762658298015594,
+      "learning_rate": 0.00015468041237113401,
+      "loss": 0.7012,
+      "step": 1250
+    },
+    {
+      "epoch": 0.40032,
+      "grad_norm": 0.14135272800922394,
+      "learning_rate": 0.00015463917525773197,
+      "loss": 0.6523,
+      "step": 1251
+    },
+    {
+      "epoch": 0.40064,
+      "grad_norm": 0.12327791005373001,
+      "learning_rate": 0.0001545979381443299,
+      "loss": 0.7231,
+      "step": 1252
+    },
+    {
+      "epoch": 0.40096,
+      "grad_norm": 0.13688601553440094,
+      "learning_rate": 0.00015455670103092786,
+      "loss": 0.6666,
+      "step": 1253
+    },
+    {
+      "epoch": 0.40128,
+      "grad_norm": 0.13409943878650665,
+      "learning_rate": 0.0001545154639175258,
+      "loss": 0.7849,
+      "step": 1254
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.14107739925384521,
+      "learning_rate": 0.0001544742268041237,
+      "loss": 0.7455,
+      "step": 1255
+    },
+    {
+      "epoch": 0.40192,
+      "grad_norm": 0.12240707129240036,
+      "learning_rate": 0.00015443298969072166,
+      "loss": 0.7664,
+      "step": 1256
+    },
+    {
+      "epoch": 0.40224,
+      "grad_norm": 0.1431652456521988,
+      "learning_rate": 0.0001543917525773196,
+      "loss": 0.7215,
+      "step": 1257
+    },
+    {
+      "epoch": 0.40256,
+      "grad_norm": 0.14196695387363434,
+      "learning_rate": 0.00015435051546391752,
+      "loss": 0.7169,
+      "step": 1258
+    },
+    {
+      "epoch": 0.40288,
+      "grad_norm": 0.15556858479976654,
+      "learning_rate": 0.00015430927835051548,
+      "loss": 0.7989,
+      "step": 1259
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.1357007473707199,
+      "learning_rate": 0.0001542680412371134,
+      "loss": 0.6919,
+      "step": 1260
+    },
+    {
+      "epoch": 0.40352,
+      "grad_norm": 0.13433754444122314,
+      "learning_rate": 0.00015422680412371137,
+      "loss": 0.7744,
+      "step": 1261
+    },
+    {
+      "epoch": 0.40384,
+      "grad_norm": 0.1415691375732422,
+      "learning_rate": 0.00015418556701030927,
+      "loss": 0.7641,
+      "step": 1262
+    },
+    {
+      "epoch": 0.40416,
+      "grad_norm": 0.15165191888809204,
+      "learning_rate": 0.0001541443298969072,
+      "loss": 0.7048,
+      "step": 1263
+    },
+    {
+      "epoch": 0.40448,
+      "grad_norm": 0.15109646320343018,
+      "learning_rate": 0.00015410309278350516,
+      "loss": 0.7522,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.1453443020582199,
+      "learning_rate": 0.0001540618556701031,
+      "loss": 0.7627,
+      "step": 1265
+    },
+    {
+      "epoch": 0.40512,
+      "grad_norm": 0.13811153173446655,
+      "learning_rate": 0.00015402061855670105,
+      "loss": 0.8577,
+      "step": 1266
+    },
+    {
+      "epoch": 0.40544,
+      "grad_norm": 0.11404547840356827,
+      "learning_rate": 0.00015397938144329898,
+      "loss": 0.6577,
+      "step": 1267
+    },
+    {
+      "epoch": 0.40576,
+      "grad_norm": 0.11984894424676895,
+      "learning_rate": 0.0001539381443298969,
+      "loss": 0.735,
+      "step": 1268
+    },
+    {
+      "epoch": 0.40608,
+      "grad_norm": 0.12414862215518951,
+      "learning_rate": 0.00015389690721649484,
+      "loss": 0.5764,
+      "step": 1269
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.14393122494220734,
+      "learning_rate": 0.00015385567010309278,
+      "loss": 0.7502,
+      "step": 1270
+    },
+    {
+      "epoch": 0.40672,
+      "grad_norm": 0.12545761466026306,
+      "learning_rate": 0.00015381443298969073,
+      "loss": 0.5799,
+      "step": 1271
+    },
+    {
+      "epoch": 0.40704,
+      "grad_norm": 0.1390172392129898,
+      "learning_rate": 0.00015377319587628867,
+      "loss": 0.6935,
+      "step": 1272
+    },
+    {
+      "epoch": 0.40736,
+      "grad_norm": 0.13300630450248718,
+      "learning_rate": 0.0001537319587628866,
+      "loss": 0.6794,
+      "step": 1273
+    },
+    {
+      "epoch": 0.40768,
+      "grad_norm": 0.14342691004276276,
+      "learning_rate": 0.00015369072164948456,
+      "loss": 0.7296,
+      "step": 1274
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.12729567289352417,
+      "learning_rate": 0.00015364948453608249,
+      "loss": 0.6043,
+      "step": 1275
+    },
+    {
+      "epoch": 0.40832,
+      "grad_norm": 0.12945295870304108,
+      "learning_rate": 0.00015360824742268042,
+      "loss": 0.6406,
+      "step": 1276
+    },
+    {
+      "epoch": 0.40864,
+      "grad_norm": 0.13834479451179504,
+      "learning_rate": 0.00015356701030927835,
+      "loss": 0.6925,
+      "step": 1277
+    },
+    {
+      "epoch": 0.40896,
+      "grad_norm": 0.13942793011665344,
+      "learning_rate": 0.00015352577319587628,
+      "loss": 0.5954,
+      "step": 1278
+    },
+    {
+      "epoch": 0.40928,
+      "grad_norm": 0.12837499380111694,
+      "learning_rate": 0.00015348453608247424,
+      "loss": 0.7063,
+      "step": 1279
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.13395194709300995,
+      "learning_rate": 0.00015344329896907217,
+      "loss": 0.88,
+      "step": 1280
+    },
+    {
+      "epoch": 0.40992,
+      "grad_norm": 0.12082265317440033,
+      "learning_rate": 0.00015340206185567013,
+      "loss": 0.7147,
+      "step": 1281
+    },
+    {
+      "epoch": 0.41024,
+      "grad_norm": 0.12534114718437195,
+      "learning_rate": 0.00015336082474226806,
+      "loss": 0.736,
+      "step": 1282
+    },
+    {
+      "epoch": 0.41056,
+      "grad_norm": 0.12417594343423843,
+      "learning_rate": 0.000153319587628866,
+      "loss": 0.7383,
+      "step": 1283
+    },
+    {
+      "epoch": 0.41088,
+      "grad_norm": 0.1353493630886078,
+      "learning_rate": 0.00015327835051546392,
+      "loss": 0.8153,
+      "step": 1284
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.12171890586614609,
+      "learning_rate": 0.00015323711340206185,
+      "loss": 0.6863,
+      "step": 1285
+    },
+    {
+      "epoch": 0.41152,
+      "grad_norm": 0.12119521200656891,
+      "learning_rate": 0.0001531958762886598,
+      "loss": 0.7354,
+      "step": 1286
+    },
+    {
+      "epoch": 0.41184,
+      "grad_norm": 0.14262773096561432,
+      "learning_rate": 0.00015315463917525774,
+      "loss": 0.7325,
+      "step": 1287
+    },
+    {
+      "epoch": 0.41216,
+      "grad_norm": 0.14777816832065582,
+      "learning_rate": 0.00015311340206185567,
+      "loss": 0.805,
+      "step": 1288
+    },
+    {
+      "epoch": 0.41248,
+      "grad_norm": 0.15291215479373932,
+      "learning_rate": 0.00015307216494845363,
+      "loss": 0.737,
+      "step": 1289
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.13089078664779663,
+      "learning_rate": 0.00015303092783505156,
+      "loss": 0.7067,
+      "step": 1290
+    },
+    {
+      "epoch": 0.41312,
+      "grad_norm": 0.12483957409858704,
+      "learning_rate": 0.00015298969072164947,
+      "loss": 0.6353,
+      "step": 1291
+    },
+    {
+      "epoch": 0.41344,
+      "grad_norm": 0.13014909625053406,
+      "learning_rate": 0.00015294845360824743,
+      "loss": 0.8446,
+      "step": 1292
+    },
+    {
+      "epoch": 0.41376,
+      "grad_norm": 0.13972054421901703,
+      "learning_rate": 0.00015290721649484536,
+      "loss": 0.7848,
+      "step": 1293
+    },
+    {
+      "epoch": 0.41408,
+      "grad_norm": 0.14988793432712555,
+      "learning_rate": 0.00015286597938144332,
+      "loss": 0.7206,
+      "step": 1294
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.14217117428779602,
+      "learning_rate": 0.00015282474226804125,
+      "loss": 0.6913,
+      "step": 1295
+    },
+    {
+      "epoch": 0.41472,
+      "grad_norm": 0.13317078351974487,
+      "learning_rate": 0.00015278350515463918,
+      "loss": 0.6613,
+      "step": 1296
+    },
+    {
+      "epoch": 0.41504,
+      "grad_norm": 0.12724672257900238,
+      "learning_rate": 0.00015274226804123714,
+      "loss": 0.7247,
+      "step": 1297
+    },
+    {
+      "epoch": 0.41536,
+      "grad_norm": 0.1318334937095642,
+      "learning_rate": 0.00015270103092783504,
+      "loss": 0.7964,
+      "step": 1298
+    },
+    {
+      "epoch": 0.41568,
+      "grad_norm": 0.13650526106357574,
+      "learning_rate": 0.000152659793814433,
+      "loss": 0.6923,
+      "step": 1299
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.1494276076555252,
+      "learning_rate": 0.00015261855670103093,
+      "loss": 0.7033,
+      "step": 1300
+    },
+    {
+      "epoch": 0.41632,
+      "grad_norm": 0.1358669549226761,
+      "learning_rate": 0.00015257731958762886,
+      "loss": 0.6855,
+      "step": 1301
+    },
+    {
+      "epoch": 0.41664,
+      "grad_norm": 0.1421387791633606,
+      "learning_rate": 0.00015253608247422682,
+      "loss": 0.7236,
+      "step": 1302
+    },
+    {
+      "epoch": 0.41696,
+      "grad_norm": 0.14789563417434692,
+      "learning_rate": 0.00015249484536082475,
+      "loss": 0.6563,
+      "step": 1303
+    },
+    {
+      "epoch": 0.41728,
+      "grad_norm": 0.1348695158958435,
+      "learning_rate": 0.0001524536082474227,
+      "loss": 0.8174,
+      "step": 1304
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.1256149411201477,
+      "learning_rate": 0.00015241237113402061,
+      "loss": 0.8209,
+      "step": 1305
+    },
+    {
+      "epoch": 0.41792,
+      "grad_norm": 0.13752906024456024,
+      "learning_rate": 0.00015237113402061855,
+      "loss": 0.6479,
+      "step": 1306
+    },
+    {
+      "epoch": 0.41824,
+      "grad_norm": 0.1335868388414383,
+      "learning_rate": 0.0001523298969072165,
+      "loss": 0.8608,
+      "step": 1307
+    },
+    {
+      "epoch": 0.41856,
+      "grad_norm": 0.1334158182144165,
+      "learning_rate": 0.00015228865979381444,
+      "loss": 0.615,
+      "step": 1308
+    },
+    {
+      "epoch": 0.41888,
+      "grad_norm": 0.1413755565881729,
+      "learning_rate": 0.0001522474226804124,
+      "loss": 0.8455,
+      "step": 1309
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.11322362720966339,
+      "learning_rate": 0.00015220618556701032,
+      "loss": 0.7054,
+      "step": 1310
+    },
+    {
+      "epoch": 0.41952,
+      "grad_norm": 0.12619735300540924,
+      "learning_rate": 0.00015216494845360826,
+      "loss": 0.7141,
+      "step": 1311
+    },
+    {
+      "epoch": 0.41984,
+      "grad_norm": 0.13447314500808716,
+      "learning_rate": 0.0001521237113402062,
+      "loss": 0.6255,
+      "step": 1312
+    },
+    {
+      "epoch": 0.42016,
+      "grad_norm": 0.12858811020851135,
+      "learning_rate": 0.00015208247422680412,
+      "loss": 0.8257,
+      "step": 1313
+    },
+    {
+      "epoch": 0.42048,
+      "grad_norm": 0.14247672259807587,
+      "learning_rate": 0.00015204123711340208,
+      "loss": 0.696,
+      "step": 1314
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.15502195060253143,
+      "learning_rate": 0.000152,
+      "loss": 0.7275,
+      "step": 1315
+    },
+    {
+      "epoch": 0.42112,
+      "grad_norm": 0.11959568411111832,
+      "learning_rate": 0.00015195876288659794,
+      "loss": 0.5196,
+      "step": 1316
+    },
+    {
+      "epoch": 0.42144,
+      "grad_norm": 0.12191260606050491,
+      "learning_rate": 0.0001519175257731959,
+      "loss": 0.6726,
+      "step": 1317
+    },
+    {
+      "epoch": 0.42176,
+      "grad_norm": 0.12593212723731995,
+      "learning_rate": 0.00015187628865979383,
+      "loss": 0.7233,
+      "step": 1318
+    },
+    {
+      "epoch": 0.42208,
+      "grad_norm": 0.13905274868011475,
+      "learning_rate": 0.00015183505154639176,
+      "loss": 0.8305,
+      "step": 1319
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.11840659379959106,
+      "learning_rate": 0.0001517938144329897,
+      "loss": 0.6331,
+      "step": 1320
+    },
+    {
+      "epoch": 0.42272,
+      "grad_norm": 0.14931786060333252,
+      "learning_rate": 0.00015175257731958762,
+      "loss": 0.7547,
+      "step": 1321
+    },
+    {
+      "epoch": 0.42304,
+      "grad_norm": 0.13363072276115417,
+      "learning_rate": 0.00015171134020618558,
+      "loss": 0.6848,
+      "step": 1322
+    },
+    {
+      "epoch": 0.42336,
+      "grad_norm": 0.13666385412216187,
+      "learning_rate": 0.0001516701030927835,
+      "loss": 0.7105,
+      "step": 1323
+    },
+    {
+      "epoch": 0.42368,
+      "grad_norm": 0.14308300614356995,
+      "learning_rate": 0.00015162886597938144,
+      "loss": 0.6972,
+      "step": 1324
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.10861511528491974,
+      "learning_rate": 0.0001515876288659794,
+      "loss": 0.6213,
+      "step": 1325
+    },
+    {
+      "epoch": 0.42432,
+      "grad_norm": 0.13606682419776917,
+      "learning_rate": 0.00015154639175257733,
+      "loss": 0.789,
+      "step": 1326
+    },
+    {
+      "epoch": 0.42464,
+      "grad_norm": 0.13982504606246948,
+      "learning_rate": 0.00015150515463917526,
+      "loss": 0.6831,
+      "step": 1327
+    },
+    {
+      "epoch": 0.42496,
+      "grad_norm": 0.1425737738609314,
+      "learning_rate": 0.0001514639175257732,
+      "loss": 0.7159,
+      "step": 1328
+    },
+    {
+      "epoch": 0.42528,
+      "grad_norm": 0.1338706612586975,
+      "learning_rate": 0.00015142268041237113,
+      "loss": 0.776,
+      "step": 1329
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.13453266024589539,
+      "learning_rate": 0.00015138144329896909,
+      "loss": 0.5924,
+      "step": 1330
+    },
+    {
+      "epoch": 0.42592,
+      "grad_norm": 0.12489192187786102,
+      "learning_rate": 0.00015134020618556702,
+      "loss": 0.7241,
+      "step": 1331
+    },
+    {
+      "epoch": 0.42624,
+      "grad_norm": 0.1499992311000824,
+      "learning_rate": 0.00015129896907216498,
+      "loss": 0.7659,
+      "step": 1332
+    },
+    {
+      "epoch": 0.42656,
+      "grad_norm": 0.13693708181381226,
+      "learning_rate": 0.0001512577319587629,
+      "loss": 0.7011,
+      "step": 1333
+    },
+    {
+      "epoch": 0.42688,
+      "grad_norm": 0.1315765529870987,
+      "learning_rate": 0.0001512164948453608,
+      "loss": 0.76,
+      "step": 1334
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.161947101354599,
+      "learning_rate": 0.00015117525773195877,
+      "loss": 0.8024,
+      "step": 1335
+    },
+    {
+      "epoch": 0.42752,
+      "grad_norm": 0.14304119348526,
+      "learning_rate": 0.0001511340206185567,
+      "loss": 0.7123,
+      "step": 1336
+    },
+    {
+      "epoch": 0.42784,
+      "grad_norm": 0.11994520574808121,
+      "learning_rate": 0.00015109278350515466,
+      "loss": 0.5823,
+      "step": 1337
+    },
+    {
+      "epoch": 0.42816,
+      "grad_norm": 0.13152840733528137,
+      "learning_rate": 0.0001510515463917526,
+      "loss": 0.754,
+      "step": 1338
+    },
+    {
+      "epoch": 0.42848,
+      "grad_norm": 0.1281958967447281,
+      "learning_rate": 0.00015101030927835052,
+      "loss": 0.7402,
+      "step": 1339
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.14896529912948608,
+      "learning_rate": 0.00015096907216494845,
+      "loss": 0.7494,
+      "step": 1340
+    },
+    {
+      "epoch": 0.42912,
+      "grad_norm": 0.1480063498020172,
+      "learning_rate": 0.00015092783505154638,
+      "loss": 0.8042,
+      "step": 1341
+    },
+    {
+      "epoch": 0.42944,
+      "grad_norm": 0.11985750496387482,
+      "learning_rate": 0.00015088659793814434,
+      "loss": 0.7831,
+      "step": 1342
+    },
+    {
+      "epoch": 0.42976,
+      "grad_norm": 0.14073733985424042,
+      "learning_rate": 0.00015084536082474227,
+      "loss": 0.7365,
+      "step": 1343
+    },
+    {
+      "epoch": 0.43008,
+      "grad_norm": 0.15199826657772064,
+      "learning_rate": 0.0001508041237113402,
+      "loss": 0.6514,
+      "step": 1344
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.15114089846611023,
+      "learning_rate": 0.00015076288659793816,
+      "loss": 0.8126,
+      "step": 1345
+    },
+    {
+      "epoch": 0.43072,
+      "grad_norm": 0.13839714229106903,
+      "learning_rate": 0.0001507216494845361,
+      "loss": 0.6972,
+      "step": 1346
+    },
+    {
+      "epoch": 0.43104,
+      "grad_norm": 0.1391737163066864,
+      "learning_rate": 0.00015068041237113403,
+      "loss": 0.666,
+      "step": 1347
+    },
+    {
+      "epoch": 0.43136,
+      "grad_norm": 0.14514316618442535,
+      "learning_rate": 0.00015063917525773196,
+      "loss": 0.6214,
+      "step": 1348
+    },
+    {
+      "epoch": 0.43168,
+      "grad_norm": 0.13457876443862915,
+      "learning_rate": 0.0001505979381443299,
+      "loss": 0.6535,
+      "step": 1349
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.13254746794700623,
+      "learning_rate": 0.00015055670103092785,
+      "loss": 0.7395,
+      "step": 1350
+    },
+    {
+      "epoch": 0.43232,
+      "grad_norm": 0.14473433792591095,
+      "learning_rate": 0.00015051546391752578,
+      "loss": 0.6602,
+      "step": 1351
+    },
+    {
+      "epoch": 0.43264,
+      "grad_norm": 0.13290688395500183,
+      "learning_rate": 0.0001504742268041237,
+      "loss": 0.6509,
+      "step": 1352
+    },
+    {
+      "epoch": 0.43296,
+      "grad_norm": 0.13023175299167633,
+      "learning_rate": 0.00015043298969072167,
+      "loss": 0.7091,
+      "step": 1353
+    },
+    {
+      "epoch": 0.43328,
+      "grad_norm": 0.13794860243797302,
+      "learning_rate": 0.0001503917525773196,
+      "loss": 0.5858,
+      "step": 1354
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.14477092027664185,
+      "learning_rate": 0.00015035051546391753,
+      "loss": 0.6183,
+      "step": 1355
+    },
+    {
+      "epoch": 0.43392,
+      "grad_norm": 0.12636211514472961,
+      "learning_rate": 0.00015030927835051546,
+      "loss": 0.6809,
+      "step": 1356
+    },
+    {
+      "epoch": 0.43424,
+      "grad_norm": 0.11365452408790588,
+      "learning_rate": 0.0001502680412371134,
+      "loss": 0.7062,
+      "step": 1357
+    },
+    {
+      "epoch": 0.43456,
+      "grad_norm": 0.15799300372600555,
+      "learning_rate": 0.00015022680412371135,
+      "loss": 0.6066,
+      "step": 1358
+    },
+    {
+      "epoch": 0.43488,
+      "grad_norm": 0.1377616673707962,
+      "learning_rate": 0.00015018556701030928,
+      "loss": 0.676,
+      "step": 1359
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.11081568151712418,
+      "learning_rate": 0.00015014432989690724,
+      "loss": 0.7228,
+      "step": 1360
+    },
+    {
+      "epoch": 0.43552,
+      "grad_norm": 0.13361386954784393,
+      "learning_rate": 0.00015010309278350517,
+      "loss": 0.8244,
+      "step": 1361
+    },
+    {
+      "epoch": 0.43584,
+      "grad_norm": 0.13209503889083862,
+      "learning_rate": 0.0001500618556701031,
+      "loss": 0.6072,
+      "step": 1362
+    },
+    {
+      "epoch": 0.43616,
+      "grad_norm": 0.13853539526462555,
+      "learning_rate": 0.00015002061855670103,
+      "loss": 0.7314,
+      "step": 1363
+    },
+    {
+      "epoch": 0.43648,
+      "grad_norm": 0.13965541124343872,
+      "learning_rate": 0.00014997938144329897,
+      "loss": 0.6755,
+      "step": 1364
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.13788078725337982,
+      "learning_rate": 0.00014993814432989692,
+      "loss": 0.7235,
+      "step": 1365
+    },
+    {
+      "epoch": 0.43712,
+      "grad_norm": 0.15744584798812866,
+      "learning_rate": 0.00014989690721649486,
+      "loss": 0.719,
+      "step": 1366
+    },
+    {
+      "epoch": 0.43744,
+      "grad_norm": 0.12780450284481049,
+      "learning_rate": 0.0001498556701030928,
+      "loss": 0.6638,
+      "step": 1367
+    },
+    {
+      "epoch": 0.43776,
+      "grad_norm": 0.1370459794998169,
+      "learning_rate": 0.00014981443298969075,
+      "loss": 0.6931,
+      "step": 1368
+    },
+    {
+      "epoch": 0.43808,
+      "grad_norm": 0.1236288920044899,
+      "learning_rate": 0.00014977319587628868,
+      "loss": 0.6773,
+      "step": 1369
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.14718841016292572,
+      "learning_rate": 0.0001497319587628866,
+      "loss": 0.8331,
+      "step": 1370
+    },
+    {
+      "epoch": 0.43872,
+      "grad_norm": 0.11981282383203506,
+      "learning_rate": 0.00014969072164948454,
+      "loss": 0.8594,
+      "step": 1371
+    },
+    {
+      "epoch": 0.43904,
+      "grad_norm": 0.12866224348545074,
+      "learning_rate": 0.00014964948453608247,
+      "loss": 0.6174,
+      "step": 1372
+    },
+    {
+      "epoch": 0.43936,
+      "grad_norm": 0.13502919673919678,
+      "learning_rate": 0.00014960824742268043,
+      "loss": 0.6696,
+      "step": 1373
+    },
+    {
+      "epoch": 0.43968,
+      "grad_norm": 0.1343011111021042,
+      "learning_rate": 0.00014956701030927836,
+      "loss": 0.5349,
+      "step": 1374
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.12182802706956863,
+      "learning_rate": 0.00014952577319587632,
+      "loss": 0.6973,
+      "step": 1375
+    },
+    {
+      "epoch": 0.44032,
+      "grad_norm": 0.13002310693264008,
+      "learning_rate": 0.00014948453608247422,
+      "loss": 0.6637,
+      "step": 1376
+    },
+    {
+      "epoch": 0.44064,
+      "grad_norm": 0.13899263739585876,
+      "learning_rate": 0.00014944329896907215,
+      "loss": 0.721,
+      "step": 1377
+    },
+    {
+      "epoch": 0.44096,
+      "grad_norm": 0.12251676619052887,
+      "learning_rate": 0.0001494020618556701,
+      "loss": 0.6787,
+      "step": 1378
+    },
+    {
+      "epoch": 0.44128,
+      "grad_norm": 0.10930760204792023,
+      "learning_rate": 0.00014936082474226804,
+      "loss": 0.7106,
+      "step": 1379
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.1537499874830246,
+      "learning_rate": 0.00014931958762886597,
+      "loss": 0.7221,
+      "step": 1380
+    },
+    {
+      "epoch": 0.44192,
+      "grad_norm": 0.12851859629154205,
+      "learning_rate": 0.00014927835051546393,
+      "loss": 0.6125,
+      "step": 1381
+    },
+    {
+      "epoch": 0.44224,
+      "grad_norm": 0.13918665051460266,
+      "learning_rate": 0.00014923711340206186,
+      "loss": 0.7332,
+      "step": 1382
+    },
+    {
+      "epoch": 0.44256,
+      "grad_norm": 0.1295069009065628,
+      "learning_rate": 0.0001491958762886598,
+      "loss": 0.6818,
+      "step": 1383
+    },
+    {
+      "epoch": 0.44288,
+      "grad_norm": 0.11744880676269531,
+      "learning_rate": 0.00014915463917525773,
+      "loss": 0.685,
+      "step": 1384
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.12638433277606964,
+      "learning_rate": 0.00014911340206185566,
+      "loss": 0.8496,
+      "step": 1385
+    },
+    {
+      "epoch": 0.44352,
+      "grad_norm": 0.1390245407819748,
+      "learning_rate": 0.00014907216494845362,
+      "loss": 0.7578,
+      "step": 1386
+    },
+    {
+      "epoch": 0.44384,
+      "grad_norm": 0.15041057765483856,
+      "learning_rate": 0.00014903092783505155,
+      "loss": 0.7562,
+      "step": 1387
+    },
+    {
+      "epoch": 0.44416,
+      "grad_norm": 0.15098267793655396,
+      "learning_rate": 0.0001489896907216495,
+      "loss": 0.7524,
+      "step": 1388
+    },
+    {
+      "epoch": 0.44448,
+      "grad_norm": 0.12846030294895172,
+      "learning_rate": 0.00014894845360824744,
+      "loss": 0.6334,
+      "step": 1389
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.12228365242481232,
+      "learning_rate": 0.00014890721649484537,
+      "loss": 0.6569,
+      "step": 1390
+    },
+    {
+      "epoch": 0.44512,
+      "grad_norm": 0.12928150594234467,
+      "learning_rate": 0.0001488659793814433,
+      "loss": 0.7362,
+      "step": 1391
+    },
+    {
+      "epoch": 0.44544,
+      "grad_norm": 0.13998471200466156,
+      "learning_rate": 0.00014882474226804123,
+      "loss": 0.5858,
+      "step": 1392
+    },
+    {
+      "epoch": 0.44576,
+      "grad_norm": 0.1343538761138916,
+      "learning_rate": 0.0001487835051546392,
+      "loss": 0.5379,
+      "step": 1393
+    },
+    {
+      "epoch": 0.44608,
+      "grad_norm": 0.12681473791599274,
+      "learning_rate": 0.00014874226804123712,
+      "loss": 0.6388,
+      "step": 1394
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.14395521581172943,
+      "learning_rate": 0.00014870103092783505,
+      "loss": 0.767,
+      "step": 1395
+    },
+    {
+      "epoch": 0.44672,
+      "grad_norm": 0.12973885238170624,
+      "learning_rate": 0.000148659793814433,
+      "loss": 0.6566,
+      "step": 1396
+    },
+    {
+      "epoch": 0.44704,
+      "grad_norm": 0.1297558695077896,
+      "learning_rate": 0.00014861855670103094,
+      "loss": 0.6941,
+      "step": 1397
+    },
+    {
+      "epoch": 0.44736,
+      "grad_norm": 0.12988127768039703,
+      "learning_rate": 0.00014857731958762887,
+      "loss": 0.6677,
+      "step": 1398
+    },
+    {
+      "epoch": 0.44768,
+      "grad_norm": 0.1514456570148468,
+      "learning_rate": 0.0001485360824742268,
+      "loss": 0.7391,
+      "step": 1399
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1323680728673935,
+      "learning_rate": 0.00014849484536082474,
+      "loss": 0.5817,
+      "step": 1400
+    },
+    {
+      "epoch": 0.44832,
+      "grad_norm": 0.11719689518213272,
+      "learning_rate": 0.0001484536082474227,
+      "loss": 0.5905,
+      "step": 1401
+    },
+    {
+      "epoch": 0.44864,
+      "grad_norm": 0.1345493197441101,
+      "learning_rate": 0.00014841237113402063,
+      "loss": 0.6735,
+      "step": 1402
+    },
+    {
+      "epoch": 0.44896,
+      "grad_norm": 0.1417340785264969,
+      "learning_rate": 0.00014837113402061858,
+      "loss": 0.6581,
+      "step": 1403
+    },
+    {
+      "epoch": 0.44928,
+      "grad_norm": 0.13357757031917572,
+      "learning_rate": 0.00014832989690721652,
+      "loss": 0.6473,
+      "step": 1404
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.14018036425113678,
+      "learning_rate": 0.00014828865979381442,
+      "loss": 0.7394,
+      "step": 1405
+    },
+    {
+      "epoch": 0.44992,
+      "grad_norm": 0.16627104580402374,
+      "learning_rate": 0.00014824742268041238,
+      "loss": 0.8408,
+      "step": 1406
+    },
+    {
+      "epoch": 0.45024,
+      "grad_norm": 0.12548863887786865,
+      "learning_rate": 0.0001482061855670103,
+      "loss": 0.589,
+      "step": 1407
+    },
+    {
+      "epoch": 0.45056,
+      "grad_norm": 0.12361471354961395,
+      "learning_rate": 0.00014816494845360827,
+      "loss": 0.5416,
+      "step": 1408
+    },
+    {
+      "epoch": 0.45088,
+      "grad_norm": 0.15174099802970886,
+      "learning_rate": 0.0001481237113402062,
+      "loss": 0.9295,
+      "step": 1409
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.1422296017408371,
+      "learning_rate": 0.00014808247422680413,
+      "loss": 0.6572,
+      "step": 1410
+    },
+    {
+      "epoch": 0.45152,
+      "grad_norm": 0.13866426050662994,
+      "learning_rate": 0.0001480412371134021,
+      "loss": 0.6791,
+      "step": 1411
+    },
+    {
+      "epoch": 0.45184,
+      "grad_norm": 0.1270676702260971,
+      "learning_rate": 0.000148,
+      "loss": 0.7297,
+      "step": 1412
+    },
+    {
+      "epoch": 0.45216,
+      "grad_norm": 0.16468572616577148,
+      "learning_rate": 0.00014795876288659792,
+      "loss": 0.6981,
+      "step": 1413
+    },
+    {
+      "epoch": 0.45248,
+      "grad_norm": 0.1560823917388916,
+      "learning_rate": 0.00014791752577319588,
+      "loss": 0.7232,
+      "step": 1414
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.13539525866508484,
+      "learning_rate": 0.0001478762886597938,
+      "loss": 0.6393,
+      "step": 1415
+    },
+    {
+      "epoch": 0.45312,
+      "grad_norm": 0.1415485292673111,
+      "learning_rate": 0.00014783505154639177,
+      "loss": 0.7464,
+      "step": 1416
+    },
+    {
+      "epoch": 0.45344,
+      "grad_norm": 0.1506825089454651,
+      "learning_rate": 0.0001477938144329897,
+      "loss": 0.7382,
+      "step": 1417
+    },
+    {
+      "epoch": 0.45376,
+      "grad_norm": 0.11746775358915329,
+      "learning_rate": 0.00014775257731958763,
+      "loss": 0.6446,
+      "step": 1418
+    },
+    {
+      "epoch": 0.45408,
+      "grad_norm": 0.16701194643974304,
+      "learning_rate": 0.00014771134020618557,
+      "loss": 0.717,
+      "step": 1419
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.11998537927865982,
+      "learning_rate": 0.0001476701030927835,
+      "loss": 0.6464,
+      "step": 1420
+    },
+    {
+      "epoch": 0.45472,
+      "grad_norm": 0.14511455595493317,
+      "learning_rate": 0.00014762886597938146,
+      "loss": 0.6387,
+      "step": 1421
+    },
+    {
+      "epoch": 0.45504,
+      "grad_norm": 0.13467402756214142,
+      "learning_rate": 0.0001475876288659794,
+      "loss": 0.6711,
+      "step": 1422
+    },
+    {
+      "epoch": 0.45536,
+      "grad_norm": 0.1327303797006607,
+      "learning_rate": 0.00014754639175257732,
+      "loss": 0.6128,
+      "step": 1423
+    },
+    {
+      "epoch": 0.45568,
+      "grad_norm": 0.15037193894386292,
+      "learning_rate": 0.00014750515463917528,
+      "loss": 0.6816,
+      "step": 1424
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.11883807182312012,
+      "learning_rate": 0.0001474639175257732,
+      "loss": 0.7379,
+      "step": 1425
+    },
+    {
+      "epoch": 0.45632,
+      "grad_norm": 0.15287265181541443,
+      "learning_rate": 0.00014742268041237114,
+      "loss": 0.7345,
+      "step": 1426
+    },
+    {
+      "epoch": 0.45664,
+      "grad_norm": 0.12675881385803223,
+      "learning_rate": 0.00014738144329896907,
+      "loss": 0.6973,
+      "step": 1427
+    },
+    {
+      "epoch": 0.45696,
+      "grad_norm": 0.13408991694450378,
+      "learning_rate": 0.000147340206185567,
+      "loss": 0.6391,
+      "step": 1428
+    },
+    {
+      "epoch": 0.45728,
+      "grad_norm": 0.14130409061908722,
+      "learning_rate": 0.00014729896907216496,
+      "loss": 0.6632,
+      "step": 1429
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.14775030314922333,
+      "learning_rate": 0.0001472577319587629,
+      "loss": 0.6253,
+      "step": 1430
+    },
+    {
+      "epoch": 0.45792,
+      "grad_norm": 0.12623794376850128,
+      "learning_rate": 0.00014721649484536085,
+      "loss": 0.809,
+      "step": 1431
+    },
+    {
+      "epoch": 0.45824,
+      "grad_norm": 0.13884440064430237,
+      "learning_rate": 0.00014717525773195878,
+      "loss": 0.7387,
+      "step": 1432
+    },
+    {
+      "epoch": 0.45856,
+      "grad_norm": 0.1509568840265274,
+      "learning_rate": 0.0001471340206185567,
+      "loss": 0.7517,
+      "step": 1433
+    },
+    {
+      "epoch": 0.45888,
+      "grad_norm": 0.13382567465305328,
+      "learning_rate": 0.00014709278350515464,
+      "loss": 0.6297,
+      "step": 1434
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.136491060256958,
+      "learning_rate": 0.00014705154639175257,
+      "loss": 0.8006,
+      "step": 1435
+    },
+    {
+      "epoch": 0.45952,
+      "grad_norm": 0.13975565135478973,
+      "learning_rate": 0.00014701030927835053,
+      "loss": 0.7377,
+      "step": 1436
+    },
+    {
+      "epoch": 0.45984,
+      "grad_norm": 0.13055750727653503,
+      "learning_rate": 0.00014696907216494846,
+      "loss": 0.7487,
+      "step": 1437
+    },
+    {
+      "epoch": 0.46016,
+      "grad_norm": 0.14289508759975433,
+      "learning_rate": 0.0001469278350515464,
+      "loss": 0.9125,
+      "step": 1438
+    },
+    {
+      "epoch": 0.46048,
+      "grad_norm": 0.12028776854276657,
+      "learning_rate": 0.00014688659793814435,
+      "loss": 0.631,
+      "step": 1439
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.16747339069843292,
+      "learning_rate": 0.00014684536082474228,
+      "loss": 0.7855,
+      "step": 1440
+    },
+    {
+      "epoch": 0.46112,
+      "grad_norm": 0.1353611946105957,
+      "learning_rate": 0.0001468041237113402,
+      "loss": 0.7545,
+      "step": 1441
+    },
+    {
+      "epoch": 0.46144,
+      "grad_norm": 0.1368974894285202,
+      "learning_rate": 0.00014676288659793815,
+      "loss": 0.8353,
+      "step": 1442
+    },
+    {
+      "epoch": 0.46176,
+      "grad_norm": 0.13990259170532227,
+      "learning_rate": 0.00014672164948453608,
+      "loss": 0.6957,
+      "step": 1443
+    },
+    {
+      "epoch": 0.46208,
+      "grad_norm": 0.15001052618026733,
+      "learning_rate": 0.00014668041237113404,
+      "loss": 0.6809,
+      "step": 1444
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.1292935013771057,
+      "learning_rate": 0.00014663917525773197,
+      "loss": 0.6982,
+      "step": 1445
+    },
+    {
+      "epoch": 0.46272,
+      "grad_norm": 0.14710412919521332,
+      "learning_rate": 0.0001465979381443299,
+      "loss": 0.5293,
+      "step": 1446
+    },
+    {
+      "epoch": 0.46304,
+      "grad_norm": 0.1477661430835724,
+      "learning_rate": 0.00014655670103092786,
+      "loss": 0.8996,
+      "step": 1447
+    },
+    {
+      "epoch": 0.46336,
+      "grad_norm": 0.12355426698923111,
+      "learning_rate": 0.00014651546391752576,
+      "loss": 0.6654,
+      "step": 1448
+    },
+    {
+      "epoch": 0.46368,
+      "grad_norm": 0.13544946908950806,
+      "learning_rate": 0.00014647422680412372,
+      "loss": 0.77,
+      "step": 1449
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.13810297846794128,
+      "learning_rate": 0.00014643298969072165,
+      "loss": 0.5933,
+      "step": 1450
+    },
+    {
+      "epoch": 0.46432,
+      "grad_norm": 0.1285339742898941,
+      "learning_rate": 0.00014639175257731958,
+      "loss": 0.6522,
+      "step": 1451
+    },
+    {
+      "epoch": 0.46464,
+      "grad_norm": 0.1257028728723526,
+      "learning_rate": 0.00014635051546391754,
+      "loss": 0.581,
+      "step": 1452
+    },
+    {
+      "epoch": 0.46496,
+      "grad_norm": 0.1406828761100769,
+      "learning_rate": 0.00014630927835051547,
+      "loss": 0.6475,
+      "step": 1453
+    },
+    {
+      "epoch": 0.46528,
+      "grad_norm": 0.14316070079803467,
+      "learning_rate": 0.00014626804123711343,
+      "loss": 0.7419,
+      "step": 1454
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.14277386665344238,
+      "learning_rate": 0.00014622680412371134,
+      "loss": 0.6507,
+      "step": 1455
+    },
+    {
+      "epoch": 0.46592,
+      "grad_norm": 0.144886776804924,
+      "learning_rate": 0.00014618556701030927,
+      "loss": 0.7173,
+      "step": 1456
+    },
+    {
+      "epoch": 0.46624,
+      "grad_norm": 0.1306687295436859,
+      "learning_rate": 0.00014614432989690723,
+      "loss": 0.8279,
+      "step": 1457
+    },
+    {
+      "epoch": 0.46656,
+      "grad_norm": 0.14299385249614716,
+      "learning_rate": 0.00014610309278350516,
+      "loss": 0.6481,
+      "step": 1458
+    },
+    {
+      "epoch": 0.46688,
+      "grad_norm": 0.12328580021858215,
+      "learning_rate": 0.00014606185567010311,
+      "loss": 0.6978,
+      "step": 1459
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.13827897608280182,
+      "learning_rate": 0.00014602061855670105,
+      "loss": 0.7415,
+      "step": 1460
+    },
+    {
+      "epoch": 0.46752,
+      "grad_norm": 0.1236303448677063,
+      "learning_rate": 0.00014597938144329898,
+      "loss": 0.628,
+      "step": 1461
+    },
+    {
+      "epoch": 0.46784,
+      "grad_norm": 0.14038272202014923,
+      "learning_rate": 0.0001459381443298969,
+      "loss": 0.7499,
+      "step": 1462
+    },
+    {
+      "epoch": 0.46816,
+      "grad_norm": 0.1291581094264984,
+      "learning_rate": 0.00014589690721649484,
+      "loss": 0.7137,
+      "step": 1463
+    },
+    {
+      "epoch": 0.46848,
+      "grad_norm": 0.1379321664571762,
+      "learning_rate": 0.0001458556701030928,
+      "loss": 0.7787,
+      "step": 1464
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.13971829414367676,
+      "learning_rate": 0.00014581443298969073,
+      "loss": 0.7112,
+      "step": 1465
+    },
+    {
+      "epoch": 0.46912,
+      "grad_norm": 0.1500317007303238,
+      "learning_rate": 0.00014577319587628866,
+      "loss": 0.6516,
+      "step": 1466
+    },
+    {
+      "epoch": 0.46944,
+      "grad_norm": 0.1315118670463562,
+      "learning_rate": 0.00014573195876288662,
+      "loss": 0.793,
+      "step": 1467
+    },
+    {
+      "epoch": 0.46976,
+      "grad_norm": 0.14588972926139832,
+      "learning_rate": 0.00014569072164948455,
+      "loss": 0.7037,
+      "step": 1468
+    },
+    {
+      "epoch": 0.47008,
+      "grad_norm": 0.14539368450641632,
+      "learning_rate": 0.00014564948453608248,
+      "loss": 0.8842,
+      "step": 1469
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.12320923805236816,
+      "learning_rate": 0.0001456082474226804,
+      "loss": 0.5365,
+      "step": 1470
+    },
+    {
+      "epoch": 0.47072,
+      "grad_norm": 0.1435680389404297,
+      "learning_rate": 0.00014556701030927834,
+      "loss": 0.6666,
+      "step": 1471
+    },
+    {
+      "epoch": 0.47104,
+      "grad_norm": 0.12959571182727814,
+      "learning_rate": 0.0001455257731958763,
+      "loss": 0.7514,
+      "step": 1472
+    },
+    {
+      "epoch": 0.47136,
+      "grad_norm": 0.14037887752056122,
+      "learning_rate": 0.00014548453608247423,
+      "loss": 0.6174,
+      "step": 1473
+    },
+    {
+      "epoch": 0.47168,
+      "grad_norm": 0.13851316273212433,
+      "learning_rate": 0.00014544329896907217,
+      "loss": 0.6604,
+      "step": 1474
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.13352732360363007,
+      "learning_rate": 0.00014540206185567012,
+      "loss": 0.6804,
+      "step": 1475
+    },
+    {
+      "epoch": 0.47232,
+      "grad_norm": 0.13814695179462433,
+      "learning_rate": 0.00014536082474226805,
+      "loss": 0.7677,
+      "step": 1476
+    },
+    {
+      "epoch": 0.47264,
+      "grad_norm": 0.13144215941429138,
+      "learning_rate": 0.00014531958762886599,
+      "loss": 0.715,
+      "step": 1477
+    },
+    {
+      "epoch": 0.47296,
+      "grad_norm": 0.1366271823644638,
+      "learning_rate": 0.00014527835051546392,
+      "loss": 0.7411,
+      "step": 1478
+    },
+    {
+      "epoch": 0.47328,
+      "grad_norm": 0.1347101926803589,
+      "learning_rate": 0.00014523711340206185,
+      "loss": 0.7294,
+      "step": 1479
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.13796299695968628,
+      "learning_rate": 0.0001451958762886598,
+      "loss": 0.8298,
+      "step": 1480
+    },
+    {
+      "epoch": 0.47392,
+      "grad_norm": 0.1773163378238678,
+      "learning_rate": 0.00014515463917525774,
+      "loss": 0.8373,
+      "step": 1481
+    },
+    {
+      "epoch": 0.47424,
+      "grad_norm": 0.12339209020137787,
+      "learning_rate": 0.0001451134020618557,
+      "loss": 0.7864,
+      "step": 1482
+    },
+    {
+      "epoch": 0.47456,
+      "grad_norm": 0.1335781365633011,
+      "learning_rate": 0.00014507216494845363,
+      "loss": 0.7072,
+      "step": 1483
+    },
+    {
+      "epoch": 0.47488,
+      "grad_norm": 0.12188687175512314,
+      "learning_rate": 0.00014503092783505153,
+      "loss": 0.6166,
+      "step": 1484
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.14522816240787506,
+      "learning_rate": 0.0001449896907216495,
+      "loss": 0.7699,
+      "step": 1485
+    },
+    {
+      "epoch": 0.47552,
+      "grad_norm": 0.15175694227218628,
+      "learning_rate": 0.00014494845360824742,
+      "loss": 0.8623,
+      "step": 1486
+    },
+    {
+      "epoch": 0.47584,
+      "grad_norm": 0.11388963460922241,
+      "learning_rate": 0.00014490721649484538,
+      "loss": 0.583,
+      "step": 1487
+    },
+    {
+      "epoch": 0.47616,
+      "grad_norm": 0.13264136016368866,
+      "learning_rate": 0.0001448659793814433,
+      "loss": 0.7065,
+      "step": 1488
+    },
+    {
+      "epoch": 0.47648,
+      "grad_norm": 0.15064145624637604,
+      "learning_rate": 0.00014482474226804124,
+      "loss": 0.7306,
+      "step": 1489
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.11270318925380707,
+      "learning_rate": 0.00014478350515463917,
+      "loss": 0.6129,
+      "step": 1490
+    },
+    {
+      "epoch": 0.47712,
+      "grad_norm": 0.13935883343219757,
+      "learning_rate": 0.0001447422680412371,
+      "loss": 0.603,
+      "step": 1491
+    },
+    {
+      "epoch": 0.47744,
+      "grad_norm": 0.12119437009096146,
+      "learning_rate": 0.00014470103092783506,
+      "loss": 0.6056,
+      "step": 1492
+    },
+    {
+      "epoch": 0.47776,
+      "grad_norm": 0.13266988098621368,
+      "learning_rate": 0.000144659793814433,
+      "loss": 0.6984,
+      "step": 1493
+    },
+    {
+      "epoch": 0.47808,
+      "grad_norm": 0.13279670476913452,
+      "learning_rate": 0.00014461855670103093,
+      "loss": 0.7021,
+      "step": 1494
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.12158623337745667,
+      "learning_rate": 0.00014457731958762888,
+      "loss": 0.617,
+      "step": 1495
+    },
+    {
+      "epoch": 0.47872,
+      "grad_norm": 0.14636047184467316,
+      "learning_rate": 0.00014453608247422682,
+      "loss": 0.649,
+      "step": 1496
+    },
+    {
+      "epoch": 0.47904,
+      "grad_norm": 0.13223512470722198,
+      "learning_rate": 0.00014449484536082475,
+      "loss": 0.6275,
+      "step": 1497
+    },
+    {
+      "epoch": 0.47936,
+      "grad_norm": 0.14344871044158936,
+      "learning_rate": 0.00014445360824742268,
+      "loss": 0.6604,
+      "step": 1498
+    },
+    {
+      "epoch": 0.47968,
+      "grad_norm": 0.13986854255199432,
+      "learning_rate": 0.0001444123711340206,
+      "loss": 0.8142,
+      "step": 1499
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.12066833674907684,
+      "learning_rate": 0.00014437113402061857,
+      "loss": 0.6837,
+      "step": 1500
+    },
+    {
+      "epoch": 0.48032,
+      "grad_norm": 0.137162446975708,
+      "learning_rate": 0.0001443298969072165,
+      "loss": 0.5547,
+      "step": 1501
+    },
+    {
+      "epoch": 0.48064,
+      "grad_norm": 0.1468525379896164,
+      "learning_rate": 0.00014428865979381443,
+      "loss": 0.7303,
+      "step": 1502
+    },
+    {
+      "epoch": 0.48096,
+      "grad_norm": 0.1561747044324875,
+      "learning_rate": 0.0001442474226804124,
+      "loss": 0.8114,
+      "step": 1503
+    },
+    {
+      "epoch": 0.48128,
+      "grad_norm": 0.1542164832353592,
+      "learning_rate": 0.00014420618556701032,
+      "loss": 0.7206,
+      "step": 1504
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.14567720890045166,
+      "learning_rate": 0.00014416494845360825,
+      "loss": 0.7906,
+      "step": 1505
+    },
+    {
+      "epoch": 0.48192,
+      "grad_norm": 0.14530521631240845,
+      "learning_rate": 0.00014412371134020618,
+      "loss": 0.7286,
+      "step": 1506
+    },
+    {
+      "epoch": 0.48224,
+      "grad_norm": 0.13600529730319977,
+      "learning_rate": 0.00014408247422680411,
+      "loss": 0.739,
+      "step": 1507
+    },
+    {
+      "epoch": 0.48256,
+      "grad_norm": 0.13068170845508575,
+      "learning_rate": 0.00014404123711340207,
+      "loss": 0.7286,
+      "step": 1508
+    },
+    {
+      "epoch": 0.48288,
+      "grad_norm": 0.13706938922405243,
+      "learning_rate": 0.000144,
+      "loss": 0.6003,
+      "step": 1509
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.16714246571063995,
+      "learning_rate": 0.00014395876288659796,
+      "loss": 0.7568,
+      "step": 1510
+    },
+    {
+      "epoch": 0.48352,
+      "grad_norm": 0.15205715596675873,
+      "learning_rate": 0.0001439175257731959,
+      "loss": 0.714,
+      "step": 1511
+    },
+    {
+      "epoch": 0.48384,
+      "grad_norm": 0.15466946363449097,
+      "learning_rate": 0.00014387628865979382,
+      "loss": 0.7392,
+      "step": 1512
+    },
+    {
+      "epoch": 0.48416,
+      "grad_norm": 0.14179885387420654,
+      "learning_rate": 0.00014383505154639176,
+      "loss": 0.7935,
+      "step": 1513
+    },
+    {
+      "epoch": 0.48448,
+      "grad_norm": 0.13034437596797943,
+      "learning_rate": 0.0001437938144329897,
+      "loss": 0.6451,
+      "step": 1514
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.1632513552904129,
+      "learning_rate": 0.00014375257731958765,
+      "loss": 0.7264,
+      "step": 1515
+    },
+    {
+      "epoch": 0.48512,
+      "grad_norm": 0.15103459358215332,
+      "learning_rate": 0.00014371134020618558,
+      "loss": 0.6607,
+      "step": 1516
+    },
+    {
+      "epoch": 0.48544,
+      "grad_norm": 0.13666647672653198,
+      "learning_rate": 0.0001436701030927835,
+      "loss": 0.6192,
+      "step": 1517
+    },
+    {
+      "epoch": 0.48576,
+      "grad_norm": 0.14958883821964264,
+      "learning_rate": 0.00014362886597938147,
+      "loss": 0.5868,
+      "step": 1518
+    },
+    {
+      "epoch": 0.48608,
+      "grad_norm": 0.13229940831661224,
+      "learning_rate": 0.0001435876288659794,
+      "loss": 0.5931,
+      "step": 1519
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.14606302976608276,
+      "learning_rate": 0.00014354639175257733,
+      "loss": 0.6243,
+      "step": 1520
+    },
+    {
+      "epoch": 0.48672,
+      "grad_norm": 0.14299476146697998,
+      "learning_rate": 0.00014350515463917526,
+      "loss": 0.6024,
+      "step": 1521
+    },
+    {
+      "epoch": 0.48704,
+      "grad_norm": 0.1333245038986206,
+      "learning_rate": 0.0001434639175257732,
+      "loss": 0.8185,
+      "step": 1522
+    },
+    {
+      "epoch": 0.48736,
+      "grad_norm": 0.15337637066841125,
+      "learning_rate": 0.00014342268041237115,
+      "loss": 0.7985,
+      "step": 1523
+    },
+    {
+      "epoch": 0.48768,
+      "grad_norm": 0.1316799521446228,
+      "learning_rate": 0.00014338144329896908,
+      "loss": 0.8541,
+      "step": 1524
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.15217626094818115,
+      "learning_rate": 0.00014334020618556704,
+      "loss": 0.7402,
+      "step": 1525
+    },
+    {
+      "epoch": 0.48832,
+      "grad_norm": 0.12334585934877396,
+      "learning_rate": 0.00014329896907216494,
+      "loss": 0.725,
+      "step": 1526
+    },
+    {
+      "epoch": 0.48864,
+      "grad_norm": 0.1464010328054428,
+      "learning_rate": 0.00014325773195876287,
+      "loss": 0.5833,
+      "step": 1527
+    },
+    {
+      "epoch": 0.48896,
+      "grad_norm": 0.12967054545879364,
+      "learning_rate": 0.00014321649484536083,
+      "loss": 0.6763,
+      "step": 1528
+    },
+    {
+      "epoch": 0.48928,
+      "grad_norm": 0.12463313341140747,
+      "learning_rate": 0.00014317525773195876,
+      "loss": 0.5501,
+      "step": 1529
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.13481323421001434,
+      "learning_rate": 0.00014313402061855672,
+      "loss": 0.807,
+      "step": 1530
+    },
+    {
+      "epoch": 0.48992,
+      "grad_norm": 0.1528305560350418,
+      "learning_rate": 0.00014309278350515465,
+      "loss": 0.6632,
+      "step": 1531
+    },
+    {
+      "epoch": 0.49024,
+      "grad_norm": 0.1464243084192276,
+      "learning_rate": 0.00014305154639175259,
+      "loss": 0.7756,
+      "step": 1532
+    },
+    {
+      "epoch": 0.49056,
+      "grad_norm": 0.1248038038611412,
+      "learning_rate": 0.00014301030927835052,
+      "loss": 0.6521,
+      "step": 1533
+    },
+    {
+      "epoch": 0.49088,
+      "grad_norm": 0.14767804741859436,
+      "learning_rate": 0.00014296907216494845,
+      "loss": 0.7785,
+      "step": 1534
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.13207073509693146,
+      "learning_rate": 0.00014292783505154638,
+      "loss": 0.8221,
+      "step": 1535
+    },
+    {
+      "epoch": 0.49152,
+      "grad_norm": 0.14973971247673035,
+      "learning_rate": 0.00014288659793814434,
+      "loss": 0.6877,
+      "step": 1536
+    },
+    {
+      "epoch": 0.49184,
+      "grad_norm": 0.13894066214561462,
+      "learning_rate": 0.00014284536082474227,
+      "loss": 0.7506,
+      "step": 1537
+    },
+    {
+      "epoch": 0.49216,
+      "grad_norm": 0.12908771634101868,
+      "learning_rate": 0.00014280412371134023,
+      "loss": 0.7926,
+      "step": 1538
+    },
+    {
+      "epoch": 0.49248,
+      "grad_norm": 0.1435110867023468,
+      "learning_rate": 0.00014276288659793816,
+      "loss": 0.6879,
+      "step": 1539
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.1450876146554947,
+      "learning_rate": 0.0001427216494845361,
+      "loss": 0.6263,
+      "step": 1540
+    },
+    {
+      "epoch": 0.49312,
+      "grad_norm": 0.1568361520767212,
+      "learning_rate": 0.00014268041237113402,
+      "loss": 0.7474,
+      "step": 1541
+    },
+    {
+      "epoch": 0.49344,
+      "grad_norm": 0.11616674065589905,
+      "learning_rate": 0.00014263917525773195,
+      "loss": 0.5327,
+      "step": 1542
+    },
+    {
+      "epoch": 0.49376,
+      "grad_norm": 0.15635275840759277,
+      "learning_rate": 0.0001425979381443299,
+      "loss": 0.7125,
+      "step": 1543
+    },
+    {
+      "epoch": 0.49408,
+      "grad_norm": 0.12766999006271362,
+      "learning_rate": 0.00014255670103092784,
+      "loss": 0.697,
+      "step": 1544
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.12405005097389221,
+      "learning_rate": 0.00014251546391752577,
+      "loss": 0.6365,
+      "step": 1545
+    },
+    {
+      "epoch": 0.49472,
+      "grad_norm": 0.1484600007534027,
+      "learning_rate": 0.00014247422680412373,
+      "loss": 0.8282,
+      "step": 1546
+    },
+    {
+      "epoch": 0.49504,
+      "grad_norm": 0.16164913773536682,
+      "learning_rate": 0.00014243298969072166,
+      "loss": 0.6262,
+      "step": 1547
+    },
+    {
+      "epoch": 0.49536,
+      "grad_norm": 0.1343853771686554,
+      "learning_rate": 0.0001423917525773196,
+      "loss": 0.8293,
+      "step": 1548
+    },
+    {
+      "epoch": 0.49568,
+      "grad_norm": 0.15468348562717438,
+      "learning_rate": 0.00014235051546391753,
+      "loss": 0.6369,
+      "step": 1549
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.1245027631521225,
+      "learning_rate": 0.00014230927835051546,
+      "loss": 0.6722,
+      "step": 1550
+    },
+    {
+      "epoch": 0.49632,
+      "grad_norm": 0.1333836168050766,
+      "learning_rate": 0.00014226804123711342,
+      "loss": 0.6849,
+      "step": 1551
+    },
+    {
+      "epoch": 0.49664,
+      "grad_norm": 0.14538751542568207,
+      "learning_rate": 0.00014222680412371135,
+      "loss": 0.6602,
+      "step": 1552
+    },
+    {
+      "epoch": 0.49696,
+      "grad_norm": 0.13172303140163422,
+      "learning_rate": 0.0001421855670103093,
+      "loss": 0.6736,
+      "step": 1553
+    },
+    {
+      "epoch": 0.49728,
+      "grad_norm": 0.12844419479370117,
+      "learning_rate": 0.00014214432989690724,
+      "loss": 0.5586,
+      "step": 1554
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.14242854714393616,
+      "learning_rate": 0.00014210309278350517,
+      "loss": 0.7817,
+      "step": 1555
+    },
+    {
+      "epoch": 0.49792,
+      "grad_norm": 0.13983237743377686,
+      "learning_rate": 0.0001420618556701031,
+      "loss": 0.7779,
+      "step": 1556
+    },
+    {
+      "epoch": 0.49824,
+      "grad_norm": 0.12832964956760406,
+      "learning_rate": 0.00014202061855670103,
+      "loss": 0.8568,
+      "step": 1557
+    },
+    {
+      "epoch": 0.49856,
+      "grad_norm": 0.139461487531662,
+      "learning_rate": 0.000141979381443299,
+      "loss": 0.639,
+      "step": 1558
+    },
+    {
+      "epoch": 0.49888,
+      "grad_norm": 0.15198762714862823,
+      "learning_rate": 0.00014193814432989692,
+      "loss": 0.7881,
+      "step": 1559
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.13192367553710938,
+      "learning_rate": 0.00014189690721649485,
+      "loss": 0.7608,
+      "step": 1560
+    },
+    {
+      "epoch": 0.49952,
+      "grad_norm": 0.15548580884933472,
+      "learning_rate": 0.0001418556701030928,
+      "loss": 0.6954,
+      "step": 1561
+    },
+    {
+      "epoch": 0.49984,
+      "grad_norm": 0.1270473599433899,
+      "learning_rate": 0.0001418144329896907,
+      "loss": 0.5897,
+      "step": 1562
+    },
+    {
+      "epoch": 0.50016,
+      "grad_norm": 0.13659198582172394,
+      "learning_rate": 0.00014177319587628864,
+      "loss": 0.6254,
+      "step": 1563
+    },
+    {
+      "epoch": 0.50048,
+      "grad_norm": 0.1459890455007553,
+      "learning_rate": 0.0001417319587628866,
+      "loss": 0.6082,
+      "step": 1564
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.14214038848876953,
+      "learning_rate": 0.00014169072164948453,
+      "loss": 0.6159,
+      "step": 1565
+    },
+    {
+      "epoch": 0.50112,
+      "grad_norm": 0.15616926550865173,
+      "learning_rate": 0.0001416494845360825,
+      "loss": 0.6342,
+      "step": 1566
+    },
+    {
+      "epoch": 0.50144,
+      "grad_norm": 0.1373022496700287,
+      "learning_rate": 0.00014160824742268042,
+      "loss": 0.7758,
+      "step": 1567
+    },
+    {
+      "epoch": 0.50176,
+      "grad_norm": 0.15784426033496857,
+      "learning_rate": 0.00014156701030927836,
+      "loss": 0.9453,
+      "step": 1568
+    },
+    {
+      "epoch": 0.50208,
+      "grad_norm": 0.12950581312179565,
+      "learning_rate": 0.0001415257731958763,
+      "loss": 0.5893,
+      "step": 1569
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.14546595513820648,
+      "learning_rate": 0.00014148453608247422,
+      "loss": 0.7556,
+      "step": 1570
+    },
+    {
+      "epoch": 0.50272,
+      "grad_norm": 0.12755361199378967,
+      "learning_rate": 0.00014144329896907218,
+      "loss": 0.6448,
+      "step": 1571
+    },
+    {
+      "epoch": 0.50304,
+      "grad_norm": 0.14262261986732483,
+      "learning_rate": 0.0001414020618556701,
+      "loss": 0.6608,
+      "step": 1572
+    },
+    {
+      "epoch": 0.50336,
+      "grad_norm": 0.13705115020275116,
+      "learning_rate": 0.00014136082474226804,
+      "loss": 0.7744,
+      "step": 1573
+    },
+    {
+      "epoch": 0.50368,
+      "grad_norm": 0.14189304411411285,
+      "learning_rate": 0.000141319587628866,
+      "loss": 0.7894,
+      "step": 1574
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.15143121778964996,
+      "learning_rate": 0.00014127835051546393,
+      "loss": 0.7071,
+      "step": 1575
+    },
+    {
+      "epoch": 0.50432,
+      "grad_norm": 0.13547833263874054,
+      "learning_rate": 0.00014123711340206186,
+      "loss": 0.7542,
+      "step": 1576
+    },
+    {
+      "epoch": 0.50464,
+      "grad_norm": 0.144126757979393,
+      "learning_rate": 0.0001411958762886598,
+      "loss": 0.6932,
+      "step": 1577
+    },
+    {
+      "epoch": 0.50496,
+      "grad_norm": 0.13636307418346405,
+      "learning_rate": 0.00014115463917525772,
+      "loss": 0.7421,
+      "step": 1578
+    },
+    {
+      "epoch": 0.50528,
+      "grad_norm": 0.14498503506183624,
+      "learning_rate": 0.00014111340206185568,
+      "loss": 0.7263,
+      "step": 1579
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.14120568335056305,
+      "learning_rate": 0.0001410721649484536,
+      "loss": 0.7044,
+      "step": 1580
+    },
+    {
+      "epoch": 0.50592,
+      "grad_norm": 0.13528260588645935,
+      "learning_rate": 0.00014103092783505157,
+      "loss": 0.7811,
+      "step": 1581
+    },
+    {
+      "epoch": 0.50624,
+      "grad_norm": 0.13734883069992065,
+      "learning_rate": 0.0001409896907216495,
+      "loss": 0.6135,
+      "step": 1582
+    },
+    {
+      "epoch": 0.50656,
+      "grad_norm": 0.1357855200767517,
+      "learning_rate": 0.00014094845360824743,
+      "loss": 0.5976,
+      "step": 1583
+    },
+    {
+      "epoch": 0.50688,
+      "grad_norm": 0.140086829662323,
+      "learning_rate": 0.00014090721649484536,
+      "loss": 0.6395,
+      "step": 1584
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.13380053639411926,
+      "learning_rate": 0.0001408659793814433,
+      "loss": 0.8164,
+      "step": 1585
+    },
+    {
+      "epoch": 0.50752,
+      "grad_norm": 0.13138113915920258,
+      "learning_rate": 0.00014082474226804125,
+      "loss": 0.708,
+      "step": 1586
+    },
+    {
+      "epoch": 0.50784,
+      "grad_norm": 0.14903022348880768,
+      "learning_rate": 0.00014078350515463919,
+      "loss": 0.7525,
+      "step": 1587
+    },
+    {
+      "epoch": 0.50816,
+      "grad_norm": 0.13910505175590515,
+      "learning_rate": 0.00014074226804123712,
+      "loss": 0.6733,
+      "step": 1588
+    },
+    {
+      "epoch": 0.50848,
+      "grad_norm": 0.14415191113948822,
+      "learning_rate": 0.00014070103092783507,
+      "loss": 0.6756,
+      "step": 1589
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.1318030208349228,
+      "learning_rate": 0.000140659793814433,
+      "loss": 0.6555,
+      "step": 1590
+    },
+    {
+      "epoch": 0.50912,
+      "grad_norm": 0.12761345505714417,
+      "learning_rate": 0.0001406185567010309,
+      "loss": 0.8246,
+      "step": 1591
+    },
+    {
+      "epoch": 0.50944,
+      "grad_norm": 0.1260489523410797,
+      "learning_rate": 0.00014057731958762887,
+      "loss": 0.5207,
+      "step": 1592
+    },
+    {
+      "epoch": 0.50976,
+      "grad_norm": 0.13099458813667297,
+      "learning_rate": 0.0001405360824742268,
+      "loss": 0.6913,
+      "step": 1593
+    },
+    {
+      "epoch": 0.51008,
+      "grad_norm": 0.12148265540599823,
+      "learning_rate": 0.00014049484536082476,
+      "loss": 0.661,
+      "step": 1594
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.1384635716676712,
+      "learning_rate": 0.0001404536082474227,
+      "loss": 0.7435,
+      "step": 1595
+    },
+    {
+      "epoch": 0.51072,
+      "grad_norm": 0.1455785036087036,
+      "learning_rate": 0.00014041237113402062,
+      "loss": 0.7483,
+      "step": 1596
+    },
+    {
+      "epoch": 0.51104,
+      "grad_norm": 0.14684465527534485,
+      "learning_rate": 0.00014037113402061858,
+      "loss": 0.6953,
+      "step": 1597
+    },
+    {
+      "epoch": 0.51136,
+      "grad_norm": 0.1461796909570694,
+      "learning_rate": 0.00014032989690721648,
+      "loss": 0.7893,
+      "step": 1598
+    },
+    {
+      "epoch": 0.51168,
+      "grad_norm": 0.14683541655540466,
+      "learning_rate": 0.00014028865979381444,
+      "loss": 0.7007,
+      "step": 1599
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.13216625154018402,
+      "learning_rate": 0.00014024742268041237,
+      "loss": 0.6182,
+      "step": 1600
+    },
+    {
+      "epoch": 0.51232,
+      "grad_norm": 0.12355850636959076,
+      "learning_rate": 0.0001402061855670103,
+      "loss": 0.759,
+      "step": 1601
+    },
+    {
+      "epoch": 0.51264,
+      "grad_norm": 0.12836319208145142,
+      "learning_rate": 0.00014016494845360826,
+      "loss": 0.5823,
+      "step": 1602
+    },
+    {
+      "epoch": 0.51296,
+      "grad_norm": 0.15208405256271362,
+      "learning_rate": 0.0001401237113402062,
+      "loss": 0.7535,
+      "step": 1603
+    },
+    {
+      "epoch": 0.51328,
+      "grad_norm": 0.1435183584690094,
+      "learning_rate": 0.00014008247422680415,
+      "loss": 0.5679,
+      "step": 1604
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.15758828818798065,
+      "learning_rate": 0.00014004123711340206,
+      "loss": 0.5941,
+      "step": 1605
+    },
+    {
+      "epoch": 0.51392,
+      "grad_norm": 0.14006471633911133,
+      "learning_rate": 0.00014,
+      "loss": 0.6687,
+      "step": 1606
+    },
+    {
+      "epoch": 0.51424,
+      "grad_norm": 0.14834052324295044,
+      "learning_rate": 0.00013995876288659795,
+      "loss": 0.6459,
+      "step": 1607
+    },
+    {
+      "epoch": 0.51456,
+      "grad_norm": 0.13367867469787598,
+      "learning_rate": 0.00013991752577319588,
+      "loss": 0.6245,
+      "step": 1608
+    },
+    {
+      "epoch": 0.51488,
+      "grad_norm": 0.15048961341381073,
+      "learning_rate": 0.00013987628865979384,
+      "loss": 0.7576,
+      "step": 1609
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.13006894290447235,
+      "learning_rate": 0.00013983505154639177,
+      "loss": 0.835,
+      "step": 1610
+    },
+    {
+      "epoch": 0.51552,
+      "grad_norm": 0.13258709013462067,
+      "learning_rate": 0.0001397938144329897,
+      "loss": 0.5882,
+      "step": 1611
+    },
+    {
+      "epoch": 0.51584,
+      "grad_norm": 0.15616407990455627,
+      "learning_rate": 0.00013975257731958763,
+      "loss": 0.6671,
+      "step": 1612
+    },
+    {
+      "epoch": 0.51616,
+      "grad_norm": 0.11868329346179962,
+      "learning_rate": 0.00013971134020618556,
+      "loss": 0.7288,
+      "step": 1613
+    },
+    {
+      "epoch": 0.51648,
+      "grad_norm": 0.13981391489505768,
+      "learning_rate": 0.00013967010309278352,
+      "loss": 0.8886,
+      "step": 1614
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.14195965230464935,
+      "learning_rate": 0.00013962886597938145,
+      "loss": 0.7257,
+      "step": 1615
+    },
+    {
+      "epoch": 0.51712,
+      "grad_norm": 0.11883968859910965,
+      "learning_rate": 0.00013958762886597938,
+      "loss": 0.7532,
+      "step": 1616
+    },
+    {
+      "epoch": 0.51744,
+      "grad_norm": 0.12168893963098526,
+      "learning_rate": 0.00013954639175257734,
+      "loss": 0.6678,
+      "step": 1617
+    },
+    {
+      "epoch": 0.51776,
+      "grad_norm": 0.11379075050354004,
+      "learning_rate": 0.00013950515463917527,
+      "loss": 0.5969,
+      "step": 1618
+    },
+    {
+      "epoch": 0.51808,
+      "grad_norm": 0.13150529563426971,
+      "learning_rate": 0.0001394639175257732,
+      "loss": 0.7442,
+      "step": 1619
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.13873353600502014,
+      "learning_rate": 0.00013942268041237113,
+      "loss": 0.6682,
+      "step": 1620
+    },
+    {
+      "epoch": 0.51872,
+      "grad_norm": 0.12265759706497192,
+      "learning_rate": 0.00013938144329896907,
+      "loss": 0.8013,
+      "step": 1621
+    },
+    {
+      "epoch": 0.51904,
+      "grad_norm": 0.14572475850582123,
+      "learning_rate": 0.00013934020618556702,
+      "loss": 0.5613,
+      "step": 1622
+    },
+    {
+      "epoch": 0.51936,
+      "grad_norm": 0.15529032051563263,
+      "learning_rate": 0.00013929896907216495,
+      "loss": 0.7016,
+      "step": 1623
+    },
+    {
+      "epoch": 0.51968,
+      "grad_norm": 0.1380968689918518,
+      "learning_rate": 0.00013925773195876289,
+      "loss": 0.7537,
+      "step": 1624
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.13644950091838837,
+      "learning_rate": 0.00013921649484536084,
+      "loss": 0.6711,
+      "step": 1625
+    },
+    {
+      "epoch": 0.52032,
+      "grad_norm": 0.1295856237411499,
+      "learning_rate": 0.00013917525773195878,
+      "loss": 0.6545,
+      "step": 1626
+    },
+    {
+      "epoch": 0.52064,
+      "grad_norm": 0.14123371243476868,
+      "learning_rate": 0.0001391340206185567,
+      "loss": 0.6589,
+      "step": 1627
+    },
+    {
+      "epoch": 0.52096,
+      "grad_norm": 0.14248934388160706,
+      "learning_rate": 0.00013909278350515464,
+      "loss": 0.7579,
+      "step": 1628
+    },
+    {
+      "epoch": 0.52128,
+      "grad_norm": 0.1358531415462494,
+      "learning_rate": 0.00013905154639175257,
+      "loss": 0.6164,
+      "step": 1629
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.12175789475440979,
+      "learning_rate": 0.00013901030927835053,
+      "loss": 0.6545,
+      "step": 1630
+    },
+    {
+      "epoch": 0.52192,
+      "grad_norm": 0.15957528352737427,
+      "learning_rate": 0.00013896907216494846,
+      "loss": 0.6837,
+      "step": 1631
+    },
+    {
+      "epoch": 0.52224,
+      "grad_norm": 0.13381953537464142,
+      "learning_rate": 0.00013892783505154642,
+      "loss": 0.6428,
+      "step": 1632
+    },
+    {
+      "epoch": 0.52256,
+      "grad_norm": 0.16512586176395416,
+      "learning_rate": 0.00013888659793814435,
+      "loss": 0.7207,
+      "step": 1633
+    },
+    {
+      "epoch": 0.52288,
+      "grad_norm": 0.11967901140451431,
+      "learning_rate": 0.00013884536082474225,
+      "loss": 0.6968,
+      "step": 1634
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.14090409874916077,
+      "learning_rate": 0.0001388041237113402,
+      "loss": 0.6865,
+      "step": 1635
+    },
+    {
+      "epoch": 0.52352,
+      "grad_norm": 0.13441434502601624,
+      "learning_rate": 0.00013876288659793814,
+      "loss": 0.8455,
+      "step": 1636
+    },
+    {
+      "epoch": 0.52384,
+      "grad_norm": 0.13941827416419983,
+      "learning_rate": 0.0001387216494845361,
+      "loss": 0.6088,
+      "step": 1637
+    },
+    {
+      "epoch": 0.52416,
+      "grad_norm": 0.13021568953990936,
+      "learning_rate": 0.00013868041237113403,
+      "loss": 0.6578,
+      "step": 1638
+    },
+    {
+      "epoch": 0.52448,
+      "grad_norm": 0.165999636054039,
+      "learning_rate": 0.00013863917525773196,
+      "loss": 0.8015,
+      "step": 1639
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.14085112512111664,
+      "learning_rate": 0.0001385979381443299,
+      "loss": 0.703,
+      "step": 1640
+    },
+    {
+      "epoch": 0.52512,
+      "grad_norm": 0.1498437523841858,
+      "learning_rate": 0.00013855670103092783,
+      "loss": 0.7359,
+      "step": 1641
+    },
+    {
+      "epoch": 0.52544,
+      "grad_norm": 0.12368453294038773,
+      "learning_rate": 0.00013851546391752578,
+      "loss": 0.7872,
+      "step": 1642
+    },
+    {
+      "epoch": 0.52576,
+      "grad_norm": 0.15656830370426178,
+      "learning_rate": 0.00013847422680412372,
+      "loss": 0.8115,
+      "step": 1643
+    },
+    {
+      "epoch": 0.52608,
+      "grad_norm": 0.15052351355552673,
+      "learning_rate": 0.00013843298969072165,
+      "loss": 0.7281,
+      "step": 1644
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.12937717139720917,
+      "learning_rate": 0.0001383917525773196,
+      "loss": 0.7406,
+      "step": 1645
+    },
+    {
+      "epoch": 0.52672,
+      "grad_norm": 0.14770230650901794,
+      "learning_rate": 0.00013835051546391754,
+      "loss": 0.7535,
+      "step": 1646
+    },
+    {
+      "epoch": 0.52704,
+      "grad_norm": 0.1457260549068451,
+      "learning_rate": 0.00013830927835051547,
+      "loss": 0.7317,
+      "step": 1647
+    },
+    {
+      "epoch": 0.52736,
+      "grad_norm": 0.14773280918598175,
+      "learning_rate": 0.0001382680412371134,
+      "loss": 0.7076,
+      "step": 1648
+    },
+    {
+      "epoch": 0.52768,
+      "grad_norm": 0.13222400844097137,
+      "learning_rate": 0.00013822680412371133,
+      "loss": 0.6722,
+      "step": 1649
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.14744020998477936,
+      "learning_rate": 0.0001381855670103093,
+      "loss": 0.7032,
+      "step": 1650
+    },
+    {
+      "epoch": 0.52832,
+      "grad_norm": 0.14622749388217926,
+      "learning_rate": 0.00013814432989690722,
+      "loss": 0.7313,
+      "step": 1651
+    },
+    {
+      "epoch": 0.52864,
+      "grad_norm": 0.1350373923778534,
+      "learning_rate": 0.00013810309278350515,
+      "loss": 0.8763,
+      "step": 1652
+    },
+    {
+      "epoch": 0.52896,
+      "grad_norm": 0.14311152696609497,
+      "learning_rate": 0.0001380618556701031,
+      "loss": 0.7024,
+      "step": 1653
+    },
+    {
+      "epoch": 0.52928,
+      "grad_norm": 0.1319030225276947,
+      "learning_rate": 0.00013802061855670104,
+      "loss": 0.9381,
+      "step": 1654
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.13256032764911652,
+      "learning_rate": 0.00013797938144329897,
+      "loss": 0.73,
+      "step": 1655
+    },
+    {
+      "epoch": 0.52992,
+      "grad_norm": 0.1313982903957367,
+      "learning_rate": 0.0001379381443298969,
+      "loss": 0.7527,
+      "step": 1656
+    },
+    {
+      "epoch": 0.53024,
+      "grad_norm": 0.12875613570213318,
+      "learning_rate": 0.00013789690721649484,
+      "loss": 0.785,
+      "step": 1657
+    },
+    {
+      "epoch": 0.53056,
+      "grad_norm": 0.1500009149312973,
+      "learning_rate": 0.0001378556701030928,
+      "loss": 0.647,
+      "step": 1658
+    },
+    {
+      "epoch": 0.53088,
+      "grad_norm": 0.13786911964416504,
+      "learning_rate": 0.00013781443298969072,
+      "loss": 0.6475,
+      "step": 1659
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.1468701958656311,
+      "learning_rate": 0.00013777319587628868,
+      "loss": 0.6968,
+      "step": 1660
+    },
+    {
+      "epoch": 0.53152,
+      "grad_norm": 0.135935440659523,
+      "learning_rate": 0.00013773195876288661,
+      "loss": 0.6248,
+      "step": 1661
+    },
+    {
+      "epoch": 0.53184,
+      "grad_norm": 0.12344254553318024,
+      "learning_rate": 0.00013769072164948455,
+      "loss": 0.5669,
+      "step": 1662
+    },
+    {
+      "epoch": 0.53216,
+      "grad_norm": 0.14883849024772644,
+      "learning_rate": 0.00013764948453608248,
+      "loss": 0.631,
+      "step": 1663
+    },
+    {
+      "epoch": 0.53248,
+      "grad_norm": 0.146417036652565,
+      "learning_rate": 0.0001376082474226804,
+      "loss": 0.7066,
+      "step": 1664
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.1755172163248062,
+      "learning_rate": 0.00013756701030927837,
+      "loss": 0.7664,
+      "step": 1665
+    },
+    {
+      "epoch": 0.53312,
+      "grad_norm": 0.15486696362495422,
+      "learning_rate": 0.0001375257731958763,
+      "loss": 0.7043,
+      "step": 1666
+    },
+    {
+      "epoch": 0.53344,
+      "grad_norm": 0.13096696138381958,
+      "learning_rate": 0.00013748453608247423,
+      "loss": 0.7577,
+      "step": 1667
+    },
+    {
+      "epoch": 0.53376,
+      "grad_norm": 0.15096555650234222,
+      "learning_rate": 0.0001374432989690722,
+      "loss": 0.7308,
+      "step": 1668
+    },
+    {
+      "epoch": 0.53408,
+      "grad_norm": 0.15723296999931335,
+      "learning_rate": 0.00013740206185567012,
+      "loss": 0.7045,
+      "step": 1669
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.15512202680110931,
+      "learning_rate": 0.00013736082474226805,
+      "loss": 0.755,
+      "step": 1670
+    },
+    {
+      "epoch": 0.53472,
+      "grad_norm": 0.15073993802070618,
+      "learning_rate": 0.00013731958762886598,
+      "loss": 0.7349,
+      "step": 1671
+    },
+    {
+      "epoch": 0.53504,
+      "grad_norm": 0.1436302363872528,
+      "learning_rate": 0.0001372783505154639,
+      "loss": 0.6763,
+      "step": 1672
+    },
+    {
+      "epoch": 0.53536,
+      "grad_norm": 0.11828181892633438,
+      "learning_rate": 0.00013723711340206187,
+      "loss": 0.6416,
+      "step": 1673
+    },
+    {
+      "epoch": 0.53568,
+      "grad_norm": 0.14397715032100677,
+      "learning_rate": 0.0001371958762886598,
+      "loss": 0.6905,
+      "step": 1674
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.13953359425067902,
+      "learning_rate": 0.00013715463917525776,
+      "loss": 0.6202,
+      "step": 1675
+    },
+    {
+      "epoch": 0.53632,
+      "grad_norm": 0.12717866897583008,
+      "learning_rate": 0.00013711340206185566,
+      "loss": 0.5746,
+      "step": 1676
+    },
+    {
+      "epoch": 0.53664,
+      "grad_norm": 0.1348889023065567,
+      "learning_rate": 0.0001370721649484536,
+      "loss": 0.7732,
+      "step": 1677
+    },
+    {
+      "epoch": 0.53696,
+      "grad_norm": 0.12386288493871689,
+      "learning_rate": 0.00013703092783505155,
+      "loss": 0.5508,
+      "step": 1678
+    },
+    {
+      "epoch": 0.53728,
+      "grad_norm": 0.16130952537059784,
+      "learning_rate": 0.00013698969072164949,
+      "loss": 0.8291,
+      "step": 1679
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.13158445060253143,
+      "learning_rate": 0.00013694845360824744,
+      "loss": 0.6264,
+      "step": 1680
+    },
+    {
+      "epoch": 0.53792,
+      "grad_norm": 0.1518358737230301,
+      "learning_rate": 0.00013690721649484538,
+      "loss": 0.6718,
+      "step": 1681
+    },
+    {
+      "epoch": 0.53824,
+      "grad_norm": 0.1292293816804886,
+      "learning_rate": 0.0001368659793814433,
+      "loss": 0.5921,
+      "step": 1682
+    },
+    {
+      "epoch": 0.53856,
+      "grad_norm": 0.12687307596206665,
+      "learning_rate": 0.00013682474226804124,
+      "loss": 0.6668,
+      "step": 1683
+    },
+    {
+      "epoch": 0.53888,
+      "grad_norm": 0.12536568939685822,
+      "learning_rate": 0.00013678350515463917,
+      "loss": 0.6386,
+      "step": 1684
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.13863909244537354,
+      "learning_rate": 0.0001367422680412371,
+      "loss": 0.6364,
+      "step": 1685
+    },
+    {
+      "epoch": 0.53952,
+      "grad_norm": 0.1437811255455017,
+      "learning_rate": 0.00013670103092783506,
+      "loss": 0.6716,
+      "step": 1686
+    },
+    {
+      "epoch": 0.53984,
+      "grad_norm": 0.13246455788612366,
+      "learning_rate": 0.000136659793814433,
+      "loss": 0.6327,
+      "step": 1687
+    },
+    {
+      "epoch": 0.54016,
+      "grad_norm": 0.144587904214859,
+      "learning_rate": 0.00013661855670103095,
+      "loss": 0.7246,
+      "step": 1688
+    },
+    {
+      "epoch": 0.54048,
+      "grad_norm": 0.14768695831298828,
+      "learning_rate": 0.00013657731958762888,
+      "loss": 0.6216,
+      "step": 1689
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.14124509692192078,
+      "learning_rate": 0.0001365360824742268,
+      "loss": 0.6779,
+      "step": 1690
+    },
+    {
+      "epoch": 0.54112,
+      "grad_norm": 0.14516718685626984,
+      "learning_rate": 0.00013649484536082474,
+      "loss": 0.7312,
+      "step": 1691
+    },
+    {
+      "epoch": 0.54144,
+      "grad_norm": 0.12537337839603424,
+      "learning_rate": 0.00013645360824742267,
+      "loss": 0.6352,
+      "step": 1692
+    },
+    {
+      "epoch": 0.54176,
+      "grad_norm": 0.13072200119495392,
+      "learning_rate": 0.00013641237113402063,
+      "loss": 0.8758,
+      "step": 1693
+    },
+    {
+      "epoch": 0.54208,
+      "grad_norm": 0.13488595187664032,
+      "learning_rate": 0.00013637113402061856,
+      "loss": 0.6175,
+      "step": 1694
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.12652455270290375,
+      "learning_rate": 0.0001363298969072165,
+      "loss": 0.6515,
+      "step": 1695
+    },
+    {
+      "epoch": 0.54272,
+      "grad_norm": 0.13127075135707855,
+      "learning_rate": 0.00013628865979381445,
+      "loss": 0.6905,
+      "step": 1696
+    },
+    {
+      "epoch": 0.54304,
+      "grad_norm": 0.1282002478837967,
+      "learning_rate": 0.00013624742268041238,
+      "loss": 0.8249,
+      "step": 1697
+    },
+    {
+      "epoch": 0.54336,
+      "grad_norm": 0.1588786542415619,
+      "learning_rate": 0.00013620618556701032,
+      "loss": 0.5997,
+      "step": 1698
+    },
+    {
+      "epoch": 0.54368,
+      "grad_norm": 0.16623853147029877,
+      "learning_rate": 0.00013616494845360825,
+      "loss": 0.8093,
+      "step": 1699
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.15679019689559937,
+      "learning_rate": 0.00013612371134020618,
+      "loss": 0.8038,
+      "step": 1700
+    },
+    {
+      "epoch": 0.54432,
+      "grad_norm": 0.13648444414138794,
+      "learning_rate": 0.00013608247422680414,
+      "loss": 0.7195,
+      "step": 1701
+    },
+    {
+      "epoch": 0.54464,
+      "grad_norm": 0.14090919494628906,
+      "learning_rate": 0.00013604123711340207,
+      "loss": 0.7692,
+      "step": 1702
+    },
+    {
+      "epoch": 0.54496,
+      "grad_norm": 0.12612387537956238,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 0.7787,
+      "step": 1703
+    },
+    {
+      "epoch": 0.54528,
+      "grad_norm": 0.13551892340183258,
+      "learning_rate": 0.00013595876288659796,
+      "loss": 0.7509,
+      "step": 1704
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.1308637112379074,
+      "learning_rate": 0.0001359175257731959,
+      "loss": 0.6905,
+      "step": 1705
+    },
+    {
+      "epoch": 0.54592,
+      "grad_norm": 0.12324418127536774,
+      "learning_rate": 0.00013587628865979382,
+      "loss": 0.7413,
+      "step": 1706
+    },
+    {
+      "epoch": 0.54624,
+      "grad_norm": 0.13526131212711334,
+      "learning_rate": 0.00013583505154639175,
+      "loss": 0.5412,
+      "step": 1707
+    },
+    {
+      "epoch": 0.54656,
+      "grad_norm": 0.14957688748836517,
+      "learning_rate": 0.0001357938144329897,
+      "loss": 0.6902,
+      "step": 1708
+    },
+    {
+      "epoch": 0.54688,
+      "grad_norm": 0.13602221012115479,
+      "learning_rate": 0.00013575257731958764,
+      "loss": 0.7267,
+      "step": 1709
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.13706456124782562,
+      "learning_rate": 0.00013571134020618557,
+      "loss": 0.6815,
+      "step": 1710
+    },
+    {
+      "epoch": 0.54752,
+      "grad_norm": 0.14902132749557495,
+      "learning_rate": 0.00013567010309278353,
+      "loss": 0.7527,
+      "step": 1711
+    },
+    {
+      "epoch": 0.54784,
+      "grad_norm": 0.1630292683839798,
+      "learning_rate": 0.00013562886597938143,
+      "loss": 0.6755,
+      "step": 1712
+    },
+    {
+      "epoch": 0.54816,
+      "grad_norm": 0.13587939739227295,
+      "learning_rate": 0.00013558762886597937,
+      "loss": 0.7811,
+      "step": 1713
+    },
+    {
+      "epoch": 0.54848,
+      "grad_norm": 0.1295965015888214,
+      "learning_rate": 0.00013554639175257732,
+      "loss": 0.5197,
+      "step": 1714
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.12837980687618256,
+      "learning_rate": 0.00013550515463917526,
+      "loss": 0.7401,
+      "step": 1715
+    },
+    {
+      "epoch": 0.54912,
+      "grad_norm": 0.1482519954442978,
+      "learning_rate": 0.00013546391752577321,
+      "loss": 0.6882,
+      "step": 1716
+    },
+    {
+      "epoch": 0.54944,
+      "grad_norm": 0.13367551565170288,
+      "learning_rate": 0.00013542268041237115,
+      "loss": 0.7258,
+      "step": 1717
+    },
+    {
+      "epoch": 0.54976,
+      "grad_norm": 0.15452930331230164,
+      "learning_rate": 0.00013538144329896908,
+      "loss": 0.6638,
+      "step": 1718
+    },
+    {
+      "epoch": 0.55008,
+      "grad_norm": 0.12944737076759338,
+      "learning_rate": 0.000135340206185567,
+      "loss": 0.7219,
+      "step": 1719
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.13899493217468262,
+      "learning_rate": 0.00013529896907216494,
+      "loss": 0.924,
+      "step": 1720
+    },
+    {
+      "epoch": 0.55072,
+      "grad_norm": 0.12524177134037018,
+      "learning_rate": 0.0001352577319587629,
+      "loss": 0.7761,
+      "step": 1721
+    },
+    {
+      "epoch": 0.55104,
+      "grad_norm": 0.12263306230306625,
+      "learning_rate": 0.00013521649484536083,
+      "loss": 0.6369,
+      "step": 1722
+    },
+    {
+      "epoch": 0.55136,
+      "grad_norm": 0.14325588941574097,
+      "learning_rate": 0.00013517525773195876,
+      "loss": 0.6871,
+      "step": 1723
+    },
+    {
+      "epoch": 0.55168,
+      "grad_norm": 0.13194425404071808,
+      "learning_rate": 0.00013513402061855672,
+      "loss": 0.8146,
+      "step": 1724
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.15382345020771027,
+      "learning_rate": 0.00013509278350515465,
+      "loss": 0.7097,
+      "step": 1725
+    },
+    {
+      "epoch": 0.55232,
+      "grad_norm": 0.1529049277305603,
+      "learning_rate": 0.00013505154639175258,
+      "loss": 0.5837,
+      "step": 1726
+    },
+    {
+      "epoch": 0.55264,
+      "grad_norm": 0.1539517641067505,
+      "learning_rate": 0.0001350103092783505,
+      "loss": 0.5286,
+      "step": 1727
+    },
+    {
+      "epoch": 0.55296,
+      "grad_norm": 0.1586897224187851,
+      "learning_rate": 0.00013496907216494844,
+      "loss": 0.7999,
+      "step": 1728
+    },
+    {
+      "epoch": 0.55328,
+      "grad_norm": 0.12118642777204514,
+      "learning_rate": 0.0001349278350515464,
+      "loss": 0.5811,
+      "step": 1729
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.1392558515071869,
+      "learning_rate": 0.00013488659793814433,
+      "loss": 0.8315,
+      "step": 1730
+    },
+    {
+      "epoch": 0.55392,
+      "grad_norm": 0.13815298676490784,
+      "learning_rate": 0.0001348453608247423,
+      "loss": 0.6403,
+      "step": 1731
+    },
+    {
+      "epoch": 0.55424,
+      "grad_norm": 0.1518428474664688,
+      "learning_rate": 0.00013480412371134022,
+      "loss": 0.8195,
+      "step": 1732
+    },
+    {
+      "epoch": 0.55456,
+      "grad_norm": 0.1450730264186859,
+      "learning_rate": 0.00013476288659793815,
+      "loss": 0.641,
+      "step": 1733
+    },
+    {
+      "epoch": 0.55488,
+      "grad_norm": 0.14829540252685547,
+      "learning_rate": 0.00013472164948453609,
+      "loss": 0.7184,
+      "step": 1734
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.13482412695884705,
+      "learning_rate": 0.00013468041237113402,
+      "loss": 0.7044,
+      "step": 1735
+    },
+    {
+      "epoch": 0.55552,
+      "grad_norm": 0.1404435932636261,
+      "learning_rate": 0.00013463917525773197,
+      "loss": 0.7029,
+      "step": 1736
+    },
+    {
+      "epoch": 0.55584,
+      "grad_norm": 0.14939363300800323,
+      "learning_rate": 0.0001345979381443299,
+      "loss": 0.8864,
+      "step": 1737
+    },
+    {
+      "epoch": 0.55616,
+      "grad_norm": 0.14376312494277954,
+      "learning_rate": 0.00013455670103092784,
+      "loss": 0.8516,
+      "step": 1738
+    },
+    {
+      "epoch": 0.55648,
+      "grad_norm": 0.1442050188779831,
+      "learning_rate": 0.0001345154639175258,
+      "loss": 0.7104,
+      "step": 1739
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.12962238490581512,
+      "learning_rate": 0.00013447422680412373,
+      "loss": 0.687,
+      "step": 1740
+    },
+    {
+      "epoch": 0.55712,
+      "grad_norm": 0.13987627625465393,
+      "learning_rate": 0.00013443298969072166,
+      "loss": 0.7208,
+      "step": 1741
+    },
+    {
+      "epoch": 0.55744,
+      "grad_norm": 0.14828431606292725,
+      "learning_rate": 0.0001343917525773196,
+      "loss": 0.6677,
+      "step": 1742
+    },
+    {
+      "epoch": 0.55776,
+      "grad_norm": 0.14022362232208252,
+      "learning_rate": 0.00013435051546391752,
+      "loss": 0.6182,
+      "step": 1743
+    },
+    {
+      "epoch": 0.55808,
+      "grad_norm": 0.13057346642017365,
+      "learning_rate": 0.00013430927835051548,
+      "loss": 0.5761,
+      "step": 1744
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.1349085420370102,
+      "learning_rate": 0.0001342680412371134,
+      "loss": 0.6868,
+      "step": 1745
+    },
+    {
+      "epoch": 0.55872,
+      "grad_norm": 0.13358305394649506,
+      "learning_rate": 0.00013422680412371134,
+      "loss": 0.4868,
+      "step": 1746
+    },
+    {
+      "epoch": 0.55904,
+      "grad_norm": 0.14226961135864258,
+      "learning_rate": 0.0001341855670103093,
+      "loss": 0.737,
+      "step": 1747
+    },
+    {
+      "epoch": 0.55936,
+      "grad_norm": 0.14248816668987274,
+      "learning_rate": 0.0001341443298969072,
+      "loss": 0.6367,
+      "step": 1748
+    },
+    {
+      "epoch": 0.55968,
+      "grad_norm": 0.13468438386917114,
+      "learning_rate": 0.00013410309278350516,
+      "loss": 0.9036,
+      "step": 1749
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.14425882697105408,
+      "learning_rate": 0.0001340618556701031,
+      "loss": 0.8104,
+      "step": 1750
+    },
+    {
+      "epoch": 0.56032,
+      "grad_norm": 0.16469749808311462,
+      "learning_rate": 0.00013402061855670103,
+      "loss": 0.6554,
+      "step": 1751
+    },
+    {
+      "epoch": 0.56064,
+      "grad_norm": 0.1481115072965622,
+      "learning_rate": 0.00013397938144329898,
+      "loss": 0.7364,
+      "step": 1752
+    },
+    {
+      "epoch": 0.56096,
+      "grad_norm": 0.13165169954299927,
+      "learning_rate": 0.00013393814432989691,
+      "loss": 0.6251,
+      "step": 1753
+    },
+    {
+      "epoch": 0.56128,
+      "grad_norm": 0.13377028703689575,
+      "learning_rate": 0.00013389690721649487,
+      "loss": 0.6772,
+      "step": 1754
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.12232476472854614,
+      "learning_rate": 0.00013385567010309278,
+      "loss": 0.6245,
+      "step": 1755
+    },
+    {
+      "epoch": 0.56192,
+      "grad_norm": 0.1359662115573883,
+      "learning_rate": 0.0001338144329896907,
+      "loss": 0.645,
+      "step": 1756
+    },
+    {
+      "epoch": 0.56224,
+      "grad_norm": 0.1492493748664856,
+      "learning_rate": 0.00013377319587628867,
+      "loss": 0.6599,
+      "step": 1757
+    },
+    {
+      "epoch": 0.56256,
+      "grad_norm": 0.13330449163913727,
+      "learning_rate": 0.0001337319587628866,
+      "loss": 0.6645,
+      "step": 1758
+    },
+    {
+      "epoch": 0.56288,
+      "grad_norm": 0.15756262838840485,
+      "learning_rate": 0.00013369072164948456,
+      "loss": 0.7503,
+      "step": 1759
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.13502779603004456,
+      "learning_rate": 0.0001336494845360825,
+      "loss": 0.6723,
+      "step": 1760
+    },
+    {
+      "epoch": 0.56352,
+      "grad_norm": 0.12344586849212646,
+      "learning_rate": 0.00013360824742268042,
+      "loss": 0.6976,
+      "step": 1761
+    },
+    {
+      "epoch": 0.56384,
+      "grad_norm": 0.15826153755187988,
+      "learning_rate": 0.00013356701030927835,
+      "loss": 0.7032,
+      "step": 1762
+    },
+    {
+      "epoch": 0.56416,
+      "grad_norm": 0.14459273219108582,
+      "learning_rate": 0.00013352577319587628,
+      "loss": 0.7533,
+      "step": 1763
+    },
+    {
+      "epoch": 0.56448,
+      "grad_norm": 0.13797153532505035,
+      "learning_rate": 0.00013348453608247424,
+      "loss": 0.539,
+      "step": 1764
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.14102274179458618,
+      "learning_rate": 0.00013344329896907217,
+      "loss": 0.5844,
+      "step": 1765
+    },
+    {
+      "epoch": 0.56512,
+      "grad_norm": 0.12047169357538223,
+      "learning_rate": 0.0001334020618556701,
+      "loss": 0.5146,
+      "step": 1766
+    },
+    {
+      "epoch": 0.56544,
+      "grad_norm": 0.14720189571380615,
+      "learning_rate": 0.00013336082474226806,
+      "loss": 0.7236,
+      "step": 1767
+    },
+    {
+      "epoch": 0.56576,
+      "grad_norm": 0.1129353791475296,
+      "learning_rate": 0.000133319587628866,
+      "loss": 0.6582,
+      "step": 1768
+    },
+    {
+      "epoch": 0.56608,
+      "grad_norm": 0.1277375966310501,
+      "learning_rate": 0.00013327835051546392,
+      "loss": 0.6529,
+      "step": 1769
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.13147298991680145,
+      "learning_rate": 0.00013323711340206186,
+      "loss": 0.7737,
+      "step": 1770
+    },
+    {
+      "epoch": 0.56672,
+      "grad_norm": 0.15640847384929657,
+      "learning_rate": 0.00013319587628865979,
+      "loss": 0.5673,
+      "step": 1771
+    },
+    {
+      "epoch": 0.56704,
+      "grad_norm": 0.1401795595884323,
+      "learning_rate": 0.00013315463917525774,
+      "loss": 0.6257,
+      "step": 1772
+    },
+    {
+      "epoch": 0.56736,
+      "grad_norm": 0.13787108659744263,
+      "learning_rate": 0.00013311340206185568,
+      "loss": 0.7706,
+      "step": 1773
+    },
+    {
+      "epoch": 0.56768,
+      "grad_norm": 0.13369019329547882,
+      "learning_rate": 0.0001330721649484536,
+      "loss": 0.6816,
+      "step": 1774
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.14988088607788086,
+      "learning_rate": 0.00013303092783505157,
+      "loss": 0.7451,
+      "step": 1775
+    },
+    {
+      "epoch": 0.56832,
+      "grad_norm": 0.14894211292266846,
+      "learning_rate": 0.0001329896907216495,
+      "loss": 0.6831,
+      "step": 1776
+    },
+    {
+      "epoch": 0.56864,
+      "grad_norm": 0.13918204605579376,
+      "learning_rate": 0.00013294845360824743,
+      "loss": 0.7758,
+      "step": 1777
+    },
+    {
+      "epoch": 0.56896,
+      "grad_norm": 0.1445845663547516,
+      "learning_rate": 0.00013290721649484536,
+      "loss": 0.8243,
+      "step": 1778
+    },
+    {
+      "epoch": 0.56928,
+      "grad_norm": 0.13548369705677032,
+      "learning_rate": 0.0001328659793814433,
+      "loss": 0.7407,
+      "step": 1779
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.14967234432697296,
+      "learning_rate": 0.00013282474226804125,
+      "loss": 0.6652,
+      "step": 1780
+    },
+    {
+      "epoch": 0.56992,
+      "grad_norm": 0.13391603529453278,
+      "learning_rate": 0.00013278350515463918,
+      "loss": 0.6124,
+      "step": 1781
+    },
+    {
+      "epoch": 0.57024,
+      "grad_norm": 0.1333674043416977,
+      "learning_rate": 0.00013274226804123714,
+      "loss": 0.6413,
+      "step": 1782
+    },
+    {
+      "epoch": 0.57056,
+      "grad_norm": 0.1482914239168167,
+      "learning_rate": 0.00013270103092783507,
+      "loss": 0.7787,
+      "step": 1783
+    },
+    {
+      "epoch": 0.57088,
+      "grad_norm": 0.1339278519153595,
+      "learning_rate": 0.00013265979381443297,
+      "loss": 0.5781,
+      "step": 1784
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.14695081114768982,
+      "learning_rate": 0.00013261855670103093,
+      "loss": 0.7708,
+      "step": 1785
+    },
+    {
+      "epoch": 0.57152,
+      "grad_norm": 0.14863723516464233,
+      "learning_rate": 0.00013257731958762886,
+      "loss": 0.7009,
+      "step": 1786
+    },
+    {
+      "epoch": 0.57184,
+      "grad_norm": 0.14896482229232788,
+      "learning_rate": 0.00013253608247422682,
+      "loss": 0.6738,
+      "step": 1787
+    },
+    {
+      "epoch": 0.57216,
+      "grad_norm": 0.1547907441854477,
+      "learning_rate": 0.00013249484536082475,
+      "loss": 0.5639,
+      "step": 1788
+    },
+    {
+      "epoch": 0.57248,
+      "grad_norm": 0.13742683827877045,
+      "learning_rate": 0.00013245360824742268,
+      "loss": 0.6558,
+      "step": 1789
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.15084274113178253,
+      "learning_rate": 0.00013241237113402064,
+      "loss": 0.6869,
+      "step": 1790
+    },
+    {
+      "epoch": 0.57312,
+      "grad_norm": 0.14779849350452423,
+      "learning_rate": 0.00013237113402061855,
+      "loss": 0.638,
+      "step": 1791
+    },
+    {
+      "epoch": 0.57344,
+      "grad_norm": 0.12779498100280762,
+      "learning_rate": 0.0001323298969072165,
+      "loss": 0.6937,
+      "step": 1792
+    },
+    {
+      "epoch": 0.57376,
+      "grad_norm": 0.14302028715610504,
+      "learning_rate": 0.00013228865979381444,
+      "loss": 0.7148,
+      "step": 1793
+    },
+    {
+      "epoch": 0.57408,
+      "grad_norm": 0.12912467122077942,
+      "learning_rate": 0.00013224742268041237,
+      "loss": 0.8158,
+      "step": 1794
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.1584804505109787,
+      "learning_rate": 0.00013220618556701033,
+      "loss": 0.7598,
+      "step": 1795
+    },
+    {
+      "epoch": 0.57472,
+      "grad_norm": 0.14452625811100006,
+      "learning_rate": 0.00013216494845360826,
+      "loss": 0.7039,
+      "step": 1796
+    },
+    {
+      "epoch": 0.57504,
+      "grad_norm": 0.12496565282344818,
+      "learning_rate": 0.0001321237113402062,
+      "loss": 0.9407,
+      "step": 1797
+    },
+    {
+      "epoch": 0.57536,
+      "grad_norm": 0.14201682806015015,
+      "learning_rate": 0.00013208247422680412,
+      "loss": 0.5505,
+      "step": 1798
+    },
+    {
+      "epoch": 0.57568,
+      "grad_norm": 0.14024336636066437,
+      "learning_rate": 0.00013204123711340205,
+      "loss": 0.7361,
+      "step": 1799
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.1336219757795334,
+      "learning_rate": 0.000132,
+      "loss": 0.7854,
+      "step": 1800
+    },
+    {
+      "epoch": 0.57632,
+      "grad_norm": 0.13467498123645782,
+      "learning_rate": 0.00013195876288659794,
+      "loss": 0.7383,
+      "step": 1801
+    },
+    {
+      "epoch": 0.57664,
+      "grad_norm": 0.1330866515636444,
+      "learning_rate": 0.0001319175257731959,
+      "loss": 0.7307,
+      "step": 1802
+    },
+    {
+      "epoch": 0.57696,
+      "grad_norm": 0.11944994330406189,
+      "learning_rate": 0.00013187628865979383,
+      "loss": 0.6408,
+      "step": 1803
+    },
+    {
+      "epoch": 0.57728,
+      "grad_norm": 0.13701461255550385,
+      "learning_rate": 0.00013183505154639176,
+      "loss": 0.8054,
+      "step": 1804
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.12145336717367172,
+      "learning_rate": 0.0001317938144329897,
+      "loss": 0.6173,
+      "step": 1805
+    },
+    {
+      "epoch": 0.57792,
+      "grad_norm": 0.13406139612197876,
+      "learning_rate": 0.00013175257731958762,
+      "loss": 0.7681,
+      "step": 1806
+    },
+    {
+      "epoch": 0.57824,
+      "grad_norm": 0.1492442935705185,
+      "learning_rate": 0.00013171134020618556,
+      "loss": 0.6116,
+      "step": 1807
+    },
+    {
+      "epoch": 0.57856,
+      "grad_norm": 0.14872552454471588,
+      "learning_rate": 0.00013167010309278351,
+      "loss": 0.7095,
+      "step": 1808
+    },
+    {
+      "epoch": 0.57888,
+      "grad_norm": 0.12427672743797302,
+      "learning_rate": 0.00013162886597938145,
+      "loss": 0.6901,
+      "step": 1809
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.13664910197257996,
+      "learning_rate": 0.0001315876288659794,
+      "loss": 0.7601,
+      "step": 1810
+    },
+    {
+      "epoch": 0.57952,
+      "grad_norm": 0.14193250238895416,
+      "learning_rate": 0.00013154639175257734,
+      "loss": 0.7525,
+      "step": 1811
+    },
+    {
+      "epoch": 0.57984,
+      "grad_norm": 0.14674946665763855,
+      "learning_rate": 0.00013150515463917527,
+      "loss": 0.6901,
+      "step": 1812
+    },
+    {
+      "epoch": 0.58016,
+      "grad_norm": 0.14705410599708557,
+      "learning_rate": 0.0001314639175257732,
+      "loss": 0.6423,
+      "step": 1813
+    },
+    {
+      "epoch": 0.58048,
+      "grad_norm": 0.14765401184558868,
+      "learning_rate": 0.00013142268041237113,
+      "loss": 0.7426,
+      "step": 1814
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.1558821201324463,
+      "learning_rate": 0.0001313814432989691,
+      "loss": 0.7017,
+      "step": 1815
+    },
+    {
+      "epoch": 0.58112,
+      "grad_norm": 0.13213512301445007,
+      "learning_rate": 0.00013134020618556702,
+      "loss": 0.8138,
+      "step": 1816
+    },
+    {
+      "epoch": 0.58144,
+      "grad_norm": 0.13489393889904022,
+      "learning_rate": 0.00013129896907216495,
+      "loss": 0.6842,
+      "step": 1817
+    },
+    {
+      "epoch": 0.58176,
+      "grad_norm": 0.1441633552312851,
+      "learning_rate": 0.0001312577319587629,
+      "loss": 0.7445,
+      "step": 1818
+    },
+    {
+      "epoch": 0.58208,
+      "grad_norm": 0.1427292674779892,
+      "learning_rate": 0.00013121649484536084,
+      "loss": 0.7351,
+      "step": 1819
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.13226602971553802,
+      "learning_rate": 0.00013117525773195877,
+      "loss": 0.7178,
+      "step": 1820
+    },
+    {
+      "epoch": 0.58272,
+      "grad_norm": 0.16712085902690887,
+      "learning_rate": 0.0001311340206185567,
+      "loss": 0.7166,
+      "step": 1821
+    },
+    {
+      "epoch": 0.58304,
+      "grad_norm": 0.13856053352355957,
+      "learning_rate": 0.00013109278350515463,
+      "loss": 0.6683,
+      "step": 1822
+    },
+    {
+      "epoch": 0.58336,
+      "grad_norm": 0.16740012168884277,
+      "learning_rate": 0.0001310515463917526,
+      "loss": 0.8369,
+      "step": 1823
+    },
+    {
+      "epoch": 0.58368,
+      "grad_norm": 0.1308383196592331,
+      "learning_rate": 0.00013101030927835052,
+      "loss": 0.5037,
+      "step": 1824
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.15628789365291595,
+      "learning_rate": 0.00013096907216494848,
+      "loss": 0.7379,
+      "step": 1825
+    },
+    {
+      "epoch": 0.58432,
+      "grad_norm": 0.1515168994665146,
+      "learning_rate": 0.00013092783505154639,
+      "loss": 0.6811,
+      "step": 1826
+    },
+    {
+      "epoch": 0.58464,
+      "grad_norm": 0.1565101146697998,
+      "learning_rate": 0.00013088659793814432,
+      "loss": 0.6737,
+      "step": 1827
+    },
+    {
+      "epoch": 0.58496,
+      "grad_norm": 0.15891097486019135,
+      "learning_rate": 0.00013084536082474228,
+      "loss": 0.7704,
+      "step": 1828
+    },
+    {
+      "epoch": 0.58528,
+      "grad_norm": 0.1445811688899994,
+      "learning_rate": 0.0001308041237113402,
+      "loss": 0.7628,
+      "step": 1829
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.12324066460132599,
+      "learning_rate": 0.00013076288659793817,
+      "loss": 0.5743,
+      "step": 1830
+    },
+    {
+      "epoch": 0.58592,
+      "grad_norm": 0.15465180575847626,
+      "learning_rate": 0.0001307216494845361,
+      "loss": 0.783,
+      "step": 1831
+    },
+    {
+      "epoch": 0.58624,
+      "grad_norm": 0.14205990731716156,
+      "learning_rate": 0.00013068041237113403,
+      "loss": 0.7149,
+      "step": 1832
+    },
+    {
+      "epoch": 0.58656,
+      "grad_norm": 0.14440414309501648,
+      "learning_rate": 0.00013063917525773196,
+      "loss": 0.6951,
+      "step": 1833
+    },
+    {
+      "epoch": 0.58688,
+      "grad_norm": 0.16611546277999878,
+      "learning_rate": 0.0001305979381443299,
+      "loss": 0.9304,
+      "step": 1834
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.13717123866081238,
+      "learning_rate": 0.00013055670103092782,
+      "loss": 0.5108,
+      "step": 1835
+    },
+    {
+      "epoch": 0.58752,
+      "grad_norm": 0.1380356401205063,
+      "learning_rate": 0.00013051546391752578,
+      "loss": 0.6698,
+      "step": 1836
+    },
+    {
+      "epoch": 0.58784,
+      "grad_norm": 0.1463797241449356,
+      "learning_rate": 0.0001304742268041237,
+      "loss": 0.7732,
+      "step": 1837
+    },
+    {
+      "epoch": 0.58816,
+      "grad_norm": 0.13822247087955475,
+      "learning_rate": 0.00013043298969072167,
+      "loss": 0.7733,
+      "step": 1838
+    },
+    {
+      "epoch": 0.58848,
+      "grad_norm": 0.12503483891487122,
+      "learning_rate": 0.0001303917525773196,
+      "loss": 0.7661,
+      "step": 1839
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.1147647500038147,
+      "learning_rate": 0.00013035051546391753,
+      "loss": 0.4546,
+      "step": 1840
+    },
+    {
+      "epoch": 0.58912,
+      "grad_norm": 0.1230325773358345,
+      "learning_rate": 0.00013030927835051546,
+      "loss": 0.5585,
+      "step": 1841
+    },
+    {
+      "epoch": 0.58944,
+      "grad_norm": 0.14122216403484344,
+      "learning_rate": 0.0001302680412371134,
+      "loss": 0.5758,
+      "step": 1842
+    },
+    {
+      "epoch": 0.58976,
+      "grad_norm": 0.15151569247245789,
+      "learning_rate": 0.00013022680412371135,
+      "loss": 0.6843,
+      "step": 1843
+    },
+    {
+      "epoch": 0.59008,
+      "grad_norm": 0.1292106658220291,
+      "learning_rate": 0.00013018556701030928,
+      "loss": 0.699,
+      "step": 1844
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.13869857788085938,
+      "learning_rate": 0.00013014432989690722,
+      "loss": 0.7525,
+      "step": 1845
+    },
+    {
+      "epoch": 0.59072,
+      "grad_norm": 0.15281198918819427,
+      "learning_rate": 0.00013010309278350517,
+      "loss": 0.7205,
+      "step": 1846
+    },
+    {
+      "epoch": 0.59104,
+      "grad_norm": 0.13255898654460907,
+      "learning_rate": 0.0001300618556701031,
+      "loss": 0.5767,
+      "step": 1847
+    },
+    {
+      "epoch": 0.59136,
+      "grad_norm": 0.11870738863945007,
+      "learning_rate": 0.00013002061855670104,
+      "loss": 0.5077,
+      "step": 1848
+    },
+    {
+      "epoch": 0.59168,
+      "grad_norm": 0.14117857813835144,
+      "learning_rate": 0.00012997938144329897,
+      "loss": 0.568,
+      "step": 1849
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.14479069411754608,
+      "learning_rate": 0.0001299381443298969,
+      "loss": 0.5165,
+      "step": 1850
+    },
+    {
+      "epoch": 0.59232,
+      "grad_norm": 0.14627261459827423,
+      "learning_rate": 0.00012989690721649486,
+      "loss": 0.6057,
+      "step": 1851
+    },
+    {
+      "epoch": 0.59264,
+      "grad_norm": 0.13284321129322052,
+      "learning_rate": 0.0001298556701030928,
+      "loss": 0.8567,
+      "step": 1852
+    },
+    {
+      "epoch": 0.59296,
+      "grad_norm": 0.15283948183059692,
+      "learning_rate": 0.00012981443298969075,
+      "loss": 0.7619,
+      "step": 1853
+    },
+    {
+      "epoch": 0.59328,
+      "grad_norm": 0.13968519866466522,
+      "learning_rate": 0.00012977319587628868,
+      "loss": 0.7535,
+      "step": 1854
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.1302313208580017,
+      "learning_rate": 0.0001297319587628866,
+      "loss": 0.6574,
+      "step": 1855
+    },
+    {
+      "epoch": 0.59392,
+      "grad_norm": 0.13157455623149872,
+      "learning_rate": 0.00012969072164948454,
+      "loss": 0.704,
+      "step": 1856
+    },
+    {
+      "epoch": 0.59424,
+      "grad_norm": 0.15155114233493805,
+      "learning_rate": 0.00012964948453608247,
+      "loss": 0.7422,
+      "step": 1857
+    },
+    {
+      "epoch": 0.59456,
+      "grad_norm": 0.13552071154117584,
+      "learning_rate": 0.00012960824742268043,
+      "loss": 0.5676,
+      "step": 1858
+    },
+    {
+      "epoch": 0.59488,
+      "grad_norm": 0.149967223405838,
+      "learning_rate": 0.00012956701030927836,
+      "loss": 0.6742,
+      "step": 1859
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.15703719854354858,
+      "learning_rate": 0.0001295257731958763,
+      "loss": 0.7453,
+      "step": 1860
+    },
+    {
+      "epoch": 0.59552,
+      "grad_norm": 0.1725168377161026,
+      "learning_rate": 0.00012948453608247425,
+      "loss": 0.8449,
+      "step": 1861
+    },
+    {
+      "epoch": 0.59584,
+      "grad_norm": 0.13105261325836182,
+      "learning_rate": 0.00012944329896907216,
+      "loss": 0.767,
+      "step": 1862
+    },
+    {
+      "epoch": 0.59616,
+      "grad_norm": 0.1367502510547638,
+      "learning_rate": 0.00012940206185567011,
+      "loss": 0.5927,
+      "step": 1863
+    },
+    {
+      "epoch": 0.59648,
+      "grad_norm": 0.18537381291389465,
+      "learning_rate": 0.00012936082474226805,
+      "loss": 0.756,
+      "step": 1864
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.14291930198669434,
+      "learning_rate": 0.00012931958762886598,
+      "loss": 0.7345,
+      "step": 1865
+    },
+    {
+      "epoch": 0.59712,
+      "grad_norm": 0.14345696568489075,
+      "learning_rate": 0.00012927835051546393,
+      "loss": 0.7558,
+      "step": 1866
+    },
+    {
+      "epoch": 0.59744,
+      "grad_norm": 0.13259442150592804,
+      "learning_rate": 0.00012923711340206187,
+      "loss": 0.6752,
+      "step": 1867
+    },
+    {
+      "epoch": 0.59776,
+      "grad_norm": 0.12619280815124512,
+      "learning_rate": 0.0001291958762886598,
+      "loss": 0.6396,
+      "step": 1868
+    },
+    {
+      "epoch": 0.59808,
+      "grad_norm": 0.145010843873024,
+      "learning_rate": 0.00012915463917525773,
+      "loss": 0.7933,
+      "step": 1869
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.14956963062286377,
+      "learning_rate": 0.00012911340206185566,
+      "loss": 0.7931,
+      "step": 1870
+    },
+    {
+      "epoch": 0.59872,
+      "grad_norm": 0.1342199295759201,
+      "learning_rate": 0.00012907216494845362,
+      "loss": 0.6408,
+      "step": 1871
+    },
+    {
+      "epoch": 0.59904,
+      "grad_norm": 0.1416260153055191,
+      "learning_rate": 0.00012903092783505155,
+      "loss": 0.6813,
+      "step": 1872
+    },
+    {
+      "epoch": 0.59936,
+      "grad_norm": 0.1454564481973648,
+      "learning_rate": 0.00012898969072164948,
+      "loss": 0.5713,
+      "step": 1873
+    },
+    {
+      "epoch": 0.59968,
+      "grad_norm": 0.13765162229537964,
+      "learning_rate": 0.00012894845360824744,
+      "loss": 0.5956,
+      "step": 1874
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13723546266555786,
+      "learning_rate": 0.00012890721649484537,
+      "loss": 0.7485,
+      "step": 1875
+    },
+    {
+      "epoch": 0.60032,
+      "grad_norm": 0.12591178715229034,
+      "learning_rate": 0.0001288659793814433,
+      "loss": 0.6953,
+      "step": 1876
+    },
+    {
+      "epoch": 0.60064,
+      "grad_norm": 0.12784048914909363,
+      "learning_rate": 0.00012882474226804123,
+      "loss": 0.7045,
+      "step": 1877
+    },
+    {
+      "epoch": 0.60096,
+      "grad_norm": 0.13309244811534882,
+      "learning_rate": 0.00012878350515463916,
+      "loss": 0.5357,
+      "step": 1878
+    },
+    {
+      "epoch": 0.60128,
+      "grad_norm": 0.12655137479305267,
+      "learning_rate": 0.00012874226804123712,
+      "loss": 0.6734,
+      "step": 1879
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.14110948145389557,
+      "learning_rate": 0.00012870103092783505,
+      "loss": 0.6368,
+      "step": 1880
+    },
+    {
+      "epoch": 0.60192,
+      "grad_norm": 0.12137459218502045,
+      "learning_rate": 0.000128659793814433,
+      "loss": 0.6156,
+      "step": 1881
+    },
+    {
+      "epoch": 0.60224,
+      "grad_norm": 0.15284189581871033,
+      "learning_rate": 0.00012861855670103094,
+      "loss": 0.6539,
+      "step": 1882
+    },
+    {
+      "epoch": 0.60256,
+      "grad_norm": 0.1448545604944229,
+      "learning_rate": 0.00012857731958762888,
+      "loss": 0.5931,
+      "step": 1883
+    },
+    {
+      "epoch": 0.60288,
+      "grad_norm": 0.1656656712293625,
+      "learning_rate": 0.0001285360824742268,
+      "loss": 0.6722,
+      "step": 1884
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.15540426969528198,
+      "learning_rate": 0.00012849484536082474,
+      "loss": 0.6907,
+      "step": 1885
+    },
+    {
+      "epoch": 0.60352,
+      "grad_norm": 0.13939699530601501,
+      "learning_rate": 0.0001284536082474227,
+      "loss": 0.7573,
+      "step": 1886
+    },
+    {
+      "epoch": 0.60384,
+      "grad_norm": 0.14229856431484222,
+      "learning_rate": 0.00012841237113402063,
+      "loss": 0.6349,
+      "step": 1887
+    },
+    {
+      "epoch": 0.60416,
+      "grad_norm": 0.13974221050739288,
+      "learning_rate": 0.00012837113402061856,
+      "loss": 0.8005,
+      "step": 1888
+    },
+    {
+      "epoch": 0.60448,
+      "grad_norm": 0.14249245822429657,
+      "learning_rate": 0.00012832989690721652,
+      "loss": 0.6809,
+      "step": 1889
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.1409933716058731,
+      "learning_rate": 0.00012828865979381445,
+      "loss": 0.7416,
+      "step": 1890
+    },
+    {
+      "epoch": 0.60512,
+      "grad_norm": 0.1686551868915558,
+      "learning_rate": 0.00012824742268041238,
+      "loss": 0.749,
+      "step": 1891
+    },
+    {
+      "epoch": 0.60544,
+      "grad_norm": 0.12405146658420563,
+      "learning_rate": 0.0001282061855670103,
+      "loss": 0.7878,
+      "step": 1892
+    },
+    {
+      "epoch": 0.60576,
+      "grad_norm": 0.13999059796333313,
+      "learning_rate": 0.00012816494845360824,
+      "loss": 0.5989,
+      "step": 1893
+    },
+    {
+      "epoch": 0.60608,
+      "grad_norm": 0.1495513916015625,
+      "learning_rate": 0.0001281237113402062,
+      "loss": 0.6428,
+      "step": 1894
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.13758565485477448,
+      "learning_rate": 0.00012808247422680413,
+      "loss": 0.7109,
+      "step": 1895
+    },
+    {
+      "epoch": 0.60672,
+      "grad_norm": 0.14844781160354614,
+      "learning_rate": 0.00012804123711340206,
+      "loss": 0.6306,
+      "step": 1896
+    },
+    {
+      "epoch": 0.60704,
+      "grad_norm": 0.13203153014183044,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 0.7102,
+      "step": 1897
+    },
+    {
+      "epoch": 0.60736,
+      "grad_norm": 0.14242084324359894,
+      "learning_rate": 0.00012795876288659793,
+      "loss": 0.8308,
+      "step": 1898
+    },
+    {
+      "epoch": 0.60768,
+      "grad_norm": 0.12235117703676224,
+      "learning_rate": 0.00012791752577319588,
+      "loss": 0.6965,
+      "step": 1899
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.14560118317604065,
+      "learning_rate": 0.00012787628865979382,
+      "loss": 0.6338,
+      "step": 1900
+    },
+    {
+      "epoch": 0.60832,
+      "grad_norm": 0.14123523235321045,
+      "learning_rate": 0.00012783505154639175,
+      "loss": 0.7186,
+      "step": 1901
+    },
+    {
+      "epoch": 0.60864,
+      "grad_norm": 0.1220543310046196,
+      "learning_rate": 0.0001277938144329897,
+      "loss": 0.5828,
+      "step": 1902
+    },
+    {
+      "epoch": 0.60896,
+      "grad_norm": 0.1339149922132492,
+      "learning_rate": 0.00012775257731958764,
+      "loss": 0.6722,
+      "step": 1903
+    },
+    {
+      "epoch": 0.60928,
+      "grad_norm": 0.1346278339624405,
+      "learning_rate": 0.0001277113402061856,
+      "loss": 0.8396,
+      "step": 1904
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.13202722370624542,
+      "learning_rate": 0.0001276701030927835,
+      "loss": 0.9066,
+      "step": 1905
+    },
+    {
+      "epoch": 0.60992,
+      "grad_norm": 0.1463432013988495,
+      "learning_rate": 0.00012762886597938143,
+      "loss": 0.7598,
+      "step": 1906
+    },
+    {
+      "epoch": 0.61024,
+      "grad_norm": 0.14574944972991943,
+      "learning_rate": 0.0001275876288659794,
+      "loss": 0.621,
+      "step": 1907
+    },
+    {
+      "epoch": 0.61056,
+      "grad_norm": 0.14618104696273804,
+      "learning_rate": 0.00012754639175257732,
+      "loss": 0.5827,
+      "step": 1908
+    },
+    {
+      "epoch": 0.61088,
+      "grad_norm": 0.14066514372825623,
+      "learning_rate": 0.00012750515463917528,
+      "loss": 0.5984,
+      "step": 1909
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.14523790776729584,
+      "learning_rate": 0.0001274639175257732,
+      "loss": 0.6005,
+      "step": 1910
+    },
+    {
+      "epoch": 0.61152,
+      "grad_norm": 0.14804573357105255,
+      "learning_rate": 0.00012742268041237114,
+      "loss": 0.6568,
+      "step": 1911
+    },
+    {
+      "epoch": 0.61184,
+      "grad_norm": 0.1466348022222519,
+      "learning_rate": 0.00012738144329896907,
+      "loss": 0.8652,
+      "step": 1912
+    },
+    {
+      "epoch": 0.61216,
+      "grad_norm": 0.14785926043987274,
+      "learning_rate": 0.000127340206185567,
+      "loss": 0.6374,
+      "step": 1913
+    },
+    {
+      "epoch": 0.61248,
+      "grad_norm": 0.13066139817237854,
+      "learning_rate": 0.00012729896907216496,
+      "loss": 0.5485,
+      "step": 1914
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.14120493829250336,
+      "learning_rate": 0.0001272577319587629,
+      "loss": 0.7202,
+      "step": 1915
+    },
+    {
+      "epoch": 0.61312,
+      "grad_norm": 0.16122666001319885,
+      "learning_rate": 0.00012721649484536082,
+      "loss": 0.7973,
+      "step": 1916
+    },
+    {
+      "epoch": 0.61344,
+      "grad_norm": 0.12795791029930115,
+      "learning_rate": 0.00012717525773195878,
+      "loss": 0.5116,
+      "step": 1917
+    },
+    {
+      "epoch": 0.61376,
+      "grad_norm": 0.14723263680934906,
+      "learning_rate": 0.00012713402061855671,
+      "loss": 0.7565,
+      "step": 1918
+    },
+    {
+      "epoch": 0.61408,
+      "grad_norm": 0.13519616425037384,
+      "learning_rate": 0.00012709278350515464,
+      "loss": 0.6887,
+      "step": 1919
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.14142216742038727,
+      "learning_rate": 0.00012705154639175258,
+      "loss": 0.8053,
+      "step": 1920
+    },
+    {
+      "epoch": 0.61472,
+      "grad_norm": 0.14305110275745392,
+      "learning_rate": 0.0001270103092783505,
+      "loss": 0.6143,
+      "step": 1921
+    },
+    {
+      "epoch": 0.61504,
+      "grad_norm": 0.13851754367351532,
+      "learning_rate": 0.00012696907216494847,
+      "loss": 0.7167,
+      "step": 1922
+    },
+    {
+      "epoch": 0.61536,
+      "grad_norm": 0.12938192486763,
+      "learning_rate": 0.0001269278350515464,
+      "loss": 0.5404,
+      "step": 1923
+    },
+    {
+      "epoch": 0.61568,
+      "grad_norm": 0.13803830742835999,
+      "learning_rate": 0.00012688659793814436,
+      "loss": 0.6398,
+      "step": 1924
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.14019587635993958,
+      "learning_rate": 0.0001268453608247423,
+      "loss": 0.8015,
+      "step": 1925
+    },
+    {
+      "epoch": 0.61632,
+      "grad_norm": 0.15434010326862335,
+      "learning_rate": 0.00012680412371134022,
+      "loss": 0.6866,
+      "step": 1926
+    },
+    {
+      "epoch": 0.61664,
+      "grad_norm": 0.130121111869812,
+      "learning_rate": 0.00012676288659793815,
+      "loss": 0.8558,
+      "step": 1927
+    },
+    {
+      "epoch": 0.61696,
+      "grad_norm": 0.11969605833292007,
+      "learning_rate": 0.00012672164948453608,
+      "loss": 0.7358,
+      "step": 1928
+    },
+    {
+      "epoch": 0.61728,
+      "grad_norm": 0.13726817071437836,
+      "learning_rate": 0.000126680412371134,
+      "loss": 0.7962,
+      "step": 1929
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.16535113751888275,
+      "learning_rate": 0.00012663917525773197,
+      "loss": 0.7808,
+      "step": 1930
+    },
+    {
+      "epoch": 0.61792,
+      "grad_norm": 0.1266782283782959,
+      "learning_rate": 0.0001265979381443299,
+      "loss": 0.6858,
+      "step": 1931
+    },
+    {
+      "epoch": 0.61824,
+      "grad_norm": 0.13304907083511353,
+      "learning_rate": 0.00012655670103092786,
+      "loss": 0.5996,
+      "step": 1932
+    },
+    {
+      "epoch": 0.61856,
+      "grad_norm": 0.1450585424900055,
+      "learning_rate": 0.0001265154639175258,
+      "loss": 0.7357,
+      "step": 1933
+    },
+    {
+      "epoch": 0.61888,
+      "grad_norm": 0.13656657934188843,
+      "learning_rate": 0.0001264742268041237,
+      "loss": 0.7243,
+      "step": 1934
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.12630528211593628,
+      "learning_rate": 0.00012643298969072165,
+      "loss": 0.6664,
+      "step": 1935
+    },
+    {
+      "epoch": 0.61952,
+      "grad_norm": 0.14062075316905975,
+      "learning_rate": 0.00012639175257731958,
+      "loss": 0.6144,
+      "step": 1936
+    },
+    {
+      "epoch": 0.61984,
+      "grad_norm": 0.12736253440380096,
+      "learning_rate": 0.00012635051546391754,
+      "loss": 0.6227,
+      "step": 1937
+    },
+    {
+      "epoch": 0.62016,
+      "grad_norm": 0.14937236905097961,
+      "learning_rate": 0.00012630927835051547,
+      "loss": 0.6411,
+      "step": 1938
+    },
+    {
+      "epoch": 0.62048,
+      "grad_norm": 0.1423880010843277,
+      "learning_rate": 0.0001262680412371134,
+      "loss": 0.7728,
+      "step": 1939
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.1501777172088623,
+      "learning_rate": 0.00012622680412371136,
+      "loss": 0.759,
+      "step": 1940
+    },
+    {
+      "epoch": 0.62112,
+      "grad_norm": 0.13527531921863556,
+      "learning_rate": 0.00012618556701030927,
+      "loss": 0.5421,
+      "step": 1941
+    },
+    {
+      "epoch": 0.62144,
+      "grad_norm": 0.14614367485046387,
+      "learning_rate": 0.00012614432989690723,
+      "loss": 0.6431,
+      "step": 1942
+    },
+    {
+      "epoch": 0.62176,
+      "grad_norm": 0.13307569921016693,
+      "learning_rate": 0.00012610309278350516,
+      "loss": 0.6767,
+      "step": 1943
+    },
+    {
+      "epoch": 0.62208,
+      "grad_norm": 0.13808198273181915,
+      "learning_rate": 0.0001260618556701031,
+      "loss": 0.6282,
+      "step": 1944
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.12743665277957916,
+      "learning_rate": 0.00012602061855670105,
+      "loss": 0.5825,
+      "step": 1945
+    },
+    {
+      "epoch": 0.62272,
+      "grad_norm": 0.1567707061767578,
+      "learning_rate": 0.00012597938144329898,
+      "loss": 0.7175,
+      "step": 1946
+    },
+    {
+      "epoch": 0.62304,
+      "grad_norm": 0.12966367602348328,
+      "learning_rate": 0.0001259381443298969,
+      "loss": 0.6444,
+      "step": 1947
+    },
+    {
+      "epoch": 0.62336,
+      "grad_norm": 0.12914657592773438,
+      "learning_rate": 0.00012589690721649484,
+      "loss": 0.5537,
+      "step": 1948
+    },
+    {
+      "epoch": 0.62368,
+      "grad_norm": 0.1428314447402954,
+      "learning_rate": 0.00012585567010309277,
+      "loss": 0.7166,
+      "step": 1949
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.14030608534812927,
+      "learning_rate": 0.00012581443298969073,
+      "loss": 0.7611,
+      "step": 1950
+    },
+    {
+      "epoch": 0.62432,
+      "grad_norm": 0.13528037071228027,
+      "learning_rate": 0.00012577319587628866,
+      "loss": 0.665,
+      "step": 1951
+    },
+    {
+      "epoch": 0.62464,
+      "grad_norm": 0.15317410230636597,
+      "learning_rate": 0.00012573195876288662,
+      "loss": 0.6768,
+      "step": 1952
+    },
+    {
+      "epoch": 0.62496,
+      "grad_norm": 0.146495059132576,
+      "learning_rate": 0.00012569072164948455,
+      "loss": 0.596,
+      "step": 1953
+    },
+    {
+      "epoch": 0.62528,
+      "grad_norm": 0.15675994753837585,
+      "learning_rate": 0.00012564948453608248,
+      "loss": 0.7209,
+      "step": 1954
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.1454465538263321,
+      "learning_rate": 0.00012560824742268041,
+      "loss": 0.8546,
+      "step": 1955
+    },
+    {
+      "epoch": 0.62592,
+      "grad_norm": 0.11702533811330795,
+      "learning_rate": 0.00012556701030927835,
+      "loss": 0.6069,
+      "step": 1956
+    },
+    {
+      "epoch": 0.62624,
+      "grad_norm": 0.15057525038719177,
+      "learning_rate": 0.00012552577319587628,
+      "loss": 0.5768,
+      "step": 1957
+    },
+    {
+      "epoch": 0.62656,
+      "grad_norm": 0.1403677612543106,
+      "learning_rate": 0.00012548453608247424,
+      "loss": 0.7117,
+      "step": 1958
+    },
+    {
+      "epoch": 0.62688,
+      "grad_norm": 0.1540384590625763,
+      "learning_rate": 0.00012544329896907217,
+      "loss": 0.7182,
+      "step": 1959
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.16401363909244537,
+      "learning_rate": 0.00012540206185567013,
+      "loss": 0.8016,
+      "step": 1960
+    },
+    {
+      "epoch": 0.62752,
+      "grad_norm": 0.15659166872501373,
+      "learning_rate": 0.00012536082474226806,
+      "loss": 0.5818,
+      "step": 1961
+    },
+    {
+      "epoch": 0.62784,
+      "grad_norm": 0.1653420329093933,
+      "learning_rate": 0.000125319587628866,
+      "loss": 0.7166,
+      "step": 1962
+    },
+    {
+      "epoch": 0.62816,
+      "grad_norm": 0.14746549725532532,
+      "learning_rate": 0.00012527835051546392,
+      "loss": 0.6868,
+      "step": 1963
+    },
+    {
+      "epoch": 0.62848,
+      "grad_norm": 0.13338971138000488,
+      "learning_rate": 0.00012523711340206185,
+      "loss": 0.7502,
+      "step": 1964
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.16360381245613098,
+      "learning_rate": 0.0001251958762886598,
+      "loss": 0.7824,
+      "step": 1965
+    },
+    {
+      "epoch": 0.62912,
+      "grad_norm": 0.15600335597991943,
+      "learning_rate": 0.00012515463917525774,
+      "loss": 0.7415,
+      "step": 1966
+    },
+    {
+      "epoch": 0.62944,
+      "grad_norm": 0.1535622477531433,
+      "learning_rate": 0.00012511340206185567,
+      "loss": 0.8199,
+      "step": 1967
+    },
+    {
+      "epoch": 0.62976,
+      "grad_norm": 0.14531464874744415,
+      "learning_rate": 0.00012507216494845363,
+      "loss": 0.5838,
+      "step": 1968
+    },
+    {
+      "epoch": 0.63008,
+      "grad_norm": 0.14213936030864716,
+      "learning_rate": 0.00012503092783505156,
+      "loss": 0.9415,
+      "step": 1969
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.15627896785736084,
+      "learning_rate": 0.0001249896907216495,
+      "loss": 0.7646,
+      "step": 1970
+    },
+    {
+      "epoch": 0.63072,
+      "grad_norm": 0.14554524421691895,
+      "learning_rate": 0.00012494845360824742,
+      "loss": 0.793,
+      "step": 1971
+    },
+    {
+      "epoch": 0.63104,
+      "grad_norm": 0.14182400703430176,
+      "learning_rate": 0.00012490721649484535,
+      "loss": 0.7185,
+      "step": 1972
+    },
+    {
+      "epoch": 0.63136,
+      "grad_norm": 0.15984246134757996,
+      "learning_rate": 0.0001248659793814433,
+      "loss": 0.6947,
+      "step": 1973
+    },
+    {
+      "epoch": 0.63168,
+      "grad_norm": 0.143141508102417,
+      "learning_rate": 0.00012482474226804124,
+      "loss": 0.694,
+      "step": 1974
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.1503412425518036,
+      "learning_rate": 0.0001247835051546392,
+      "loss": 0.5724,
+      "step": 1975
+    },
+    {
+      "epoch": 0.63232,
+      "grad_norm": 0.15339024364948273,
+      "learning_rate": 0.0001247422680412371,
+      "loss": 0.6489,
+      "step": 1976
+    },
+    {
+      "epoch": 0.63264,
+      "grad_norm": 0.14561650156974792,
+      "learning_rate": 0.00012470103092783504,
+      "loss": 0.6682,
+      "step": 1977
+    },
+    {
+      "epoch": 0.63296,
+      "grad_norm": 0.13642360270023346,
+      "learning_rate": 0.000124659793814433,
+      "loss": 0.7385,
+      "step": 1978
+    },
+    {
+      "epoch": 0.63328,
+      "grad_norm": 0.13638220727443695,
+      "learning_rate": 0.00012461855670103093,
+      "loss": 0.5501,
+      "step": 1979
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.14778625965118408,
+      "learning_rate": 0.00012457731958762889,
+      "loss": 0.6875,
+      "step": 1980
+    },
+    {
+      "epoch": 0.63392,
+      "grad_norm": 0.1807885617017746,
+      "learning_rate": 0.00012453608247422682,
+      "loss": 0.7934,
+      "step": 1981
+    },
+    {
+      "epoch": 0.63424,
+      "grad_norm": 0.1617950052022934,
+      "learning_rate": 0.00012449484536082475,
+      "loss": 0.7168,
+      "step": 1982
+    },
+    {
+      "epoch": 0.63456,
+      "grad_norm": 0.1469191014766693,
+      "learning_rate": 0.00012445360824742268,
+      "loss": 0.6633,
+      "step": 1983
+    },
+    {
+      "epoch": 0.63488,
+      "grad_norm": 0.14085760712623596,
+      "learning_rate": 0.0001244123711340206,
+      "loss": 0.6816,
+      "step": 1984
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.13655433058738708,
+      "learning_rate": 0.00012437113402061854,
+      "loss": 0.6451,
+      "step": 1985
+    },
+    {
+      "epoch": 0.63552,
+      "grad_norm": 0.1581992506980896,
+      "learning_rate": 0.0001243298969072165,
+      "loss": 0.7158,
+      "step": 1986
+    },
+    {
+      "epoch": 0.63584,
+      "grad_norm": 0.13187338411808014,
+      "learning_rate": 0.00012428865979381443,
+      "loss": 0.5549,
+      "step": 1987
+    },
+    {
+      "epoch": 0.63616,
+      "grad_norm": 0.13619250059127808,
+      "learning_rate": 0.0001242474226804124,
+      "loss": 0.9195,
+      "step": 1988
+    },
+    {
+      "epoch": 0.63648,
+      "grad_norm": 0.13870079815387726,
+      "learning_rate": 0.00012420618556701032,
+      "loss": 0.5944,
+      "step": 1989
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.12829461693763733,
+      "learning_rate": 0.00012416494845360825,
+      "loss": 0.7286,
+      "step": 1990
+    },
+    {
+      "epoch": 0.63712,
+      "grad_norm": 0.13943707942962646,
+      "learning_rate": 0.00012412371134020618,
+      "loss": 0.5972,
+      "step": 1991
+    },
+    {
+      "epoch": 0.63744,
+      "grad_norm": 0.13263818621635437,
+      "learning_rate": 0.00012408247422680412,
+      "loss": 0.797,
+      "step": 1992
+    },
+    {
+      "epoch": 0.63776,
+      "grad_norm": 0.13591541349887848,
+      "learning_rate": 0.00012404123711340207,
+      "loss": 0.7167,
+      "step": 1993
+    },
+    {
+      "epoch": 0.63808,
+      "grad_norm": 0.14553944766521454,
+      "learning_rate": 0.000124,
+      "loss": 0.649,
+      "step": 1994
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.15163764357566833,
+      "learning_rate": 0.00012395876288659794,
+      "loss": 0.7781,
+      "step": 1995
+    },
+    {
+      "epoch": 0.63872,
+      "grad_norm": 0.15245743095874786,
+      "learning_rate": 0.0001239175257731959,
+      "loss": 0.6715,
+      "step": 1996
+    },
+    {
+      "epoch": 0.63904,
+      "grad_norm": 0.1749904602766037,
+      "learning_rate": 0.00012387628865979383,
+      "loss": 0.692,
+      "step": 1997
+    },
+    {
+      "epoch": 0.63936,
+      "grad_norm": 0.13300080597400665,
+      "learning_rate": 0.00012383505154639176,
+      "loss": 0.5676,
+      "step": 1998
+    },
+    {
+      "epoch": 0.63968,
+      "grad_norm": 0.15774020552635193,
+      "learning_rate": 0.0001237938144329897,
+      "loss": 0.7337,
+      "step": 1999
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.13856618106365204,
+      "learning_rate": 0.00012375257731958762,
+      "loss": 0.5507,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 5000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 250,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.58513473816875e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}