diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last-checkpoint/trainer_state.json"
@@ -0,0 +1,15232 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.27506654835847383,
+  "eval_steps": 2170,
+  "global_step": 2170,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00012675877804537965,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0,
+      "loss": 2.9836,
+      "step": 1
+    },
+    {
+      "epoch": 0.0002535175560907593,
+      "grad_norm": 2.734375,
+      "learning_rate": 2.7649769585253456e-07,
+      "loss": 3.1814,
+      "step": 2
+    },
+    {
+      "epoch": 0.0003802763341361389,
+      "grad_norm": 3.21875,
+      "learning_rate": 5.529953917050691e-07,
+      "loss": 3.3287,
+      "step": 3
+    },
+    {
+      "epoch": 0.0005070351121815186,
+      "grad_norm": 2.8125,
+      "learning_rate": 8.294930875576038e-07,
+      "loss": 3.162,
+      "step": 4
+    },
+    {
+      "epoch": 0.0006337938902268982,
+      "grad_norm": 2.359375,
+      "learning_rate": 1.1059907834101382e-06,
+      "loss": 3.1274,
+      "step": 5
+    },
+    {
+      "epoch": 0.0007605526682722779,
+      "grad_norm": 2.28125,
+      "learning_rate": 1.3824884792626729e-06,
+      "loss": 3.4527,
+      "step": 6
+    },
+    {
+      "epoch": 0.0008873114463176575,
+      "grad_norm": 3.25,
+      "learning_rate": 1.6589861751152075e-06,
+      "loss": 3.134,
+      "step": 7
+    },
+    {
+      "epoch": 0.0010140702243630372,
+      "grad_norm": 2.25,
+      "learning_rate": 1.935483870967742e-06,
+      "loss": 3.1991,
+      "step": 8
+    },
+    {
+      "epoch": 0.0011408290024084169,
+      "grad_norm": 2.890625,
+      "learning_rate": 2.2119815668202764e-06,
+      "loss": 3.4574,
+      "step": 9
+    },
+    {
+      "epoch": 0.0012675877804537963,
+      "grad_norm": 2.921875,
+      "learning_rate": 2.4884792626728113e-06,
+      "loss": 2.6411,
+      "step": 10
+    },
+    {
+      "epoch": 0.001394346558499176,
+      "grad_norm": 2.75,
+      "learning_rate": 2.7649769585253458e-06,
+      "loss": 3.4507,
+      "step": 11
+    },
+    {
+      "epoch": 0.0015211053365445557,
+      "grad_norm": 2.609375,
+      "learning_rate": 3.0414746543778802e-06,
+      "loss": 3.0045,
+      "step": 12
+    },
+    {
+      "epoch": 0.0016478641145899354,
+      "grad_norm": 2.375,
+      "learning_rate": 3.317972350230415e-06,
+      "loss": 3.2921,
+      "step": 13
+    },
+    {
+      "epoch": 0.001774622892635315,
+      "grad_norm": 2.875,
+      "learning_rate": 3.594470046082949e-06,
+      "loss": 2.8938,
+      "step": 14
+    },
+    {
+      "epoch": 0.0019013816706806947,
+      "grad_norm": 2.15625,
+      "learning_rate": 3.870967741935484e-06,
+      "loss": 3.0096,
+      "step": 15
+    },
+    {
+      "epoch": 0.0020281404487260744,
+      "grad_norm": 2.65625,
+      "learning_rate": 4.147465437788019e-06,
+      "loss": 3.0827,
+      "step": 16
+    },
+    {
+      "epoch": 0.002154899226771454,
+      "grad_norm": 2.28125,
+      "learning_rate": 4.423963133640553e-06,
+      "loss": 2.9214,
+      "step": 17
+    },
+    {
+      "epoch": 0.0022816580048168338,
+      "grad_norm": 2.640625,
+      "learning_rate": 4.700460829493087e-06,
+      "loss": 3.0053,
+      "step": 18
+    },
+    {
+      "epoch": 0.002408416782862213,
+      "grad_norm": 1.984375,
+      "learning_rate": 4.976958525345623e-06,
+      "loss": 2.8434,
+      "step": 19
+    },
+    {
+      "epoch": 0.0025351755609075927,
+      "grad_norm": 2.53125,
+      "learning_rate": 5.253456221198157e-06,
+      "loss": 3.1882,
+      "step": 20
+    },
+    {
+      "epoch": 0.0026619343389529724,
+      "grad_norm": 2.171875,
+      "learning_rate": 5.5299539170506915e-06,
+      "loss": 3.3649,
+      "step": 21
+    },
+    {
+      "epoch": 0.002788693116998352,
+      "grad_norm": 2.703125,
+      "learning_rate": 5.8064516129032256e-06,
+      "loss": 4.069,
+      "step": 22
+    },
+    {
+      "epoch": 0.0029154518950437317,
+      "grad_norm": 2.28125,
+      "learning_rate": 6.0829493087557604e-06,
+      "loss": 3.3636,
+      "step": 23
+    },
+    {
+      "epoch": 0.0030422106730891114,
+      "grad_norm": 2.625,
+      "learning_rate": 6.359447004608295e-06,
+      "loss": 3.2612,
+      "step": 24
+    },
+    {
+      "epoch": 0.003168969451134491,
+      "grad_norm": 2.515625,
+      "learning_rate": 6.63594470046083e-06,
+      "loss": 3.2926,
+      "step": 25
+    },
+    {
+      "epoch": 0.0032957282291798708,
+      "grad_norm": 2.375,
+      "learning_rate": 6.912442396313364e-06,
+      "loss": 3.5472,
+      "step": 26
+    },
+    {
+      "epoch": 0.0034224870072252504,
+      "grad_norm": 2.609375,
+      "learning_rate": 7.188940092165898e-06,
+      "loss": 3.345,
+      "step": 27
+    },
+    {
+      "epoch": 0.00354924578527063,
+      "grad_norm": 2.578125,
+      "learning_rate": 7.465437788018433e-06,
+      "loss": 2.7514,
+      "step": 28
+    },
+    {
+      "epoch": 0.00367600456331601,
+      "grad_norm": 1.8828125,
+      "learning_rate": 7.741935483870968e-06,
+      "loss": 2.7626,
+      "step": 29
+    },
+    {
+      "epoch": 0.0038027633413613895,
+      "grad_norm": 2.421875,
+      "learning_rate": 8.018433179723503e-06,
+      "loss": 2.8706,
+      "step": 30
+    },
+    {
+      "epoch": 0.003929522119406769,
+      "grad_norm": 2.546875,
+      "learning_rate": 8.294930875576038e-06,
+      "loss": 3.2619,
+      "step": 31
+    },
+    {
+      "epoch": 0.004056280897452149,
+      "grad_norm": 2.78125,
+      "learning_rate": 8.571428571428571e-06,
+      "loss": 3.3856,
+      "step": 32
+    },
+    {
+      "epoch": 0.004183039675497528,
+      "grad_norm": 2.1875,
+      "learning_rate": 8.847926267281106e-06,
+      "loss": 3.0388,
+      "step": 33
+    },
+    {
+      "epoch": 0.004309798453542908,
+      "grad_norm": 1.9296875,
+      "learning_rate": 9.12442396313364e-06,
+      "loss": 2.8448,
+      "step": 34
+    },
+    {
+      "epoch": 0.0044365572315882874,
+      "grad_norm": 2.390625,
+      "learning_rate": 9.400921658986174e-06,
+      "loss": 3.1973,
+      "step": 35
+    },
+    {
+      "epoch": 0.0045633160096336675,
+      "grad_norm": 1.671875,
+      "learning_rate": 9.67741935483871e-06,
+      "loss": 2.9847,
+      "step": 36
+    },
+    {
+      "epoch": 0.004690074787679047,
+      "grad_norm": 2.59375,
+      "learning_rate": 9.953917050691245e-06,
+      "loss": 3.7099,
+      "step": 37
+    },
+    {
+      "epoch": 0.004816833565724426,
+      "grad_norm": 2.3125,
+      "learning_rate": 1.023041474654378e-05,
+      "loss": 3.0622,
+      "step": 38
+    },
+    {
+      "epoch": 0.004943592343769806,
+      "grad_norm": 2.15625,
+      "learning_rate": 1.0506912442396313e-05,
+      "loss": 2.7991,
+      "step": 39
+    },
+    {
+      "epoch": 0.005070351121815185,
+      "grad_norm": 2.28125,
+      "learning_rate": 1.0783410138248848e-05,
+      "loss": 2.8201,
+      "step": 40
+    },
+    {
+      "epoch": 0.0051971098998605655,
+      "grad_norm": 1.9921875,
+      "learning_rate": 1.1059907834101383e-05,
+      "loss": 3.2719,
+      "step": 41
+    },
+    {
+      "epoch": 0.005323868677905945,
+      "grad_norm": 1.734375,
+      "learning_rate": 1.1336405529953916e-05,
+      "loss": 3.2083,
+      "step": 42
+    },
+    {
+      "epoch": 0.005450627455951325,
+      "grad_norm": 2.359375,
+      "learning_rate": 1.1612903225806451e-05,
+      "loss": 2.871,
+      "step": 43
+    },
+    {
+      "epoch": 0.005577386233996704,
+      "grad_norm": 1.96875,
+      "learning_rate": 1.1889400921658986e-05,
+      "loss": 2.6081,
+      "step": 44
+    },
+    {
+      "epoch": 0.005704145012042084,
+      "grad_norm": 2.375,
+      "learning_rate": 1.2165898617511521e-05,
+      "loss": 3.0757,
+      "step": 45
+    },
+    {
+      "epoch": 0.0058309037900874635,
+      "grad_norm": 2.609375,
+      "learning_rate": 1.2442396313364056e-05,
+      "loss": 2.806,
+      "step": 46
+    },
+    {
+      "epoch": 0.005957662568132844,
+      "grad_norm": 2.484375,
+      "learning_rate": 1.271889400921659e-05,
+      "loss": 3.236,
+      "step": 47
+    },
+    {
+      "epoch": 0.006084421346178223,
+      "grad_norm": 2.328125,
+      "learning_rate": 1.2995391705069126e-05,
+      "loss": 3.3716,
+      "step": 48
+    },
+    {
+      "epoch": 0.006211180124223602,
+      "grad_norm": 1.6953125,
+      "learning_rate": 1.327188940092166e-05,
+      "loss": 2.9337,
+      "step": 49
+    },
+    {
+      "epoch": 0.006337938902268982,
+      "grad_norm": 2.40625,
+      "learning_rate": 1.3548387096774194e-05,
+      "loss": 3.1984,
+      "step": 50
+    },
+    {
+      "epoch": 0.006464697680314361,
+      "grad_norm": 1.71875,
+      "learning_rate": 1.3824884792626728e-05,
+      "loss": 2.7284,
+      "step": 51
+    },
+    {
+      "epoch": 0.0065914564583597415,
+      "grad_norm": 1.8046875,
+      "learning_rate": 1.4101382488479263e-05,
+      "loss": 2.902,
+      "step": 52
+    },
+    {
+      "epoch": 0.006718215236405121,
+      "grad_norm": 2.3125,
+      "learning_rate": 1.4377880184331796e-05,
+      "loss": 3.1462,
+      "step": 53
+    },
+    {
+      "epoch": 0.006844974014450501,
+      "grad_norm": 2.359375,
+      "learning_rate": 1.4654377880184331e-05,
+      "loss": 2.921,
+      "step": 54
+    },
+    {
+      "epoch": 0.00697173279249588,
+      "grad_norm": 2.5625,
+      "learning_rate": 1.4930875576036866e-05,
+      "loss": 2.8419,
+      "step": 55
+    },
+    {
+      "epoch": 0.00709849157054126,
+      "grad_norm": 1.875,
+      "learning_rate": 1.5207373271889403e-05,
+      "loss": 2.9957,
+      "step": 56
+    },
+    {
+      "epoch": 0.0072252503485866395,
+      "grad_norm": 2.0625,
+      "learning_rate": 1.5483870967741936e-05,
+      "loss": 2.7474,
+      "step": 57
+    },
+    {
+      "epoch": 0.00735200912663202,
+      "grad_norm": 2.59375,
+      "learning_rate": 1.576036866359447e-05,
+      "loss": 3.5225,
+      "step": 58
+    },
+    {
+      "epoch": 0.007478767904677399,
+      "grad_norm": 2.4375,
+      "learning_rate": 1.6036866359447006e-05,
+      "loss": 2.9143,
+      "step": 59
+    },
+    {
+      "epoch": 0.007605526682722779,
+      "grad_norm": 1.859375,
+      "learning_rate": 1.631336405529954e-05,
+      "loss": 2.8144,
+      "step": 60
+    },
+    {
+      "epoch": 0.007732285460768158,
+      "grad_norm": 2.03125,
+      "learning_rate": 1.6589861751152075e-05,
+      "loss": 2.9237,
+      "step": 61
+    },
+    {
+      "epoch": 0.007859044238813537,
+      "grad_norm": 2.390625,
+      "learning_rate": 1.686635944700461e-05,
+      "loss": 3.0369,
+      "step": 62
+    },
+    {
+      "epoch": 0.007985803016858917,
+      "grad_norm": 2.375,
+      "learning_rate": 1.7142857142857142e-05,
+      "loss": 2.514,
+      "step": 63
+    },
+    {
+      "epoch": 0.008112561794904298,
+      "grad_norm": 2.734375,
+      "learning_rate": 1.741935483870968e-05,
+      "loss": 2.7245,
+      "step": 64
+    },
+    {
+      "epoch": 0.008239320572949677,
+      "grad_norm": 2.359375,
+      "learning_rate": 1.769585253456221e-05,
+      "loss": 2.7239,
+      "step": 65
+    },
+    {
+      "epoch": 0.008366079350995056,
+      "grad_norm": 1.9375,
+      "learning_rate": 1.7972350230414745e-05,
+      "loss": 2.9843,
+      "step": 66
+    },
+    {
+      "epoch": 0.008492838129040435,
+      "grad_norm": 2.34375,
+      "learning_rate": 1.824884792626728e-05,
+      "loss": 2.9487,
+      "step": 67
+    },
+    {
+      "epoch": 0.008619596907085816,
+      "grad_norm": 2.0625,
+      "learning_rate": 1.8525345622119815e-05,
+      "loss": 3.063,
+      "step": 68
+    },
+    {
+      "epoch": 0.008746355685131196,
+      "grad_norm": 2.0,
+      "learning_rate": 1.8801843317972348e-05,
+      "loss": 2.6854,
+      "step": 69
+    },
+    {
+      "epoch": 0.008873114463176575,
+      "grad_norm": 2.203125,
+      "learning_rate": 1.9078341013824884e-05,
+      "loss": 2.663,
+      "step": 70
+    },
+    {
+      "epoch": 0.008999873241221954,
+      "grad_norm": 1.90625,
+      "learning_rate": 1.935483870967742e-05,
+      "loss": 2.9695,
+      "step": 71
+    },
+    {
+      "epoch": 0.009126632019267335,
+      "grad_norm": 1.8359375,
+      "learning_rate": 1.9631336405529957e-05,
+      "loss": 2.4734,
+      "step": 72
+    },
+    {
+      "epoch": 0.009253390797312714,
+      "grad_norm": 3.015625,
+      "learning_rate": 1.990783410138249e-05,
+      "loss": 2.9538,
+      "step": 73
+    },
+    {
+      "epoch": 0.009380149575358094,
+      "grad_norm": 2.015625,
+      "learning_rate": 2.0184331797235024e-05,
+      "loss": 2.6884,
+      "step": 74
+    },
+    {
+      "epoch": 0.009506908353403473,
+      "grad_norm": 3.109375,
+      "learning_rate": 2.046082949308756e-05,
+      "loss": 2.6867,
+      "step": 75
+    },
+    {
+      "epoch": 0.009633667131448852,
+      "grad_norm": 1.5390625,
+      "learning_rate": 2.0737327188940094e-05,
+      "loss": 2.5821,
+      "step": 76
+    },
+    {
+      "epoch": 0.009760425909494233,
+      "grad_norm": 1.890625,
+      "learning_rate": 2.1013824884792627e-05,
+      "loss": 2.3303,
+      "step": 77
+    },
+    {
+      "epoch": 0.009887184687539612,
+      "grad_norm": 1.3515625,
+      "learning_rate": 2.1290322580645163e-05,
+      "loss": 2.6871,
+      "step": 78
+    },
+    {
+      "epoch": 0.010013943465584992,
+      "grad_norm": 2.390625,
+      "learning_rate": 2.1566820276497696e-05,
+      "loss": 2.8288,
+      "step": 79
+    },
+    {
+      "epoch": 0.01014070224363037,
+      "grad_norm": 1.4296875,
+      "learning_rate": 2.184331797235023e-05,
+      "loss": 2.5753,
+      "step": 80
+    },
+    {
+      "epoch": 0.010267461021675752,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.2119815668202766e-05,
+      "loss": 2.9023,
+      "step": 81
+    },
+    {
+      "epoch": 0.010394219799721131,
+      "grad_norm": 1.453125,
+      "learning_rate": 2.23963133640553e-05,
+      "loss": 2.7125,
+      "step": 82
+    },
+    {
+      "epoch": 0.01052097857776651,
+      "grad_norm": 1.4765625,
+      "learning_rate": 2.2672811059907833e-05,
+      "loss": 2.9662,
+      "step": 83
+    },
+    {
+      "epoch": 0.01064773735581189,
+      "grad_norm": 1.625,
+      "learning_rate": 2.294930875576037e-05,
+      "loss": 2.606,
+      "step": 84
+    },
+    {
+      "epoch": 0.01077449613385727,
+      "grad_norm": 1.8671875,
+      "learning_rate": 2.3225806451612902e-05,
+      "loss": 2.6132,
+      "step": 85
+    },
+    {
+      "epoch": 0.01090125491190265,
+      "grad_norm": 1.640625,
+      "learning_rate": 2.350230414746544e-05,
+      "loss": 2.3759,
+      "step": 86
+    },
+    {
+      "epoch": 0.011028013689948029,
+      "grad_norm": 2.40625,
+      "learning_rate": 2.3778801843317972e-05,
+      "loss": 2.4852,
+      "step": 87
+    },
+    {
+      "epoch": 0.011154772467993408,
+      "grad_norm": 1.375,
+      "learning_rate": 2.4055299539170505e-05,
+      "loss": 2.3836,
+      "step": 88
+    },
+    {
+      "epoch": 0.011281531246038787,
+      "grad_norm": 1.6796875,
+      "learning_rate": 2.4331797235023042e-05,
+      "loss": 2.8551,
+      "step": 89
+    },
+    {
+      "epoch": 0.011408290024084168,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.460829493087558e-05,
+      "loss": 2.3646,
+      "step": 90
+    },
+    {
+      "epoch": 0.011535048802129548,
+      "grad_norm": 1.546875,
+      "learning_rate": 2.488479262672811e-05,
+      "loss": 2.247,
+      "step": 91
+    },
+    {
+      "epoch": 0.011661807580174927,
+      "grad_norm": 1.28125,
+      "learning_rate": 2.5161290322580648e-05,
+      "loss": 2.5951,
+      "step": 92
+    },
+    {
+      "epoch": 0.011788566358220306,
+      "grad_norm": 1.625,
+      "learning_rate": 2.543778801843318e-05,
+      "loss": 2.4882,
+      "step": 93
+    },
+    {
+      "epoch": 0.011915325136265687,
+      "grad_norm": 1.8125,
+      "learning_rate": 2.5714285714285714e-05,
+      "loss": 2.5159,
+      "step": 94
+    },
+    {
+      "epoch": 0.012042083914311066,
+      "grad_norm": 1.3125,
+      "learning_rate": 2.599078341013825e-05,
+      "loss": 2.5952,
+      "step": 95
+    },
+    {
+      "epoch": 0.012168842692356446,
+      "grad_norm": 1.6875,
+      "learning_rate": 2.6267281105990784e-05,
+      "loss": 2.3814,
+      "step": 96
+    },
+    {
+      "epoch": 0.012295601470401825,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.654377880184332e-05,
+      "loss": 2.6268,
+      "step": 97
+    },
+    {
+      "epoch": 0.012422360248447204,
+      "grad_norm": 1.71875,
+      "learning_rate": 2.6820276497695854e-05,
+      "loss": 2.7113,
+      "step": 98
+    },
+    {
+      "epoch": 0.012549119026492585,
+      "grad_norm": 1.2890625,
+      "learning_rate": 2.7096774193548387e-05,
+      "loss": 2.0994,
+      "step": 99
+    },
+    {
+      "epoch": 0.012675877804537964,
+      "grad_norm": 1.65625,
+      "learning_rate": 2.7373271889400924e-05,
+      "loss": 2.1403,
+      "step": 100
+    },
+    {
+      "epoch": 0.012802636582583344,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.7649769585253457e-05,
+      "loss": 2.1037,
+      "step": 101
+    },
+    {
+      "epoch": 0.012929395360628723,
+      "grad_norm": 1.5078125,
+      "learning_rate": 2.792626728110599e-05,
+      "loss": 2.3441,
+      "step": 102
+    },
+    {
+      "epoch": 0.013056154138674104,
+      "grad_norm": 1.5703125,
+      "learning_rate": 2.8202764976958527e-05,
+      "loss": 2.4583,
+      "step": 103
+    },
+    {
+      "epoch": 0.013182912916719483,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.847926267281106e-05,
+      "loss": 2.278,
+      "step": 104
+    },
+    {
+      "epoch": 0.013309671694764862,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.8755760368663593e-05,
+      "loss": 2.212,
+      "step": 105
+    },
+    {
+      "epoch": 0.013436430472810242,
+      "grad_norm": 1.390625,
+      "learning_rate": 2.903225806451613e-05,
+      "loss": 2.0267,
+      "step": 106
+    },
+    {
+      "epoch": 0.013563189250855623,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.9308755760368663e-05,
+      "loss": 2.1103,
+      "step": 107
+    },
+    {
+      "epoch": 0.013689948028901002,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.9585253456221196e-05,
+      "loss": 2.2625,
+      "step": 108
+    },
+    {
+      "epoch": 0.013816706806946381,
+      "grad_norm": 2.296875,
+      "learning_rate": 2.9861751152073732e-05,
+      "loss": 2.0816,
+      "step": 109
+    },
+    {
+      "epoch": 0.01394346558499176,
+      "grad_norm": 1.3125,
+      "learning_rate": 3.0138248847926272e-05,
+      "loss": 2.191,
+      "step": 110
+    },
+    {
+      "epoch": 0.01407022436303714,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.0414746543778806e-05,
+      "loss": 2.4229,
+      "step": 111
+    },
+    {
+      "epoch": 0.01419698314108252,
+      "grad_norm": 1.3828125,
+      "learning_rate": 3.0691244239631335e-05,
+      "loss": 2.3382,
+      "step": 112
+    },
+    {
+      "epoch": 0.0143237419191279,
+      "grad_norm": 1.234375,
+      "learning_rate": 3.096774193548387e-05,
+      "loss": 2.3096,
+      "step": 113
+    },
+    {
+      "epoch": 0.014450500697173279,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.12442396313364e-05,
+      "loss": 2.5878,
+      "step": 114
+    },
+    {
+      "epoch": 0.014577259475218658,
+      "grad_norm": 1.28125,
+      "learning_rate": 3.152073732718894e-05,
+      "loss": 2.0876,
+      "step": 115
+    },
+    {
+      "epoch": 0.01470401825326404,
+      "grad_norm": 1.2109375,
+      "learning_rate": 3.1797235023041475e-05,
+      "loss": 2.1557,
+      "step": 116
+    },
+    {
+      "epoch": 0.014830777031309418,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.207373271889401e-05,
+      "loss": 2.1464,
+      "step": 117
+    },
+    {
+      "epoch": 0.014957535809354798,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.235023041474654e-05,
+      "loss": 2.7055,
+      "step": 118
+    },
+    {
+      "epoch": 0.015084294587400177,
+      "grad_norm": 1.25,
+      "learning_rate": 3.262672811059908e-05,
+      "loss": 2.5028,
+      "step": 119
+    },
+    {
+      "epoch": 0.015211053365445558,
+      "grad_norm": 1.5,
+      "learning_rate": 3.2903225806451614e-05,
+      "loss": 2.3636,
+      "step": 120
+    },
+    {
+      "epoch": 0.015337812143490937,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.317972350230415e-05,
+      "loss": 2.5675,
+      "step": 121
+    },
+    {
+      "epoch": 0.015464570921536316,
+      "grad_norm": 1.4140625,
+      "learning_rate": 3.345622119815669e-05,
+      "loss": 2.2406,
+      "step": 122
+    },
+    {
+      "epoch": 0.015591329699581696,
+      "grad_norm": 1.3125,
+      "learning_rate": 3.373271889400922e-05,
+      "loss": 2.3789,
+      "step": 123
+    },
+    {
+      "epoch": 0.015718088477627075,
+      "grad_norm": 1.234375,
+      "learning_rate": 3.4009216589861754e-05,
+      "loss": 2.4767,
+      "step": 124
+    },
+    {
+      "epoch": 0.015844847255672454,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.4285714285714284e-05,
+      "loss": 2.3279,
+      "step": 125
+    },
+    {
+      "epoch": 0.015971606033717833,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.456221198156682e-05,
+      "loss": 2.2047,
+      "step": 126
+    },
+    {
+      "epoch": 0.016098364811763216,
+      "grad_norm": 1.0390625,
+      "learning_rate": 3.483870967741936e-05,
+      "loss": 2.8244,
+      "step": 127
+    },
+    {
+      "epoch": 0.016225123589808595,
+      "grad_norm": 1.4140625,
+      "learning_rate": 3.511520737327189e-05,
+      "loss": 1.9411,
+      "step": 128
+    },
+    {
+      "epoch": 0.016351882367853975,
+      "grad_norm": 1.328125,
+      "learning_rate": 3.539170506912442e-05,
+      "loss": 2.1226,
+      "step": 129
+    },
+    {
+      "epoch": 0.016478641145899354,
+      "grad_norm": 1.2265625,
+      "learning_rate": 3.566820276497696e-05,
+      "loss": 2.353,
+      "step": 130
+    },
+    {
+      "epoch": 0.016605399923944733,
+      "grad_norm": 1.3046875,
+      "learning_rate": 3.594470046082949e-05,
+      "loss": 2.2297,
+      "step": 131
+    },
+    {
+      "epoch": 0.016732158701990112,
+      "grad_norm": 1.1953125,
+      "learning_rate": 3.622119815668203e-05,
+      "loss": 2.6692,
+      "step": 132
+    },
+    {
+      "epoch": 0.01685891748003549,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.649769585253456e-05,
+      "loss": 2.0721,
+      "step": 133
+    },
+    {
+      "epoch": 0.01698567625808087,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.67741935483871e-05,
+      "loss": 2.0208,
+      "step": 134
+    },
+    {
+      "epoch": 0.01711243503612625,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.705069124423963e-05,
+      "loss": 2.6096,
+      "step": 135
+    },
+    {
+      "epoch": 0.017239193814171633,
+      "grad_norm": 1.2421875,
+      "learning_rate": 3.7327188940092166e-05,
+      "loss": 2.4566,
+      "step": 136
+    },
+    {
+      "epoch": 0.017365952592217012,
+      "grad_norm": 1.234375,
+      "learning_rate": 3.7603686635944695e-05,
+      "loss": 2.1071,
+      "step": 137
+    },
+    {
+      "epoch": 0.01749271137026239,
+      "grad_norm": 1.140625,
+      "learning_rate": 3.788018433179724e-05,
+      "loss": 2.3634,
+      "step": 138
+    },
+    {
+      "epoch": 0.01761947014830777,
+      "grad_norm": 1.2109375,
+      "learning_rate": 3.815668202764977e-05,
+      "loss": 1.7798,
+      "step": 139
+    },
+    {
+      "epoch": 0.01774622892635315,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.8433179723502305e-05,
+      "loss": 2.5508,
+      "step": 140
+    },
+    {
+      "epoch": 0.01787298770439853,
+      "grad_norm": 1.1328125,
+      "learning_rate": 3.870967741935484e-05,
+      "loss": 2.1198,
+      "step": 141
+    },
+    {
+      "epoch": 0.017999746482443908,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.898617511520737e-05,
+      "loss": 2.2581,
+      "step": 142
+    },
+    {
+      "epoch": 0.018126505260489287,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.9262672811059915e-05,
+      "loss": 2.1564,
+      "step": 143
+    },
+    {
+      "epoch": 0.01825326403853467,
+      "grad_norm": 1.125,
+      "learning_rate": 3.9539170506912445e-05,
+      "loss": 2.2281,
+      "step": 144
+    },
+    {
+      "epoch": 0.01838002281658005,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.981566820276498e-05,
+      "loss": 2.4633,
+      "step": 145
+    },
+    {
+      "epoch": 0.01850678159462543,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.009216589861751e-05,
+      "loss": 2.0364,
+      "step": 146
+    },
+    {
+      "epoch": 0.018633540372670808,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.036866359447005e-05,
+      "loss": 1.9947,
+      "step": 147
+    },
+    {
+      "epoch": 0.018760299150716187,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.064516129032258e-05,
+      "loss": 2.0986,
+      "step": 148
+    },
+    {
+      "epoch": 0.018887057928761566,
+      "grad_norm": 1.2421875,
+      "learning_rate": 4.092165898617512e-05,
+      "loss": 2.432,
+      "step": 149
+    },
+    {
+      "epoch": 0.019013816706806946,
+      "grad_norm": 1.1640625,
+      "learning_rate": 4.119815668202765e-05,
+      "loss": 2.6983,
+      "step": 150
+    },
+    {
+      "epoch": 0.019140575484852325,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.147465437788019e-05,
+      "loss": 1.8463,
+      "step": 151
+    },
+    {
+      "epoch": 0.019267334262897704,
+      "grad_norm": 0.98046875,
+      "learning_rate": 4.175115207373272e-05,
+      "loss": 2.1697,
+      "step": 152
+    },
+    {
+      "epoch": 0.019394093040943087,
+      "grad_norm": 1.28125,
+      "learning_rate": 4.202764976958525e-05,
+      "loss": 2.4353,
+      "step": 153
+    },
+    {
+      "epoch": 0.019520851818988466,
+      "grad_norm": 1.234375,
+      "learning_rate": 4.230414746543778e-05,
+      "loss": 2.7946,
+      "step": 154
+    },
+    {
+      "epoch": 0.019647610597033845,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.2580645161290327e-05,
+      "loss": 1.9568,
+      "step": 155
+    },
+    {
+      "epoch": 0.019774369375079225,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 2.6783,
+      "step": 156
+    },
+    {
+      "epoch": 0.019901128153124604,
+      "grad_norm": 1.484375,
+      "learning_rate": 4.313364055299539e-05,
+      "loss": 2.4218,
+      "step": 157
+    },
+    {
+      "epoch": 0.020027886931169983,
+      "grad_norm": 1.1171875,
+      "learning_rate": 4.341013824884792e-05,
+      "loss": 2.4173,
+      "step": 158
+    },
+    {
+      "epoch": 0.020154645709215362,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.368663594470046e-05,
+      "loss": 2.422,
+      "step": 159
+    },
+    {
+      "epoch": 0.02028140448726074,
+      "grad_norm": 1.5859375,
+      "learning_rate": 4.3963133640553e-05,
+      "loss": 2.5844,
+      "step": 160
+    },
+    {
+      "epoch": 0.02040816326530612,
+      "grad_norm": 1.15625,
+      "learning_rate": 4.423963133640553e-05,
+      "loss": 2.7378,
+      "step": 161
+    },
+    {
+      "epoch": 0.020534922043351504,
+      "grad_norm": 0.9921875,
+      "learning_rate": 4.451612903225807e-05,
+      "loss": 2.1257,
+      "step": 162
+    },
+    {
+      "epoch": 0.020661680821396883,
+      "grad_norm": 1.203125,
+      "learning_rate": 4.47926267281106e-05,
+      "loss": 2.3838,
+      "step": 163
+    },
+    {
+      "epoch": 0.020788439599442262,
+      "grad_norm": 1.34375,
+      "learning_rate": 4.5069124423963135e-05,
+      "loss": 1.8546,
+      "step": 164
+    },
+    {
+      "epoch": 0.02091519837748764,
+      "grad_norm": 1.1953125,
+      "learning_rate": 4.5345622119815665e-05,
+      "loss": 2.0101,
+      "step": 165
+    },
+    {
+      "epoch": 0.02104195715553302,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.562211981566821e-05,
+      "loss": 2.4907,
+      "step": 166
+    },
+    {
+      "epoch": 0.0211687159335784,
+      "grad_norm": 1.25,
+      "learning_rate": 4.589861751152074e-05,
+      "loss": 2.028,
+      "step": 167
+    },
+    {
+      "epoch": 0.02129547471162378,
+      "grad_norm": 1.3515625,
+      "learning_rate": 4.6175115207373275e-05,
+      "loss": 2.1113,
+      "step": 168
+    },
+    {
+      "epoch": 0.021422233489669158,
+      "grad_norm": 1.4921875,
+      "learning_rate": 4.6451612903225805e-05,
+      "loss": 1.9932,
+      "step": 169
+    },
+    {
+      "epoch": 0.02154899226771454,
+      "grad_norm": 1.203125,
+      "learning_rate": 4.672811059907834e-05,
+      "loss": 2.5246,
+      "step": 170
+    },
+    {
+      "epoch": 0.02167575104575992,
+      "grad_norm": 1.2421875,
+      "learning_rate": 4.700460829493088e-05,
+      "loss": 2.5694,
+      "step": 171
+    },
+    {
+      "epoch": 0.0218025098238053,
+      "grad_norm": 1.15625,
+      "learning_rate": 4.7281105990783414e-05,
+      "loss": 1.9679,
+      "step": 172
+    },
+    {
+      "epoch": 0.02192926860185068,
+      "grad_norm": 1.1875,
+      "learning_rate": 4.7557603686635944e-05,
+      "loss": 2.281,
+      "step": 173
+    },
+    {
+      "epoch": 0.022056027379896058,
+      "grad_norm": 1.28125,
+      "learning_rate": 4.783410138248848e-05,
+      "loss": 2.2681,
+      "step": 174
+    },
+    {
+      "epoch": 0.022182786157941437,
+      "grad_norm": 1.1875,
+      "learning_rate": 4.811059907834101e-05,
+      "loss": 2.0401,
+      "step": 175
+    },
+    {
+      "epoch": 0.022309544935986816,
+      "grad_norm": 1.5859375,
+      "learning_rate": 4.838709677419355e-05,
+      "loss": 2.3099,
+      "step": 176
+    },
+    {
+      "epoch": 0.022436303714032196,
+      "grad_norm": 1.125,
+      "learning_rate": 4.8663594470046084e-05,
+      "loss": 3.075,
+      "step": 177
+    },
+    {
+      "epoch": 0.022563062492077575,
+      "grad_norm": 1.3203125,
+      "learning_rate": 4.894009216589862e-05,
+      "loss": 2.2041,
+      "step": 178
+    },
+    {
+      "epoch": 0.022689821270122958,
+      "grad_norm": 1.265625,
+      "learning_rate": 4.921658986175116e-05,
+      "loss": 2.7096,
+      "step": 179
+    },
+    {
+      "epoch": 0.022816580048168337,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.9493087557603686e-05,
+      "loss": 2.4346,
+      "step": 180
+    },
+    {
+      "epoch": 0.022943338826213716,
+      "grad_norm": 1.1171875,
+      "learning_rate": 4.976958525345622e-05,
+      "loss": 2.1963,
+      "step": 181
+    },
+    {
+      "epoch": 0.023070097604259095,
+      "grad_norm": 1.4453125,
+      "learning_rate": 5.004608294930876e-05,
+      "loss": 2.5156,
+      "step": 182
+    },
+    {
+      "epoch": 0.023196856382304475,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.0322580645161296e-05,
+      "loss": 2.0192,
+      "step": 183
+    },
+    {
+      "epoch": 0.023323615160349854,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.0599078341013826e-05,
+      "loss": 2.6039,
+      "step": 184
+    },
+    {
+      "epoch": 0.023450373938395233,
+      "grad_norm": 1.1171875,
+      "learning_rate": 5.087557603686636e-05,
+      "loss": 1.9497,
+      "step": 185
+    },
+    {
+      "epoch": 0.023577132716440612,
+      "grad_norm": 1.1953125,
+      "learning_rate": 5.115207373271889e-05,
+      "loss": 2.2239,
+      "step": 186
+    },
+    {
+      "epoch": 0.02370389149448599,
+      "grad_norm": 1.0,
+      "learning_rate": 5.142857142857143e-05,
+      "loss": 2.3346,
+      "step": 187
+    },
+    {
+      "epoch": 0.023830650272531374,
+      "grad_norm": 1.1015625,
+      "learning_rate": 5.1705069124423965e-05,
+      "loss": 2.5889,
+      "step": 188
+    },
+    {
+      "epoch": 0.023957409050576754,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.19815668202765e-05,
+      "loss": 2.2847,
+      "step": 189
+    },
+    {
+      "epoch": 0.024084167828622133,
+      "grad_norm": 0.9375,
+      "learning_rate": 5.225806451612903e-05,
+      "loss": 2.385,
+      "step": 190
+    },
+    {
+      "epoch": 0.024210926606667512,
+      "grad_norm": 1.03125,
+      "learning_rate": 5.253456221198157e-05,
+      "loss": 2.0346,
+      "step": 191
+    },
+    {
+      "epoch": 0.02433768538471289,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.28110599078341e-05,
+      "loss": 2.5936,
+      "step": 192
+    },
+    {
+      "epoch": 0.02446444416275827,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.308755760368664e-05,
+      "loss": 2.4177,
+      "step": 193
+    },
+    {
+      "epoch": 0.02459120294080365,
+      "grad_norm": 0.96484375,
+      "learning_rate": 5.336405529953917e-05,
+      "loss": 2.28,
+      "step": 194
+    },
+    {
+      "epoch": 0.02471796171884903,
+      "grad_norm": 1.2109375,
+      "learning_rate": 5.364055299539171e-05,
+      "loss": 2.0162,
+      "step": 195
+    },
+    {
+      "epoch": 0.024844720496894408,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.391705069124424e-05,
+      "loss": 2.3761,
+      "step": 196
+    },
+    {
+      "epoch": 0.02497147927493979,
+      "grad_norm": 1.125,
+      "learning_rate": 5.4193548387096774e-05,
+      "loss": 2.2806,
+      "step": 197
+    },
+    {
+      "epoch": 0.02509823805298517,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.4470046082949304e-05,
+      "loss": 2.5046,
+      "step": 198
+    },
+    {
+      "epoch": 0.02522499683103055,
+      "grad_norm": 1.234375,
+      "learning_rate": 5.474654377880185e-05,
+      "loss": 2.7865,
+      "step": 199
+    },
+    {
+      "epoch": 0.02535175560907593,
+      "grad_norm": 1.078125,
+      "learning_rate": 5.5023041474654384e-05,
+      "loss": 2.6493,
+      "step": 200
+    },
+    {
+      "epoch": 0.025478514387121308,
+      "grad_norm": 1.203125,
+      "learning_rate": 5.5299539170506914e-05,
+      "loss": 2.0132,
+      "step": 201
+    },
+    {
+      "epoch": 0.025605273165166687,
+      "grad_norm": 1.203125,
+      "learning_rate": 5.557603686635945e-05,
+      "loss": 2.4885,
+      "step": 202
+    },
+    {
+      "epoch": 0.025732031943212066,
+      "grad_norm": 1.171875,
+      "learning_rate": 5.585253456221198e-05,
+      "loss": 2.2706,
+      "step": 203
+    },
+    {
+      "epoch": 0.025858790721257446,
+      "grad_norm": 1.203125,
+      "learning_rate": 5.612903225806452e-05,
+      "loss": 2.5947,
+      "step": 204
+    },
+    {
+      "epoch": 0.02598554949930283,
+      "grad_norm": 1.140625,
+      "learning_rate": 5.640552995391705e-05,
+      "loss": 2.4087,
+      "step": 205
+    },
+    {
+      "epoch": 0.026112308277348208,
+      "grad_norm": 1.0546875,
+      "learning_rate": 5.668202764976959e-05,
+      "loss": 2.2352,
+      "step": 206
+    },
+    {
+      "epoch": 0.026239067055393587,
+      "grad_norm": 1.2578125,
+      "learning_rate": 5.695852534562212e-05,
+      "loss": 2.4017,
+      "step": 207
+    },
+    {
+      "epoch": 0.026365825833438966,
+      "grad_norm": 1.265625,
+      "learning_rate": 5.7235023041474656e-05,
+      "loss": 2.1867,
+      "step": 208
+    },
+    {
+      "epoch": 0.026492584611484345,
+      "grad_norm": 1.0703125,
+      "learning_rate": 5.7511520737327186e-05,
+      "loss": 2.9177,
+      "step": 209
+    },
+    {
+      "epoch": 0.026619343389529725,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.778801843317973e-05,
+      "loss": 2.3404,
+      "step": 210
+    },
+    {
+      "epoch": 0.026746102167575104,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.806451612903226e-05,
+      "loss": 2.7923,
+      "step": 211
+    },
+    {
+      "epoch": 0.026872860945620483,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.8341013824884796e-05,
+      "loss": 2.4418,
+      "step": 212
+    },
+    {
+      "epoch": 0.026999619723665862,
+      "grad_norm": 1.1171875,
+      "learning_rate": 5.8617511520737325e-05,
+      "loss": 2.441,
+      "step": 213
+    },
+    {
+      "epoch": 0.027126378501711245,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.889400921658986e-05,
+      "loss": 2.1844,
+      "step": 214
+    },
+    {
+      "epoch": 0.027253137279756624,
+      "grad_norm": 1.34375,
+      "learning_rate": 5.917050691244239e-05,
+      "loss": 2.1207,
+      "step": 215
+    },
+    {
+      "epoch": 0.027379896057802004,
+      "grad_norm": 0.96875,
+      "learning_rate": 5.9447004608294935e-05,
+      "loss": 2.3711,
+      "step": 216
+    },
+    {
+      "epoch": 0.027506654835847383,
+      "grad_norm": 1.03125,
+      "learning_rate": 5.9723502304147465e-05,
+      "loss": 2.2324,
+      "step": 217
+    },
+    {
+      "epoch": 0.027633413613892762,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6e-05,
+      "loss": 2.9252,
+      "step": 218
+    },
+    {
+      "epoch": 0.02776017239193814,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.0276497695852545e-05,
+      "loss": 2.1201,
+      "step": 219
+    },
+    {
+      "epoch": 0.02788693116998352,
+      "grad_norm": 1.0234375,
+      "learning_rate": 6.055299539170507e-05,
+      "loss": 2.0246,
+      "step": 220
+    },
+    {
+      "epoch": 0.0280136899480289,
+      "grad_norm": 0.96875,
+      "learning_rate": 6.082949308755761e-05,
+      "loss": 2.0742,
+      "step": 221
+    },
+    {
+      "epoch": 0.02814044872607428,
+      "grad_norm": 0.9296875,
+      "learning_rate": 6.110599078341014e-05,
+      "loss": 2.2892,
+      "step": 222
+    },
+    {
+      "epoch": 0.028267207504119662,
+      "grad_norm": 0.9453125,
+      "learning_rate": 6.138248847926267e-05,
+      "loss": 1.9504,
+      "step": 223
+    },
+    {
+      "epoch": 0.02839396628216504,
+      "grad_norm": 0.8984375,
+      "learning_rate": 6.165898617511521e-05,
+      "loss": 2.3808,
+      "step": 224
+    },
+    {
+      "epoch": 0.02852072506021042,
+      "grad_norm": 1.2109375,
+      "learning_rate": 6.193548387096774e-05,
+      "loss": 2.1929,
+      "step": 225
+    },
+    {
+      "epoch": 0.0286474838382558,
+      "grad_norm": 1.1484375,
+      "learning_rate": 6.221198156682029e-05,
+      "loss": 1.7926,
+      "step": 226
+    },
+    {
+      "epoch": 0.02877424261630118,
+      "grad_norm": 0.9453125,
+      "learning_rate": 6.24884792626728e-05,
+      "loss": 1.8952,
+      "step": 227
+    },
+    {
+      "epoch": 0.028901001394346558,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.276497695852535e-05,
+      "loss": 2.6975,
+      "step": 228
+    },
+    {
+      "epoch": 0.029027760172391937,
+      "grad_norm": 1.265625,
+      "learning_rate": 6.304147465437788e-05,
+      "loss": 2.7235,
+      "step": 229
+    },
+    {
+      "epoch": 0.029154518950437316,
+      "grad_norm": 1.2265625,
+      "learning_rate": 6.331797235023042e-05,
+      "loss": 2.0742,
+      "step": 230
+    },
+    {
+      "epoch": 0.029281277728482696,
+      "grad_norm": 1.0625,
+      "learning_rate": 6.359447004608295e-05,
+      "loss": 2.2087,
+      "step": 231
+    },
+    {
+      "epoch": 0.02940803650652808,
+      "grad_norm": 1.359375,
+      "learning_rate": 6.387096774193548e-05,
+      "loss": 2.2738,
+      "step": 232
+    },
+    {
+      "epoch": 0.029534795284573458,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.414746543778802e-05,
+      "loss": 2.1704,
+      "step": 233
+    },
+    {
+      "epoch": 0.029661554062618837,
+      "grad_norm": 0.98046875,
+      "learning_rate": 6.442396313364055e-05,
+      "loss": 2.0741,
+      "step": 234
+    },
+    {
+      "epoch": 0.029788312840664216,
+      "grad_norm": 1.03125,
+      "learning_rate": 6.470046082949308e-05,
+      "loss": 2.2153,
+      "step": 235
+    },
+    {
+      "epoch": 0.029915071618709595,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.497695852534563e-05,
+      "loss": 2.4314,
+      "step": 236
+    },
+    {
+      "epoch": 0.030041830396754975,
+      "grad_norm": 1.171875,
+      "learning_rate": 6.525345622119816e-05,
+      "loss": 2.597,
+      "step": 237
+    },
+    {
+      "epoch": 0.030168589174800354,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.55299539170507e-05,
+      "loss": 2.2135,
+      "step": 238
+    },
+    {
+      "epoch": 0.030295347952845733,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.580645161290323e-05,
+      "loss": 2.0169,
+      "step": 239
+    },
+    {
+      "epoch": 0.030422106730891116,
+      "grad_norm": 1.265625,
+      "learning_rate": 6.608294930875576e-05,
+      "loss": 1.8607,
+      "step": 240
+    },
+    {
+      "epoch": 0.030548865508936495,
+      "grad_norm": 1.296875,
+      "learning_rate": 6.63594470046083e-05,
+      "loss": 2.0917,
+      "step": 241
+    },
+    {
+      "epoch": 0.030675624286981874,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.663594470046083e-05,
+      "loss": 2.047,
+      "step": 242
+    },
+    {
+      "epoch": 0.030802383065027254,
+      "grad_norm": 1.625,
+      "learning_rate": 6.691244239631338e-05,
+      "loss": 2.3032,
+      "step": 243
+    },
+    {
+      "epoch": 0.030929141843072633,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.718894009216589e-05,
+      "loss": 2.1386,
+      "step": 244
+    },
+    {
+      "epoch": 0.031055900621118012,
+      "grad_norm": 1.28125,
+      "learning_rate": 6.746543778801843e-05,
+      "loss": 1.9799,
+      "step": 245
+    },
+    {
+      "epoch": 0.03118265939916339,
+      "grad_norm": 1.015625,
+      "learning_rate": 6.774193548387098e-05,
+      "loss": 2.4034,
+      "step": 246
+    },
+    {
+      "epoch": 0.031309418177208774,
+      "grad_norm": 1.125,
+      "learning_rate": 6.801843317972351e-05,
+      "loss": 2.103,
+      "step": 247
+    },
+    {
+      "epoch": 0.03143617695525415,
+      "grad_norm": 0.93359375,
+      "learning_rate": 6.829493087557604e-05,
+      "loss": 2.2768,
+      "step": 248
+    },
+    {
+      "epoch": 0.03156293573329953,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.857142857142857e-05,
+      "loss": 2.4442,
+      "step": 249
+    },
+    {
+      "epoch": 0.03168969451134491,
+      "grad_norm": 1.125,
+      "learning_rate": 6.884792626728111e-05,
+      "loss": 2.193,
+      "step": 250
+    },
+    {
+      "epoch": 0.03181645328939029,
+      "grad_norm": 0.9609375,
+      "learning_rate": 6.912442396313364e-05,
+      "loss": 2.1956,
+      "step": 251
+    },
+    {
+      "epoch": 0.03194321206743567,
+      "grad_norm": 1.1640625,
+      "learning_rate": 6.940092165898617e-05,
+      "loss": 2.3793,
+      "step": 252
+    },
+    {
+      "epoch": 0.03206997084548105,
+      "grad_norm": 1.25,
+      "learning_rate": 6.967741935483871e-05,
+      "loss": 2.2532,
+      "step": 253
+    },
+    {
+      "epoch": 0.03219672962352643,
+      "grad_norm": 1.3046875,
+      "learning_rate": 6.995391705069124e-05,
+      "loss": 2.3969,
+      "step": 254
+    },
+    {
+      "epoch": 0.03232348840157181,
+      "grad_norm": 1.25,
+      "learning_rate": 7.023041474654379e-05,
+      "loss": 2.408,
+      "step": 255
+    },
+    {
+      "epoch": 0.03245024717961719,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.05069124423963e-05,
+      "loss": 2.004,
+      "step": 256
+    },
+    {
+      "epoch": 0.032577005957662566,
+      "grad_norm": 1.1875,
+      "learning_rate": 7.078341013824885e-05,
+      "loss": 2.2098,
+      "step": 257
+    },
+    {
+      "epoch": 0.03270376473570795,
+      "grad_norm": 1.015625,
+      "learning_rate": 7.105990783410139e-05,
+      "loss": 2.2987,
+      "step": 258
+    },
+    {
+      "epoch": 0.032830523513753325,
+      "grad_norm": 1.2109375,
+      "learning_rate": 7.133640552995392e-05,
+      "loss": 2.3733,
+      "step": 259
+    },
+    {
+      "epoch": 0.03295728229179871,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.161290322580646e-05,
+      "loss": 2.2915,
+      "step": 260
+    },
+    {
+      "epoch": 0.03308404106984408,
+      "grad_norm": 1.3125,
+      "learning_rate": 7.188940092165898e-05,
+      "loss": 2.0945,
+      "step": 261
+    },
+    {
+      "epoch": 0.033210799847889466,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.216589861751152e-05,
+      "loss": 2.2548,
+      "step": 262
+    },
+    {
+      "epoch": 0.03333755862593485,
+      "grad_norm": 0.96875,
+      "learning_rate": 7.244239631336407e-05,
+      "loss": 2.9858,
+      "step": 263
+    },
+    {
+      "epoch": 0.033464317403980225,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.27188940092166e-05,
+      "loss": 2.3404,
+      "step": 264
+    },
+    {
+      "epoch": 0.03359107618202561,
+      "grad_norm": 1.796875,
+      "learning_rate": 7.299539170506913e-05,
+      "loss": 1.5766,
+      "step": 265
+    },
+    {
+      "epoch": 0.03371783496007098,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.327188940092166e-05,
+      "loss": 1.6851,
+      "step": 266
+    },
+    {
+      "epoch": 0.033844593738116366,
+      "grad_norm": 1.390625,
+      "learning_rate": 7.35483870967742e-05,
+      "loss": 2.1749,
+      "step": 267
+    },
+    {
+      "epoch": 0.03397135251616174,
+      "grad_norm": 1.078125,
+      "learning_rate": 7.382488479262673e-05,
+      "loss": 1.9289,
+      "step": 268
+    },
+    {
+      "epoch": 0.034098111294207124,
+      "grad_norm": 1.234375,
+      "learning_rate": 7.410138248847926e-05,
+      "loss": 2.2994,
+      "step": 269
+    },
+    {
+      "epoch": 0.0342248700722525,
+      "grad_norm": 0.98046875,
+      "learning_rate": 7.43778801843318e-05,
+      "loss": 2.4787,
+      "step": 270
+    },
+    {
+      "epoch": 0.03435162885029788,
+      "grad_norm": 0.98828125,
+      "learning_rate": 7.465437788018433e-05,
+      "loss": 2.7086,
+      "step": 271
+    },
+    {
+      "epoch": 0.034478387628343266,
+      "grad_norm": 1.25,
+      "learning_rate": 7.493087557603687e-05,
+      "loss": 2.8374,
+      "step": 272
+    },
+    {
+      "epoch": 0.03460514640638864,
+      "grad_norm": 1.265625,
+      "learning_rate": 7.520737327188939e-05,
+      "loss": 2.8127,
+      "step": 273
+    },
+    {
+      "epoch": 0.034731905184434024,
+      "grad_norm": 1.0,
+      "learning_rate": 7.548387096774193e-05,
+      "loss": 2.3578,
+      "step": 274
+    },
+    {
+      "epoch": 0.0348586639624794,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.576036866359448e-05,
+      "loss": 2.3934,
+      "step": 275
+    },
+    {
+      "epoch": 0.03498542274052478,
+      "grad_norm": 0.96875,
+      "learning_rate": 7.603686635944701e-05,
+      "loss": 2.0839,
+      "step": 276
+    },
+    {
+      "epoch": 0.03511218151857016,
+      "grad_norm": 1.0625,
+      "learning_rate": 7.631336405529954e-05,
+      "loss": 2.184,
+      "step": 277
+    },
+    {
+      "epoch": 0.03523894029661554,
+      "grad_norm": 1.0,
+      "learning_rate": 7.658986175115207e-05,
+      "loss": 2.2954,
+      "step": 278
+    },
+    {
+      "epoch": 0.035365699074660924,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.686635944700461e-05,
+      "loss": 2.5797,
+      "step": 279
+    },
+    {
+      "epoch": 0.0354924578527063,
+      "grad_norm": 1.1015625,
+      "learning_rate": 7.714285714285715e-05,
+      "loss": 2.3425,
+      "step": 280
+    },
+    {
+      "epoch": 0.03561921663075168,
+      "grad_norm": 1.078125,
+      "learning_rate": 7.741935483870968e-05,
+      "loss": 1.8804,
+      "step": 281
+    },
+    {
+      "epoch": 0.03574597540879706,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.769585253456221e-05,
+      "loss": 2.3378,
+      "step": 282
+    },
+    {
+      "epoch": 0.03587273418684244,
+      "grad_norm": 1.0625,
+      "learning_rate": 7.797235023041474e-05,
+      "loss": 2.2018,
+      "step": 283
+    },
+    {
+      "epoch": 0.035999492964887816,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.824884792626729e-05,
+      "loss": 2.1572,
+      "step": 284
+    },
+    {
+      "epoch": 0.0361262517429332,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.852534562211983e-05,
+      "loss": 2.2567,
+      "step": 285
+    },
+    {
+      "epoch": 0.036253010520978575,
+      "grad_norm": 0.96875,
+      "learning_rate": 7.880184331797235e-05,
+      "loss": 2.1081,
+      "step": 286
+    },
+    {
+      "epoch": 0.03637976929902396,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.907834101382489e-05,
+      "loss": 2.7091,
+      "step": 287
+    },
+    {
+      "epoch": 0.03650652807706934,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.935483870967742e-05,
+      "loss": 2.6408,
+      "step": 288
+    },
+    {
+      "epoch": 0.036633286855114716,
+      "grad_norm": 0.96484375,
+      "learning_rate": 7.963133640552996e-05,
+      "loss": 2.6255,
+      "step": 289
+    },
+    {
+      "epoch": 0.0367600456331601,
+      "grad_norm": 0.95703125,
+      "learning_rate": 7.990783410138248e-05,
+      "loss": 2.3187,
+      "step": 290
+    },
+    {
+      "epoch": 0.036886804411205475,
+      "grad_norm": 1.09375,
+      "learning_rate": 8.018433179723502e-05,
+      "loss": 2.16,
+      "step": 291
+    },
+    {
+      "epoch": 0.03701356318925086,
+      "grad_norm": 1.0546875,
+      "learning_rate": 8.046082949308757e-05,
+      "loss": 2.4117,
+      "step": 292
+    },
+    {
+      "epoch": 0.03714032196729623,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.07373271889401e-05,
+      "loss": 1.9142,
+      "step": 293
+    },
+    {
+      "epoch": 0.037267080745341616,
+      "grad_norm": 1.1796875,
+      "learning_rate": 8.101382488479262e-05,
+      "loss": 1.9936,
+      "step": 294
+    },
+    {
+      "epoch": 0.03739383952338699,
+      "grad_norm": 1.0390625,
+      "learning_rate": 8.129032258064515e-05,
+      "loss": 2.1341,
+      "step": 295
+    },
+    {
+      "epoch": 0.037520598301432374,
+      "grad_norm": 0.95703125,
+      "learning_rate": 8.15668202764977e-05,
+      "loss": 2.312,
+      "step": 296
+    },
+    {
+      "epoch": 0.03764735707947776,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.184331797235024e-05,
+      "loss": 2.1323,
+      "step": 297
+    },
+    {
+      "epoch": 0.03777411585752313,
+      "grad_norm": 0.9921875,
+      "learning_rate": 8.211981566820277e-05,
+      "loss": 2.478,
+      "step": 298
+    },
+    {
+      "epoch": 0.037900874635568516,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.23963133640553e-05,
+      "loss": 2.0635,
+      "step": 299
+    },
+    {
+      "epoch": 0.03802763341361389,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.267281105990783e-05,
+      "loss": 2.217,
+      "step": 300
+    },
+    {
+      "epoch": 0.038154392191659274,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.294930875576037e-05,
+      "loss": 2.4008,
+      "step": 301
+    },
+    {
+      "epoch": 0.03828115096970465,
+      "grad_norm": 0.9375,
+      "learning_rate": 8.322580645161292e-05,
+      "loss": 2.3014,
+      "step": 302
+    },
+    {
+      "epoch": 0.03840790974775003,
+      "grad_norm": 0.9765625,
+      "learning_rate": 8.350230414746543e-05,
+      "loss": 2.3554,
+      "step": 303
+    },
+    {
+      "epoch": 0.03853466852579541,
+      "grad_norm": 1.109375,
+      "learning_rate": 8.377880184331798e-05,
+      "loss": 2.1672,
+      "step": 304
+    },
+    {
+      "epoch": 0.03866142730384079,
+      "grad_norm": 1.03125,
+      "learning_rate": 8.40552995391705e-05,
+      "loss": 1.9915,
+      "step": 305
+    },
+    {
+      "epoch": 0.038788186081886174,
+      "grad_norm": 1.0078125,
+      "learning_rate": 8.433179723502305e-05,
+      "loss": 2.3689,
+      "step": 306
+    },
+    {
+      "epoch": 0.03891494485993155,
+      "grad_norm": 1.0703125,
+      "learning_rate": 8.460829493087557e-05,
+      "loss": 2.736,
+      "step": 307
+    },
+    {
+      "epoch": 0.03904170363797693,
+      "grad_norm": 1.0078125,
+      "learning_rate": 8.488479262672811e-05,
+      "loss": 2.3187,
+      "step": 308
+    },
+    {
+      "epoch": 0.03916846241602231,
+      "grad_norm": 1.515625,
+      "learning_rate": 8.516129032258065e-05,
+      "loss": 1.9598,
+      "step": 309
+    },
+    {
+      "epoch": 0.03929522119406769,
+      "grad_norm": 0.90625,
+      "learning_rate": 8.543778801843318e-05,
+      "loss": 2.3678,
+      "step": 310
+    },
+    {
+      "epoch": 0.039421979972113066,
+      "grad_norm": 1.265625,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.8228,
+      "step": 311
+    },
+    {
+      "epoch": 0.03954873875015845,
+      "grad_norm": 1.03125,
+      "learning_rate": 8.599078341013824e-05,
+      "loss": 2.2008,
+      "step": 312
+    },
+    {
+      "epoch": 0.039675497528203825,
+      "grad_norm": 1.03125,
+      "learning_rate": 8.626728110599079e-05,
+      "loss": 2.1278,
+      "step": 313
+    },
+    {
+      "epoch": 0.03980225630624921,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.654377880184333e-05,
+      "loss": 1.8647,
+      "step": 314
+    },
+    {
+      "epoch": 0.03992901508429459,
+      "grad_norm": 0.91796875,
+      "learning_rate": 8.682027649769585e-05,
+      "loss": 2.0758,
+      "step": 315
+    },
+    {
+      "epoch": 0.040055773862339966,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.709677419354839e-05,
+      "loss": 2.0726,
+      "step": 316
+    },
+    {
+      "epoch": 0.04018253264038535,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.737327188940092e-05,
+      "loss": 2.6807,
+      "step": 317
+    },
+    {
+      "epoch": 0.040309291418430725,
+      "grad_norm": 0.98828125,
+      "learning_rate": 8.764976958525346e-05,
+      "loss": 2.0432,
+      "step": 318
+    },
+    {
+      "epoch": 0.04043605019647611,
+      "grad_norm": 0.94921875,
+      "learning_rate": 8.7926267281106e-05,
+      "loss": 2.4002,
+      "step": 319
+    },
+    {
+      "epoch": 0.04056280897452148,
+      "grad_norm": 1.1171875,
+      "learning_rate": 8.820276497695852e-05,
+      "loss": 2.0059,
+      "step": 320
+    },
+    {
+      "epoch": 0.040689567752566866,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.847926267281106e-05,
+      "loss": 2.0576,
+      "step": 321
+    },
+    {
+      "epoch": 0.04081632653061224,
+      "grad_norm": 1.2265625,
+      "learning_rate": 8.87557603686636e-05,
+      "loss": 2.389,
+      "step": 322
+    },
+    {
+      "epoch": 0.040943085308657624,
+      "grad_norm": 1.0625,
+      "learning_rate": 8.903225806451614e-05,
+      "loss": 2.0016,
+      "step": 323
+    },
+    {
+      "epoch": 0.04106984408670301,
+      "grad_norm": 1.015625,
+      "learning_rate": 8.930875576036867e-05,
+      "loss": 1.918,
+      "step": 324
+    },
+    {
+      "epoch": 0.04119660286474838,
+      "grad_norm": 1.40625,
+      "learning_rate": 8.95852534562212e-05,
+      "loss": 1.5321,
+      "step": 325
+    },
+    {
+      "epoch": 0.041323361642793766,
+      "grad_norm": 0.93359375,
+      "learning_rate": 8.986175115207374e-05,
+      "loss": 1.9709,
+      "step": 326
+    },
+    {
+      "epoch": 0.04145012042083914,
+      "grad_norm": 1.0625,
+      "learning_rate": 9.013824884792627e-05,
+      "loss": 1.9838,
+      "step": 327
+    },
+    {
+      "epoch": 0.041576879198884524,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.04147465437788e-05,
+      "loss": 2.0751,
+      "step": 328
+    },
+    {
+      "epoch": 0.0417036379769299,
+      "grad_norm": 1.25,
+      "learning_rate": 9.069124423963133e-05,
+      "loss": 2.2254,
+      "step": 329
+    },
+    {
+      "epoch": 0.04183039675497528,
+      "grad_norm": 0.9453125,
+      "learning_rate": 9.096774193548387e-05,
+      "loss": 1.9971,
+      "step": 330
+    },
+    {
+      "epoch": 0.04195715553302066,
+      "grad_norm": 1.828125,
+      "learning_rate": 9.124423963133642e-05,
+      "loss": 2.3412,
+      "step": 331
+    },
+    {
+      "epoch": 0.04208391431106604,
+      "grad_norm": 0.9140625,
+      "learning_rate": 9.152073732718893e-05,
+      "loss": 2.3622,
+      "step": 332
+    },
+    {
+      "epoch": 0.042210673089111424,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.179723502304148e-05,
+      "loss": 2.0821,
+      "step": 333
+    },
+    {
+      "epoch": 0.0423374318671568,
+      "grad_norm": 0.87890625,
+      "learning_rate": 9.2073732718894e-05,
+      "loss": 2.4345,
+      "step": 334
+    },
+    {
+      "epoch": 0.04246419064520218,
+      "grad_norm": 0.9609375,
+      "learning_rate": 9.235023041474655e-05,
+      "loss": 2.0549,
+      "step": 335
+    },
+    {
+      "epoch": 0.04259094942324756,
+      "grad_norm": 1.0390625,
+      "learning_rate": 9.262672811059908e-05,
+      "loss": 2.4581,
+      "step": 336
+    },
+    {
+      "epoch": 0.04271770820129294,
+      "grad_norm": 1.0859375,
+      "learning_rate": 9.290322580645161e-05,
+      "loss": 2.3993,
+      "step": 337
+    },
+    {
+      "epoch": 0.042844466979338316,
+      "grad_norm": 1.2890625,
+      "learning_rate": 9.317972350230415e-05,
+      "loss": 2.5258,
+      "step": 338
+    },
+    {
+      "epoch": 0.0429712257573837,
+      "grad_norm": 0.98828125,
+      "learning_rate": 9.345622119815668e-05,
+      "loss": 2.0652,
+      "step": 339
+    },
+    {
+      "epoch": 0.04309798453542908,
+      "grad_norm": 1.0234375,
+      "learning_rate": 9.373271889400923e-05,
+      "loss": 2.0846,
+      "step": 340
+    },
+    {
+      "epoch": 0.04322474331347446,
+      "grad_norm": 0.9375,
+      "learning_rate": 9.400921658986176e-05,
+      "loss": 2.193,
+      "step": 341
+    },
+    {
+      "epoch": 0.04335150209151984,
+      "grad_norm": 0.9765625,
+      "learning_rate": 9.428571428571429e-05,
+      "loss": 2.1681,
+      "step": 342
+    },
+    {
+      "epoch": 0.043478260869565216,
+      "grad_norm": 1.046875,
+      "learning_rate": 9.456221198156683e-05,
+      "loss": 2.4051,
+      "step": 343
+    },
+    {
+      "epoch": 0.0436050196476106,
+      "grad_norm": 1.03125,
+      "learning_rate": 9.483870967741936e-05,
+      "loss": 2.7981,
+      "step": 344
+    },
+    {
+      "epoch": 0.043731778425655975,
+      "grad_norm": 1.171875,
+      "learning_rate": 9.511520737327189e-05,
+      "loss": 2.3485,
+      "step": 345
+    },
+    {
+      "epoch": 0.04385853720370136,
+      "grad_norm": 1.203125,
+      "learning_rate": 9.539170506912443e-05,
+      "loss": 2.6022,
+      "step": 346
+    },
+    {
+      "epoch": 0.04398529598174673,
+      "grad_norm": 1.0078125,
+      "learning_rate": 9.566820276497696e-05,
+      "loss": 2.3691,
+      "step": 347
+    },
+    {
+      "epoch": 0.044112054759792116,
+      "grad_norm": 0.98828125,
+      "learning_rate": 9.59447004608295e-05,
+      "loss": 2.1588,
+      "step": 348
+    },
+    {
+      "epoch": 0.0442388135378375,
+      "grad_norm": 1.125,
+      "learning_rate": 9.622119815668202e-05,
+      "loss": 2.2599,
+      "step": 349
+    },
+    {
+      "epoch": 0.044365572315882874,
+      "grad_norm": 0.98046875,
+      "learning_rate": 9.649769585253456e-05,
+      "loss": 2.129,
+      "step": 350
+    },
+    {
+      "epoch": 0.04449233109392826,
+      "grad_norm": 1.140625,
+      "learning_rate": 9.67741935483871e-05,
+      "loss": 2.103,
+      "step": 351
+    },
+    {
+      "epoch": 0.04461908987197363,
+      "grad_norm": 0.9921875,
+      "learning_rate": 9.705069124423964e-05,
+      "loss": 2.165,
+      "step": 352
+    },
+    {
+      "epoch": 0.044745848650019016,
+      "grad_norm": 0.96484375,
+      "learning_rate": 9.732718894009217e-05,
+      "loss": 2.0007,
+      "step": 353
+    },
+    {
+      "epoch": 0.04487260742806439,
+      "grad_norm": 0.9296875,
+      "learning_rate": 9.76036866359447e-05,
+      "loss": 1.9785,
+      "step": 354
+    },
+    {
+      "epoch": 0.044999366206109774,
+      "grad_norm": 1.2109375,
+      "learning_rate": 9.788018433179724e-05,
+      "loss": 2.2277,
+      "step": 355
+    },
+    {
+      "epoch": 0.04512612498415515,
+      "grad_norm": 1.0703125,
+      "learning_rate": 9.815668202764977e-05,
+      "loss": 2.4022,
+      "step": 356
+    },
+    {
+      "epoch": 0.04525288376220053,
+      "grad_norm": 0.9609375,
+      "learning_rate": 9.843317972350231e-05,
+      "loss": 2.4135,
+      "step": 357
+    },
+    {
+      "epoch": 0.045379642540245915,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.870967741935484e-05,
+      "loss": 2.6693,
+      "step": 358
+    },
+    {
+      "epoch": 0.04550640131829129,
+      "grad_norm": 1.3515625,
+      "learning_rate": 9.898617511520737e-05,
+      "loss": 2.6541,
+      "step": 359
+    },
+    {
+      "epoch": 0.045633160096336674,
+      "grad_norm": 1.2421875,
+      "learning_rate": 9.926267281105992e-05,
+      "loss": 2.2509,
+      "step": 360
+    },
+    {
+      "epoch": 0.04575991887438205,
+      "grad_norm": 1.171875,
+      "learning_rate": 9.953917050691245e-05,
+      "loss": 2.212,
+      "step": 361
+    },
+    {
+      "epoch": 0.04588667765242743,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.981566820276498e-05,
+      "loss": 2.0584,
+      "step": 362
+    },
+    {
+      "epoch": 0.04601343643047281,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00010009216589861752,
+      "loss": 2.0944,
+      "step": 363
+    },
+    {
+      "epoch": 0.04614019520851819,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00010036866359447005,
+      "loss": 1.799,
+      "step": 364
+    },
+    {
+      "epoch": 0.046266953986563567,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00010064516129032259,
+      "loss": 2.5278,
+      "step": 365
+    },
+    {
+      "epoch": 0.04639371276460895,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00010092165898617511,
+      "loss": 2.1668,
+      "step": 366
+    },
+    {
+      "epoch": 0.04652047154265433,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00010119815668202765,
+      "loss": 1.8589,
+      "step": 367
+    },
+    {
+      "epoch": 0.04664723032069971,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00010147465437788018,
+      "loss": 2.1096,
+      "step": 368
+    },
+    {
+      "epoch": 0.04677398909874509,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00010175115207373273,
+      "loss": 2.3124,
+      "step": 369
+    },
+    {
+      "epoch": 0.046900747876790466,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00010202764976958525,
+      "loss": 2.3002,
+      "step": 370
+    },
+    {
+      "epoch": 0.04702750665483585,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00010230414746543778,
+      "loss": 2.5014,
+      "step": 371
+    },
+    {
+      "epoch": 0.047154265432881225,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010258064516129033,
+      "loss": 2.0533,
+      "step": 372
+    },
+    {
+      "epoch": 0.04728102421092661,
+      "grad_norm": 3.296875,
+      "learning_rate": 0.00010285714285714286,
+      "loss": 2.2184,
+      "step": 373
+    },
+    {
+      "epoch": 0.04740778298897198,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00010313364055299539,
+      "loss": 2.3654,
+      "step": 374
+    },
+    {
+      "epoch": 0.047534541767017366,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010341013824884793,
+      "loss": 2.5627,
+      "step": 375
+    },
+    {
+      "epoch": 0.04766130054506275,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00010368663594470046,
+      "loss": 2.3615,
+      "step": 376
+    },
+    {
+      "epoch": 0.047788059323108124,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000103963133640553,
+      "loss": 2.1589,
+      "step": 377
+    },
+    {
+      "epoch": 0.04791481810115351,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00010423963133640553,
+      "loss": 1.9195,
+      "step": 378
+    },
+    {
+      "epoch": 0.04804157687919888,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00010451612903225806,
+      "loss": 1.9344,
+      "step": 379
+    },
+    {
+      "epoch": 0.048168335657244266,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010479262672811061,
+      "loss": 1.9311,
+      "step": 380
+    },
+    {
+      "epoch": 0.04829509443528964,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00010506912442396314,
+      "loss": 2.2372,
+      "step": 381
+    },
+    {
+      "epoch": 0.048421853213335024,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00010534562211981568,
+      "loss": 1.9217,
+      "step": 382
+    },
+    {
+      "epoch": 0.0485486119913804,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001056221198156682,
+      "loss": 2.1078,
+      "step": 383
+    },
+    {
+      "epoch": 0.04867537076942578,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00010589861751152074,
+      "loss": 2.2819,
+      "step": 384
+    },
+    {
+      "epoch": 0.048802129547471165,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010617511520737328,
+      "loss": 2.2347,
+      "step": 385
+    },
+    {
+      "epoch": 0.04892888832551654,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00010645161290322581,
+      "loss": 2.1271,
+      "step": 386
+    },
+    {
+      "epoch": 0.049055647103561924,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010672811059907834,
+      "loss": 1.9699,
+      "step": 387
+    },
+    {
+      "epoch": 0.0491824058816073,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00010700460829493087,
+      "loss": 2.63,
+      "step": 388
+    },
+    {
+      "epoch": 0.04930916465965268,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00010728110599078342,
+      "loss": 2.3463,
+      "step": 389
+    },
+    {
+      "epoch": 0.04943592343769806,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010755760368663595,
+      "loss": 2.006,
+      "step": 390
+    },
+    {
+      "epoch": 0.04956268221574344,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00010783410138248848,
+      "loss": 1.9709,
+      "step": 391
+    },
+    {
+      "epoch": 0.049689440993788817,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00010811059907834102,
+      "loss": 2.47,
+      "step": 392
+    },
+    {
+      "epoch": 0.0498161997718342,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010838709677419355,
+      "loss": 2.589,
+      "step": 393
+    },
+    {
+      "epoch": 0.04994295854987958,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00010866359447004609,
+      "loss": 2.0223,
+      "step": 394
+    },
+    {
+      "epoch": 0.05006971732792496,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00010894009216589861,
+      "loss": 2.3123,
+      "step": 395
+    },
+    {
+      "epoch": 0.05019647610597034,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00010921658986175115,
+      "loss": 2.0089,
+      "step": 396
+    },
+    {
+      "epoch": 0.050323234884015716,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001094930875576037,
+      "loss": 2.6241,
+      "step": 397
+    },
+    {
+      "epoch": 0.0504499936620611,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010976958525345622,
+      "loss": 2.1172,
+      "step": 398
+    },
+    {
+      "epoch": 0.050576752440106475,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011004608294930877,
+      "loss": 2.4678,
+      "step": 399
+    },
+    {
+      "epoch": 0.05070351121815186,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011032258064516128,
+      "loss": 2.5936,
+      "step": 400
+    },
+    {
+      "epoch": 0.05083026999619724,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011059907834101383,
+      "loss": 1.7543,
+      "step": 401
+    },
+    {
+      "epoch": 0.050957028774242616,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00011087557603686637,
+      "loss": 2.1328,
+      "step": 402
+    },
+    {
+      "epoch": 0.051083787552288,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0001111520737327189,
+      "loss": 2.5187,
+      "step": 403
+    },
+    {
+      "epoch": 0.051210546330333374,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011142857142857143,
+      "loss": 2.5182,
+      "step": 404
+    },
+    {
+      "epoch": 0.05133730510837876,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011170506912442396,
+      "loss": 2.1758,
+      "step": 405
+    },
+    {
+      "epoch": 0.05146406388642413,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001119815668202765,
+      "loss": 2.5022,
+      "step": 406
+    },
+    {
+      "epoch": 0.051590822664469516,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011225806451612903,
+      "loss": 2.5626,
+      "step": 407
+    },
+    {
+      "epoch": 0.05171758144251489,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011253456221198156,
+      "loss": 2.8027,
+      "step": 408
+    },
+    {
+      "epoch": 0.051844340220560274,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001128110599078341,
+      "loss": 2.2778,
+      "step": 409
+    },
+    {
+      "epoch": 0.05197109899860566,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011308755760368664,
+      "loss": 1.9029,
+      "step": 410
+    },
+    {
+      "epoch": 0.05209785777665103,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011336405529953918,
+      "loss": 1.8463,
+      "step": 411
+    },
+    {
+      "epoch": 0.052224616554696415,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0001136405529953917,
+      "loss": 2.2628,
+      "step": 412
+    },
+    {
+      "epoch": 0.05235137533274179,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00011391705069124424,
+      "loss": 2.696,
+      "step": 413
+    },
+    {
+      "epoch": 0.052478134110787174,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011419354838709678,
+      "loss": 2.8536,
+      "step": 414
+    },
+    {
+      "epoch": 0.05260489288883255,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011447004608294931,
+      "loss": 2.5778,
+      "step": 415
+    },
+    {
+      "epoch": 0.05273165166687793,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011474654377880186,
+      "loss": 2.0624,
+      "step": 416
+    },
+    {
+      "epoch": 0.05285841044492331,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011502304147465437,
+      "loss": 1.6379,
+      "step": 417
+    },
+    {
+      "epoch": 0.05298516922296869,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011529953917050692,
+      "loss": 2.1152,
+      "step": 418
+    },
+    {
+      "epoch": 0.05311192800101407,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.00011557603686635946,
+      "loss": 2.7291,
+      "step": 419
+    },
+    {
+      "epoch": 0.05323868677905945,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011585253456221199,
+      "loss": 2.3185,
+      "step": 420
+    },
+    {
+      "epoch": 0.05336544555710483,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011612903225806452,
+      "loss": 1.657,
+      "step": 421
+    },
+    {
+      "epoch": 0.05349220433515021,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011640552995391705,
+      "loss": 2.2992,
+      "step": 422
+    },
+    {
+      "epoch": 0.05361896311319559,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011668202764976959,
+      "loss": 2.1632,
+      "step": 423
+    },
+    {
+      "epoch": 0.053745721891240966,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00011695852534562213,
+      "loss": 2.1941,
+      "step": 424
+    },
+    {
+      "epoch": 0.05387248066928635,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011723502304147465,
+      "loss": 2.4234,
+      "step": 425
+    },
+    {
+      "epoch": 0.053999239447331725,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001175115207373272,
+      "loss": 2.2055,
+      "step": 426
+    },
+    {
+      "epoch": 0.05412599822537711,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011778801843317972,
+      "loss": 2.3199,
+      "step": 427
+    },
+    {
+      "epoch": 0.05425275700342249,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011806451612903227,
+      "loss": 2.2118,
+      "step": 428
+    },
+    {
+      "epoch": 0.054379515781467866,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011834101382488478,
+      "loss": 2.4926,
+      "step": 429
+    },
+    {
+      "epoch": 0.05450627455951325,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011861751152073733,
+      "loss": 2.5658,
+      "step": 430
+    },
+    {
+      "epoch": 0.054633033337558624,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011889400921658987,
+      "loss": 1.9025,
+      "step": 431
+    },
+    {
+      "epoch": 0.05475979211560401,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001191705069124424,
+      "loss": 1.9359,
+      "step": 432
+    },
+    {
+      "epoch": 0.05488655089364938,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011944700460829493,
+      "loss": 2.6443,
+      "step": 433
+    },
+    {
+      "epoch": 0.055013309671694766,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011972350230414746,
+      "loss": 2.2787,
+      "step": 434
+    },
+    {
+      "epoch": 0.05514006844974014,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00012,
+      "loss": 1.9707,
+      "step": 435
+    },
+    {
+      "epoch": 0.055266827227785524,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011999999564342667,
+      "loss": 2.3204,
+      "step": 436
+    },
+    {
+      "epoch": 0.05539358600583091,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0001199999825737073,
+      "loss": 2.1407,
+      "step": 437
+    },
+    {
+      "epoch": 0.05552034478387628,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011999996079084382,
+      "loss": 2.2962,
+      "step": 438
+    },
+    {
+      "epoch": 0.055647103561921665,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011999993029483935,
+      "loss": 2.325,
+      "step": 439
+    },
+    {
+      "epoch": 0.05577386233996704,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011999989108569834,
+      "loss": 2.3317,
+      "step": 440
+    },
+    {
+      "epoch": 0.055900621118012424,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0001199998431634265,
+      "loss": 2.0142,
+      "step": 441
+    },
+    {
+      "epoch": 0.0560273798960578,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011999978652803075,
+      "loss": 2.8675,
+      "step": 442
+    },
+    {
+      "epoch": 0.05615413867410318,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011999972117951936,
+      "loss": 2.1372,
+      "step": 443
+    },
+    {
+      "epoch": 0.05628089745214856,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001199996471179018,
+      "loss": 2.1987,
+      "step": 444
+    },
+    {
+      "epoch": 0.05640765623019394,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011999956434318879,
+      "loss": 2.0152,
+      "step": 445
+    },
+    {
+      "epoch": 0.056534415008239323,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011999947285539242,
+      "loss": 2.7334,
+      "step": 446
+    },
+    {
+      "epoch": 0.0566611737862847,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011999937265452592,
+      "loss": 2.0989,
+      "step": 447
+    },
+    {
+      "epoch": 0.05678793256433008,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011999926374060386,
+      "loss": 1.8519,
+      "step": 448
+    },
+    {
+      "epoch": 0.05691469134237546,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011999914611364205,
+      "loss": 1.8668,
+      "step": 449
+    },
+    {
+      "epoch": 0.05704145012042084,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011999901977365759,
+      "loss": 2.182,
+      "step": 450
+    },
+    {
+      "epoch": 0.057168208898466216,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0001199988847206688,
+      "loss": 2.5253,
+      "step": 451
+    },
+    {
+      "epoch": 0.0572949676765116,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011999874095469532,
+      "loss": 1.9986,
+      "step": 452
+    },
+    {
+      "epoch": 0.057421726454556975,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000119998588475758,
+      "loss": 2.2526,
+      "step": 453
+    },
+    {
+      "epoch": 0.05754848523260236,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011999842728387901,
+      "loss": 2.3086,
+      "step": 454
+    },
+    {
+      "epoch": 0.05767524401064774,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011999825737908175,
+      "loss": 2.2628,
+      "step": 455
+    },
+    {
+      "epoch": 0.057802002788693116,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011999807876139088,
+      "loss": 2.4269,
+      "step": 456
+    },
+    {
+      "epoch": 0.0579287615667385,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011999789143083236,
+      "loss": 2.2286,
+      "step": 457
+    },
+    {
+      "epoch": 0.058055520344783874,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011999769538743336,
+      "loss": 1.9248,
+      "step": 458
+    },
+    {
+      "epoch": 0.05818227912282926,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011999749063122237,
+      "loss": 2.3115,
+      "step": 459
+    },
+    {
+      "epoch": 0.05830903790087463,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011999727716222914,
+      "loss": 2.4456,
+      "step": 460
+    },
+    {
+      "epoch": 0.058435796678920016,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011999705498048465,
+      "loss": 2.441,
+      "step": 461
+    },
+    {
+      "epoch": 0.05856255545696539,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00011999682408602119,
+      "loss": 2.6528,
+      "step": 462
+    },
+    {
+      "epoch": 0.058689314235010774,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011999658447887225,
+      "loss": 1.8711,
+      "step": 463
+    },
+    {
+      "epoch": 0.05881607301305616,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011999633615907265,
+      "loss": 2.5684,
+      "step": 464
+    },
+    {
+      "epoch": 0.05894283179110153,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011999607912665845,
+      "loss": 2.3015,
+      "step": 465
+    },
+    {
+      "epoch": 0.059069590569146915,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011999581338166696,
+      "loss": 2.2543,
+      "step": 466
+    },
+    {
+      "epoch": 0.05919634934719229,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011999553892413681,
+      "loss": 2.1107,
+      "step": 467
+    },
+    {
+      "epoch": 0.059323108125237674,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011999525575410781,
+      "loss": 2.3834,
+      "step": 468
+    },
+    {
+      "epoch": 0.05944986690328305,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0001199949638716211,
+      "loss": 2.1325,
+      "step": 469
+    },
+    {
+      "epoch": 0.05957662568132843,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00011999466327671907,
+      "loss": 2.8059,
+      "step": 470
+    },
+    {
+      "epoch": 0.059703384459373815,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011999435396944537,
+      "loss": 2.0948,
+      "step": 471
+    },
+    {
+      "epoch": 0.05983014323741919,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011999403594984492,
+      "loss": 2.1935,
+      "step": 472
+    },
+    {
+      "epoch": 0.059956902015464573,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011999370921796391,
+      "loss": 1.9857,
+      "step": 473
+    },
+    {
+      "epoch": 0.06008366079350995,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011999337377384979,
+      "loss": 2.4936,
+      "step": 474
+    },
+    {
+      "epoch": 0.06021041957155533,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011999302961755125,
+      "loss": 2.361,
+      "step": 475
+    },
+    {
+      "epoch": 0.06033717834960071,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011999267674911826,
+      "loss": 2.0592,
+      "step": 476
+    },
+    {
+      "epoch": 0.06046393712764609,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0001199923151686021,
+      "loss": 2.2916,
+      "step": 477
+    },
+    {
+      "epoch": 0.060590695905691466,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011999194487605526,
+      "loss": 2.1491,
+      "step": 478
+    },
+    {
+      "epoch": 0.06071745468373685,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011999156587153153,
+      "loss": 2.0982,
+      "step": 479
+    },
+    {
+      "epoch": 0.06084421346178223,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011999117815508591,
+      "loss": 2.0827,
+      "step": 480
+    },
+    {
+      "epoch": 0.06097097223982761,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011999078172677474,
+      "loss": 2.1167,
+      "step": 481
+    },
+    {
+      "epoch": 0.06109773101787299,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011999037658665559,
+      "loss": 1.9961,
+      "step": 482
+    },
+    {
+      "epoch": 0.061224489795918366,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011998996273478724,
+      "loss": 2.4519,
+      "step": 483
+    },
+    {
+      "epoch": 0.06135124857396375,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011998954017122987,
+      "loss": 2.3454,
+      "step": 484
+    },
+    {
+      "epoch": 0.061478007352009124,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011998910889604478,
+      "loss": 2.5932,
+      "step": 485
+    },
+    {
+      "epoch": 0.06160476613005451,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011998866890929464,
+      "loss": 2.3735,
+      "step": 486
+    },
+    {
+      "epoch": 0.06173152490809988,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011998822021104332,
+      "loss": 2.9586,
+      "step": 487
+    },
+    {
+      "epoch": 0.061858283686145266,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011998776280135599,
+      "loss": 2.3048,
+      "step": 488
+    },
+    {
+      "epoch": 0.06198504246419065,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011998729668029908,
+      "loss": 2.1469,
+      "step": 489
+    },
+    {
+      "epoch": 0.062111801242236024,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011998682184794025,
+      "loss": 2.1199,
+      "step": 490
+    },
+    {
+      "epoch": 0.06223856002028141,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011998633830434847,
+      "loss": 2.0902,
+      "step": 491
+    },
+    {
+      "epoch": 0.06236531879832678,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.000119985846049594,
+      "loss": 2.0548,
+      "step": 492
+    },
+    {
+      "epoch": 0.062492077576372165,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011998534508374828,
+      "loss": 2.055,
+      "step": 493
+    },
+    {
+      "epoch": 0.06261883635441755,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011998483540688406,
+      "loss": 2.606,
+      "step": 494
+    },
+    {
+      "epoch": 0.06274559513246292,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011998431701907537,
+      "loss": 1.7809,
+      "step": 495
+    },
+    {
+      "epoch": 0.0628723539105083,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00011998378992039749,
+      "loss": 2.2643,
+      "step": 496
+    },
+    {
+      "epoch": 0.06299911268855368,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011998325411092697,
+      "loss": 1.9097,
+      "step": 497
+    },
+    {
+      "epoch": 0.06312587146659907,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011998270959074158,
+      "loss": 2.3142,
+      "step": 498
+    },
+    {
+      "epoch": 0.06325263024464445,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011998215635992044,
+      "loss": 2.7631,
+      "step": 499
+    },
+    {
+      "epoch": 0.06337938902268982,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011998159441854389,
+      "loss": 2.0316,
+      "step": 500
+    },
+    {
+      "epoch": 0.0635061478007352,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0001199810237666935,
+      "loss": 1.8626,
+      "step": 501
+    },
+    {
+      "epoch": 0.06363290657878058,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011998044440445218,
+      "loss": 3.0385,
+      "step": 502
+    },
+    {
+      "epoch": 0.06375966535682596,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011997985633190403,
+      "loss": 2.4948,
+      "step": 503
+    },
+    {
+      "epoch": 0.06388642413487133,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011997925954913445,
+      "loss": 2.323,
+      "step": 504
+    },
+    {
+      "epoch": 0.06401318291291672,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011997865405623012,
+      "loss": 2.3155,
+      "step": 505
+    },
+    {
+      "epoch": 0.0641399416909621,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011997803985327898,
+      "loss": 2.2917,
+      "step": 506
+    },
+    {
+      "epoch": 0.06426670046900748,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011997741694037022,
+      "loss": 2.2503,
+      "step": 507
+    },
+    {
+      "epoch": 0.06439345924705286,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011997678531759427,
+      "loss": 2.3282,
+      "step": 508
+    },
+    {
+      "epoch": 0.06452021802509823,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011997614498504287,
+      "loss": 2.0595,
+      "step": 509
+    },
+    {
+      "epoch": 0.06464697680314362,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011997549594280903,
+      "loss": 1.6902,
+      "step": 510
+    },
+    {
+      "epoch": 0.064773735581189,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011997483819098696,
+      "loss": 2.2141,
+      "step": 511
+    },
+    {
+      "epoch": 0.06490049435923438,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0001199741717296722,
+      "loss": 1.9506,
+      "step": 512
+    },
+    {
+      "epoch": 0.06502725313727975,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011997349655896156,
+      "loss": 2.2917,
+      "step": 513
+    },
+    {
+      "epoch": 0.06515401191532513,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011997281267895306,
+      "loss": 1.6998,
+      "step": 514
+    },
+    {
+      "epoch": 0.06528077069337052,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011997212008974602,
+      "loss": 1.9828,
+      "step": 515
+    },
+    {
+      "epoch": 0.0654075294714159,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011997141879144099,
+      "loss": 2.0288,
+      "step": 516
+    },
+    {
+      "epoch": 0.06553428824946128,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011997070878413985,
+      "loss": 2.0933,
+      "step": 517
+    },
+    {
+      "epoch": 0.06566104702750665,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001199699900679457,
+      "loss": 1.6551,
+      "step": 518
+    },
+    {
+      "epoch": 0.06578780580555203,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011996926264296288,
+      "loss": 2.3048,
+      "step": 519
+    },
+    {
+      "epoch": 0.06591456458359742,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011996852650929706,
+      "loss": 1.6582,
+      "step": 520
+    },
+    {
+      "epoch": 0.0660413233616428,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011996778166705514,
+      "loss": 2.3414,
+      "step": 521
+    },
+    {
+      "epoch": 0.06616808213968817,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011996702811634526,
+      "loss": 1.8715,
+      "step": 522
+    },
+    {
+      "epoch": 0.06629484091773355,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011996626585727685,
+      "loss": 2.2727,
+      "step": 523
+    },
+    {
+      "epoch": 0.06642159969577893,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011996549488996066,
+      "loss": 2.2925,
+      "step": 524
+    },
+    {
+      "epoch": 0.06654835847382432,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011996471521450858,
+      "loss": 2.1146,
+      "step": 525
+    },
+    {
+      "epoch": 0.0666751172518697,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011996392683103387,
+      "loss": 2.3482,
+      "step": 526
+    },
+    {
+      "epoch": 0.06680187602991507,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011996312973965099,
+      "loss": 2.6984,
+      "step": 527
+    },
+    {
+      "epoch": 0.06692863480796045,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011996232394047575,
+      "loss": 2.4002,
+      "step": 528
+    },
+    {
+      "epoch": 0.06705539358600583,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011996150943362511,
+      "loss": 1.9119,
+      "step": 529
+    },
+    {
+      "epoch": 0.06718215236405121,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011996068621921738,
+      "loss": 2.3112,
+      "step": 530
+    },
+    {
+      "epoch": 0.06730891114209658,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0001199598542973721,
+      "loss": 2.1442,
+      "step": 531
+    },
+    {
+      "epoch": 0.06743566992014197,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011995901366821007,
+      "loss": 2.1507,
+      "step": 532
+    },
+    {
+      "epoch": 0.06756242869818735,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00011995816433185337,
+      "loss": 1.9828,
+      "step": 533
+    },
+    {
+      "epoch": 0.06768918747623273,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011995730628842537,
+      "loss": 2.3836,
+      "step": 534
+    },
+    {
+      "epoch": 0.06781594625427811,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011995643953805062,
+      "loss": 1.911,
+      "step": 535
+    },
+    {
+      "epoch": 0.06794270503232348,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011995556408085505,
+      "loss": 1.9752,
+      "step": 536
+    },
+    {
+      "epoch": 0.06806946381036887,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011995467991696576,
+      "loss": 2.4653,
+      "step": 537
+    },
+    {
+      "epoch": 0.06819622258841425,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011995378704651113,
+      "loss": 2.3855,
+      "step": 538
+    },
+    {
+      "epoch": 0.06832298136645963,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00011995288546962085,
+      "loss": 2.1457,
+      "step": 539
+    },
+    {
+      "epoch": 0.068449740144505,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00011995197518642582,
+      "loss": 2.0958,
+      "step": 540
+    },
+    {
+      "epoch": 0.06857649892255038,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011995105619705828,
+      "loss": 2.4348,
+      "step": 541
+    },
+    {
+      "epoch": 0.06870325770059577,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011995012850165164,
+      "loss": 1.7981,
+      "step": 542
+    },
+    {
+      "epoch": 0.06883001647864115,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011994919210034063,
+      "loss": 1.9514,
+      "step": 543
+    },
+    {
+      "epoch": 0.06895677525668653,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011994824699326122,
+      "loss": 2.7192,
+      "step": 544
+    },
+    {
+      "epoch": 0.0690835340347319,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011994729318055069,
+      "loss": 2.3608,
+      "step": 545
+    },
+    {
+      "epoch": 0.06921029281277728,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011994633066234753,
+      "loss": 2.338,
+      "step": 546
+    },
+    {
+      "epoch": 0.06933705159082267,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011994535943879152,
+      "loss": 1.9992,
+      "step": 547
+    },
+    {
+      "epoch": 0.06946381036886805,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011994437951002371,
+      "loss": 1.6627,
+      "step": 548
+    },
+    {
+      "epoch": 0.06959056914691342,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011994339087618638,
+      "loss": 2.2918,
+      "step": 549
+    },
+    {
+      "epoch": 0.0697173279249588,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011994239353742312,
+      "loss": 1.7978,
+      "step": 550
+    },
+    {
+      "epoch": 0.06984408670300418,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011994138749387876,
+      "loss": 2.1075,
+      "step": 551
+    },
+    {
+      "epoch": 0.06997084548104957,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011994037274569938,
+      "loss": 2.4512,
+      "step": 552
+    },
+    {
+      "epoch": 0.07009760425909495,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011993934929303237,
+      "loss": 1.7422,
+      "step": 553
+    },
+    {
+      "epoch": 0.07022436303714032,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011993831713602633,
+      "loss": 2.8057,
+      "step": 554
+    },
+    {
+      "epoch": 0.0703511218151857,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011993727627483116,
+      "loss": 2.0537,
+      "step": 555
+    },
+    {
+      "epoch": 0.07047788059323108,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000119936226709598,
+      "loss": 2.1056,
+      "step": 556
+    },
+    {
+      "epoch": 0.07060463937127646,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011993516844047931,
+      "loss": 2.4015,
+      "step": 557
+    },
+    {
+      "epoch": 0.07073139814932185,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011993410146762871,
+      "loss": 1.9852,
+      "step": 558
+    },
+    {
+      "epoch": 0.07085815692736722,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011993302579120118,
+      "loss": 2.0538,
+      "step": 559
+    },
+    {
+      "epoch": 0.0709849157054126,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00011993194141135293,
+      "loss": 2.4099,
+      "step": 560
+    },
+    {
+      "epoch": 0.07111167448345798,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001199308483282414,
+      "loss": 2.1438,
+      "step": 561
+    },
+    {
+      "epoch": 0.07123843326150336,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011992974654202539,
+      "loss": 1.7242,
+      "step": 562
+    },
+    {
+      "epoch": 0.07136519203954873,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011992863605286483,
+      "loss": 1.9941,
+      "step": 563
+    },
+    {
+      "epoch": 0.07149195081759412,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011992751686092103,
+      "loss": 2.4498,
+      "step": 564
+    },
+    {
+      "epoch": 0.0716187095956395,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011992638896635651,
+      "loss": 1.9882,
+      "step": 565
+    },
+    {
+      "epoch": 0.07174546837368488,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011992525236933504,
+      "loss": 2.1906,
+      "step": 566
+    },
+    {
+      "epoch": 0.07187222715173026,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011992410707002168,
+      "loss": 2.5371,
+      "step": 567
+    },
+    {
+      "epoch": 0.07199898592977563,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0001199229530685828,
+      "loss": 2.6712,
+      "step": 568
+    },
+    {
+      "epoch": 0.07212574470782102,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011992179036518592,
+      "loss": 2.1417,
+      "step": 569
+    },
+    {
+      "epoch": 0.0722525034858664,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0001199206189599999,
+      "loss": 2.8154,
+      "step": 570
+    },
+    {
+      "epoch": 0.07237926226391178,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011991943885319489,
+      "loss": 2.2032,
+      "step": 571
+    },
+    {
+      "epoch": 0.07250602104195715,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011991825004494222,
+      "loss": 2.1526,
+      "step": 572
+    },
+    {
+      "epoch": 0.07263277982000253,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011991705253541455,
+      "loss": 1.9459,
+      "step": 573
+    },
+    {
+      "epoch": 0.07275953859804792,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011991584632478576,
+      "loss": 1.9034,
+      "step": 574
+    },
+    {
+      "epoch": 0.0728862973760933,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011991463141323103,
+      "loss": 2.1874,
+      "step": 575
+    },
+    {
+      "epoch": 0.07301305615413868,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011991340780092679,
+      "loss": 2.4372,
+      "step": 576
+    },
+    {
+      "epoch": 0.07313981493218405,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011991217548805074,
+      "loss": 2.0467,
+      "step": 577
+    },
+    {
+      "epoch": 0.07326657371022943,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011991093447478183,
+      "loss": 2.1048,
+      "step": 578
+    },
+    {
+      "epoch": 0.07339333248827482,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011990968476130024,
+      "loss": 2.3717,
+      "step": 579
+    },
+    {
+      "epoch": 0.0735200912663202,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011990842634778751,
+      "loss": 2.0886,
+      "step": 580
+    },
+    {
+      "epoch": 0.07364685004436557,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011990715923442637,
+      "loss": 1.8821,
+      "step": 581
+    },
+    {
+      "epoch": 0.07377360882241095,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0001199058834214008,
+      "loss": 1.9976,
+      "step": 582
+    },
+    {
+      "epoch": 0.07390036760045633,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0001199045989088961,
+      "loss": 2.4496,
+      "step": 583
+    },
+    {
+      "epoch": 0.07402712637850171,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0001199033056970988,
+      "loss": 2.4148,
+      "step": 584
+    },
+    {
+      "epoch": 0.0741538851565471,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0001199020037861967,
+      "loss": 2.2053,
+      "step": 585
+    },
+    {
+      "epoch": 0.07428064393459247,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011990069317637886,
+      "loss": 2.4354,
+      "step": 586
+    },
+    {
+      "epoch": 0.07440740271263785,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0001198993738678356,
+      "loss": 2.3413,
+      "step": 587
+    },
+    {
+      "epoch": 0.07453416149068323,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011989804586075852,
+      "loss": 1.8386,
+      "step": 588
+    },
+    {
+      "epoch": 0.07466092026872861,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011989670915534047,
+      "loss": 1.7969,
+      "step": 589
+    },
+    {
+      "epoch": 0.07478767904677398,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011989536375177556,
+      "loss": 2.0728,
+      "step": 590
+    },
+    {
+      "epoch": 0.07491443782481937,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011989400965025915,
+      "loss": 1.5632,
+      "step": 591
+    },
+    {
+      "epoch": 0.07504119660286475,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011989264685098793,
+      "loss": 2.2075,
+      "step": 592
+    },
+    {
+      "epoch": 0.07516795538091013,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011989127535415976,
+      "loss": 2.5995,
+      "step": 593
+    },
+    {
+      "epoch": 0.07529471415895551,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00011988989515997382,
+      "loss": 2.0291,
+      "step": 594
+    },
+    {
+      "epoch": 0.07542147293700088,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011988850626863055,
+      "loss": 2.3419,
+      "step": 595
+    },
+    {
+      "epoch": 0.07554823171504627,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011988710868033165,
+      "loss": 1.6881,
+      "step": 596
+    },
+    {
+      "epoch": 0.07567499049309165,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011988570239528005,
+      "loss": 1.6839,
+      "step": 597
+    },
+    {
+      "epoch": 0.07580174927113703,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011988428741367999,
+      "loss": 2.0041,
+      "step": 598
+    },
+    {
+      "epoch": 0.0759285080491824,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011988286373573693,
+      "loss": 2.347,
+      "step": 599
+    },
+    {
+      "epoch": 0.07605526682722778,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011988143136165767,
+      "loss": 2.8373,
+      "step": 600
+    },
+    {
+      "epoch": 0.07618202560527317,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011987999029165015,
+      "loss": 2.2407,
+      "step": 601
+    },
+    {
+      "epoch": 0.07630878438331855,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011987854052592368,
+      "loss": 1.8534,
+      "step": 602
+    },
+    {
+      "epoch": 0.07643554316136393,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011987708206468878,
+      "loss": 2.4979,
+      "step": 603
+    },
+    {
+      "epoch": 0.0765623019394093,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011987561490815723,
+      "loss": 2.3751,
+      "step": 604
+    },
+    {
+      "epoch": 0.07668906071745468,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011987413905654213,
+      "loss": 2.1247,
+      "step": 605
+    },
+    {
+      "epoch": 0.07681581949550007,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011987265451005778,
+      "loss": 2.1684,
+      "step": 606
+    },
+    {
+      "epoch": 0.07694257827354545,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00011987116126891977,
+      "loss": 2.2431,
+      "step": 607
+    },
+    {
+      "epoch": 0.07706933705159082,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011986965933334493,
+      "loss": 1.834,
+      "step": 608
+    },
+    {
+      "epoch": 0.0771960958296362,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0001198681487035514,
+      "loss": 2.3618,
+      "step": 609
+    },
+    {
+      "epoch": 0.07732285460768158,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011986662937975852,
+      "loss": 2.0986,
+      "step": 610
+    },
+    {
+      "epoch": 0.07744961338572696,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011986510136218695,
+      "loss": 1.9086,
+      "step": 611
+    },
+    {
+      "epoch": 0.07757637216377235,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011986356465105856,
+      "loss": 2.4112,
+      "step": 612
+    },
+    {
+      "epoch": 0.07770313094181772,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011986201924659656,
+      "loss": 1.9192,
+      "step": 613
+    },
+    {
+      "epoch": 0.0778298897198631,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011986046514902532,
+      "loss": 2.5386,
+      "step": 614
+    },
+    {
+      "epoch": 0.07795664849790848,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011985890235857053,
+      "loss": 3.2956,
+      "step": 615
+    },
+    {
+      "epoch": 0.07808340727595386,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011985733087545917,
+      "loss": 2.4149,
+      "step": 616
+    },
+    {
+      "epoch": 0.07821016605399923,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011985575069991942,
+      "loss": 2.296,
+      "step": 617
+    },
+    {
+      "epoch": 0.07833692483204462,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011985416183218078,
+      "loss": 1.956,
+      "step": 618
+    },
+    {
+      "epoch": 0.07846368361009,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011985256427247395,
+      "loss": 2.044,
+      "step": 619
+    },
+    {
+      "epoch": 0.07859044238813538,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011985095802103094,
+      "loss": 2.2935,
+      "step": 620
+    },
+    {
+      "epoch": 0.07871720116618076,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011984934307808502,
+      "loss": 1.8738,
+      "step": 621
+    },
+    {
+      "epoch": 0.07884395994422613,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011984771944387071,
+      "loss": 2.1287,
+      "step": 622
+    },
+    {
+      "epoch": 0.07897071872227152,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011984608711862376,
+      "loss": 1.948,
+      "step": 623
+    },
+    {
+      "epoch": 0.0790974775003169,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011984444610258125,
+      "loss": 2.0518,
+      "step": 624
+    },
+    {
+      "epoch": 0.07922423627836228,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011984279639598149,
+      "loss": 1.7571,
+      "step": 625
+    },
+    {
+      "epoch": 0.07935099505640765,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011984113799906402,
+      "loss": 1.9322,
+      "step": 626
+    },
+    {
+      "epoch": 0.07947775383445303,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011983947091206968,
+      "loss": 2.4565,
+      "step": 627
+    },
+    {
+      "epoch": 0.07960451261249842,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00011983779513524058,
+      "loss": 2.3367,
+      "step": 628
+    },
+    {
+      "epoch": 0.0797312713905438,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011983611066882005,
+      "loss": 2.5078,
+      "step": 629
+    },
+    {
+      "epoch": 0.07985803016858918,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011983441751305274,
+      "loss": 2.3488,
+      "step": 630
+    },
+    {
+      "epoch": 0.07998478894663455,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011983271566818449,
+      "loss": 1.6402,
+      "step": 631
+    },
+    {
+      "epoch": 0.08011154772467993,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011983100513446247,
+      "loss": 2.0724,
+      "step": 632
+    },
+    {
+      "epoch": 0.08023830650272532,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011982928591213507,
+      "loss": 2.1708,
+      "step": 633
+    },
+    {
+      "epoch": 0.0803650652807707,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011982755800145196,
+      "loss": 2.2252,
+      "step": 634
+    },
+    {
+      "epoch": 0.08049182405881607,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00011982582140266405,
+      "loss": 2.4464,
+      "step": 635
+    },
+    {
+      "epoch": 0.08061858283686145,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011982407611602353,
+      "loss": 2.0225,
+      "step": 636
+    },
+    {
+      "epoch": 0.08074534161490683,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011982232214178389,
+      "loss": 1.8768,
+      "step": 637
+    },
+    {
+      "epoch": 0.08087210039295221,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011982055948019977,
+      "loss": 2.2465,
+      "step": 638
+    },
+    {
+      "epoch": 0.0809988591709976,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00011981878813152721,
+      "loss": 1.8231,
+      "step": 639
+    },
+    {
+      "epoch": 0.08112561794904297,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001198170080960234,
+      "loss": 1.9809,
+      "step": 640
+    },
+    {
+      "epoch": 0.08125237672708835,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011981521937394684,
+      "loss": 1.7854,
+      "step": 641
+    },
+    {
+      "epoch": 0.08137913550513373,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011981342196555732,
+      "loss": 2.0439,
+      "step": 642
+    },
+    {
+      "epoch": 0.08150589428317911,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001198116158711158,
+      "loss": 2.6965,
+      "step": 643
+    },
+    {
+      "epoch": 0.08163265306122448,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011980980109088462,
+      "loss": 1.985,
+      "step": 644
+    },
+    {
+      "epoch": 0.08175941183926987,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001198079776251273,
+      "loss": 1.7421,
+      "step": 645
+    },
+    {
+      "epoch": 0.08188617061731525,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011980614547410861,
+      "loss": 2.3657,
+      "step": 646
+    },
+    {
+      "epoch": 0.08201292939536063,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011980430463809464,
+      "loss": 1.9375,
+      "step": 647
+    },
+    {
+      "epoch": 0.08213968817340601,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00011980245511735273,
+      "loss": 1.8679,
+      "step": 648
+    },
+    {
+      "epoch": 0.08226644695145138,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011980059691215143,
+      "loss": 2.269,
+      "step": 649
+    },
+    {
+      "epoch": 0.08239320572949677,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011979873002276063,
+      "loss": 1.7489,
+      "step": 650
+    },
+    {
+      "epoch": 0.08251996450754215,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011979685444945141,
+      "loss": 1.9268,
+      "step": 651
+    },
+    {
+      "epoch": 0.08264672328558753,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00011979497019249612,
+      "loss": 1.6361,
+      "step": 652
+    },
+    {
+      "epoch": 0.0827734820636329,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011979307725216843,
+      "loss": 2.2859,
+      "step": 653
+    },
+    {
+      "epoch": 0.08290024084167828,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011979117562874322,
+      "loss": 1.7728,
+      "step": 654
+    },
+    {
+      "epoch": 0.08302699961972367,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011978926532249663,
+      "loss": 1.7889,
+      "step": 655
+    },
+    {
+      "epoch": 0.08315375839776905,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00011978734633370608,
+      "loss": 2.2557,
+      "step": 656
+    },
+    {
+      "epoch": 0.08328051717581443,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011978541866265025,
+      "loss": 2.1239,
+      "step": 657
+    },
+    {
+      "epoch": 0.0834072759538598,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011978348230960906,
+      "loss": 2.6118,
+      "step": 658
+    },
+    {
+      "epoch": 0.08353403473190518,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011978153727486372,
+      "loss": 2.7797,
+      "step": 659
+    },
+    {
+      "epoch": 0.08366079350995057,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011977958355869667,
+      "loss": 1.9079,
+      "step": 660
+    },
+    {
+      "epoch": 0.08378755228799595,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011977762116139164,
+      "loss": 2.0391,
+      "step": 661
+    },
+    {
+      "epoch": 0.08391431106604132,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011977565008323361,
+      "loss": 1.9109,
+      "step": 662
+    },
+    {
+      "epoch": 0.0840410698440867,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001197736703245088,
+      "loss": 1.8121,
+      "step": 663
+    },
+    {
+      "epoch": 0.08416782862213208,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011977168188550474,
+      "loss": 1.6679,
+      "step": 664
+    },
+    {
+      "epoch": 0.08429458740017746,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00011976968476651016,
+      "loss": 2.61,
+      "step": 665
+    },
+    {
+      "epoch": 0.08442134617822285,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011976767896781508,
+      "loss": 2.5032,
+      "step": 666
+    },
+    {
+      "epoch": 0.08454810495626822,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00011976566448971082,
+      "loss": 2.2979,
+      "step": 667
+    },
+    {
+      "epoch": 0.0846748637343136,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011976364133248985,
+      "loss": 1.7313,
+      "step": 668
+    },
+    {
+      "epoch": 0.08480162251235898,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011976160949644604,
+      "loss": 2.395,
+      "step": 669
+    },
+    {
+      "epoch": 0.08492838129040436,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011975956898187444,
+      "loss": 2.5355,
+      "step": 670
+    },
+    {
+      "epoch": 0.08505514006844973,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011975751978907132,
+      "loss": 2.0805,
+      "step": 671
+    },
+    {
+      "epoch": 0.08518189884649512,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011975546191833432,
+      "loss": 2.1901,
+      "step": 672
+    },
+    {
+      "epoch": 0.0853086576245405,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011975339536996225,
+      "loss": 2.3183,
+      "step": 673
+    },
+    {
+      "epoch": 0.08543541640258588,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011975132014425523,
+      "loss": 2.1307,
+      "step": 674
+    },
+    {
+      "epoch": 0.08556217518063126,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011974923624151461,
+      "loss": 1.8928,
+      "step": 675
+    },
+    {
+      "epoch": 0.08568893395867663,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011974714366204301,
+      "loss": 1.9199,
+      "step": 676
+    },
+    {
+      "epoch": 0.08581569273672202,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011974504240614434,
+      "loss": 2.1302,
+      "step": 677
+    },
+    {
+      "epoch": 0.0859424515147674,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0001197429324741237,
+      "loss": 2.2597,
+      "step": 678
+    },
+    {
+      "epoch": 0.08606921029281278,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011974081386628754,
+      "loss": 2.5662,
+      "step": 679
+    },
+    {
+      "epoch": 0.08619596907085816,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00011973868658294348,
+      "loss": 2.3477,
+      "step": 680
+    },
+    {
+      "epoch": 0.08632272784890353,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011973655062440045,
+      "loss": 2.0649,
+      "step": 681
+    },
+    {
+      "epoch": 0.08644948662694892,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011973440599096864,
+      "loss": 1.9167,
+      "step": 682
+    },
+    {
+      "epoch": 0.0865762454049943,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011973225268295953,
+      "loss": 1.9955,
+      "step": 683
+    },
+    {
+      "epoch": 0.08670300418303968,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011973009070068575,
+      "loss": 2.5441,
+      "step": 684
+    },
+    {
+      "epoch": 0.08682976296108505,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011972792004446131,
+      "loss": 1.5964,
+      "step": 685
+    },
+    {
+      "epoch": 0.08695652173913043,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001197257407146014,
+      "loss": 2.15,
+      "step": 686
+    },
+    {
+      "epoch": 0.08708328051717582,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011972355271142253,
+      "loss": 2.0484,
+      "step": 687
+    },
+    {
+      "epoch": 0.0872100392952212,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011972135603524243,
+      "loss": 2.1226,
+      "step": 688
+    },
+    {
+      "epoch": 0.08733679807326658,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011971915068638009,
+      "loss": 2.4057,
+      "step": 689
+    },
+    {
+      "epoch": 0.08746355685131195,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00011971693666515577,
+      "loss": 2.2777,
+      "step": 690
+    },
+    {
+      "epoch": 0.08759031562935733,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011971471397189101,
+      "loss": 2.3298,
+      "step": 691
+    },
+    {
+      "epoch": 0.08771707440740271,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011971248260690855,
+      "loss": 1.7263,
+      "step": 692
+    },
+    {
+      "epoch": 0.0878438331854481,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011971024257053246,
+      "loss": 1.8121,
+      "step": 693
+    },
+    {
+      "epoch": 0.08797059196349347,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011970799386308802,
+      "loss": 2.1231,
+      "step": 694
+    },
+    {
+      "epoch": 0.08809735074153885,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011970573648490178,
+      "loss": 1.5007,
+      "step": 695
+    },
+    {
+      "epoch": 0.08822410951958423,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011970347043630157,
+      "loss": 2.1878,
+      "step": 696
+    },
+    {
+      "epoch": 0.08835086829762961,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011970119571761648,
+      "loss": 1.8548,
+      "step": 697
+    },
+    {
+      "epoch": 0.088477627075675,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011969891232917678,
+      "loss": 2.0327,
+      "step": 698
+    },
+    {
+      "epoch": 0.08860438585372037,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011969662027131412,
+      "loss": 2.5235,
+      "step": 699
+    },
+    {
+      "epoch": 0.08873114463176575,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011969431954436135,
+      "loss": 1.8969,
+      "step": 700
+    },
+    {
+      "epoch": 0.08885790340981113,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011969201014865251,
+      "loss": 2.3541,
+      "step": 701
+    },
+    {
+      "epoch": 0.08898466218785651,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011968969208452307,
+      "loss": 2.3007,
+      "step": 702
+    },
+    {
+      "epoch": 0.08911142096590188,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011968736535230958,
+      "loss": 2.0309,
+      "step": 703
+    },
+    {
+      "epoch": 0.08923817974394727,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011968502995234996,
+      "loss": 1.8516,
+      "step": 704
+    },
+    {
+      "epoch": 0.08936493852199265,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011968268588498334,
+      "loss": 2.1298,
+      "step": 705
+    },
+    {
+      "epoch": 0.08949169730003803,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011968033315055015,
+      "loss": 1.8224,
+      "step": 706
+    },
+    {
+      "epoch": 0.08961845607808341,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011967797174939201,
+      "loss": 1.7282,
+      "step": 707
+    },
+    {
+      "epoch": 0.08974521485612878,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.00011967560168185188,
+      "loss": 1.8683,
+      "step": 708
+    },
+    {
+      "epoch": 0.08987197363417417,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011967322294827391,
+      "loss": 2.2558,
+      "step": 709
+    },
+    {
+      "epoch": 0.08999873241221955,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011967083554900356,
+      "loss": 2.0726,
+      "step": 710
+    },
+    {
+      "epoch": 0.09012549119026493,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00011966843948438751,
+      "loss": 2.7413,
+      "step": 711
+    },
+    {
+      "epoch": 0.0902522499683103,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011966603475477373,
+      "loss": 2.3527,
+      "step": 712
+    },
+    {
+      "epoch": 0.09037900874635568,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011966362136051142,
+      "loss": 2.3866,
+      "step": 713
+    },
+    {
+      "epoch": 0.09050576752440107,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011966119930195105,
+      "loss": 1.8672,
+      "step": 714
+    },
+    {
+      "epoch": 0.09063252630244645,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011965876857944435,
+      "loss": 2.241,
+      "step": 715
+    },
+    {
+      "epoch": 0.09075928508049183,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011965632919334432,
+      "loss": 1.8195,
+      "step": 716
+    },
+    {
+      "epoch": 0.0908860438585372,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001196538811440052,
+      "loss": 2.044,
+      "step": 717
+    },
+    {
+      "epoch": 0.09101280263658258,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011965142443178247,
+      "loss": 1.8305,
+      "step": 718
+    },
+    {
+      "epoch": 0.09113956141462796,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011964895905703293,
+      "loss": 2.2118,
+      "step": 719
+    },
+    {
+      "epoch": 0.09126632019267335,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00011964648502011455,
+      "loss": 2.5554,
+      "step": 720
+    },
+    {
+      "epoch": 0.09139307897071872,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011964400232138668,
+      "loss": 1.9718,
+      "step": 721
+    },
+    {
+      "epoch": 0.0915198377487641,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00011964151096120979,
+      "loss": 1.6874,
+      "step": 722
+    },
+    {
+      "epoch": 0.09164659652680948,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0001196390109399457,
+      "loss": 2.1046,
+      "step": 723
+    },
+    {
+      "epoch": 0.09177335530485486,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011963650225795745,
+      "loss": 2.0455,
+      "step": 724
+    },
+    {
+      "epoch": 0.09190011408290025,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011963398491560935,
+      "loss": 2.0469,
+      "step": 725
+    },
+    {
+      "epoch": 0.09202687286094562,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.000119631458913267,
+      "loss": 2.1176,
+      "step": 726
+    },
+    {
+      "epoch": 0.092153631638991,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011962892425129717,
+      "loss": 2.1013,
+      "step": 727
+    },
+    {
+      "epoch": 0.09228039041703638,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.000119626380930068,
+      "loss": 2.6649,
+      "step": 728
+    },
+    {
+      "epoch": 0.09240714919508176,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011962382894994878,
+      "loss": 2.2427,
+      "step": 729
+    },
+    {
+      "epoch": 0.09253390797312713,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001196212683113101,
+      "loss": 2.2411,
+      "step": 730
+    },
+    {
+      "epoch": 0.09266066675117252,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011961869901452387,
+      "loss": 2.2934,
+      "step": 731
+    },
+    {
+      "epoch": 0.0927874255292179,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011961612105996313,
+      "loss": 1.7202,
+      "step": 732
+    },
+    {
+      "epoch": 0.09291418430726328,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011961353444800231,
+      "loss": 2.1739,
+      "step": 733
+    },
+    {
+      "epoch": 0.09304094308530866,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011961093917901702,
+      "loss": 2.2496,
+      "step": 734
+    },
+    {
+      "epoch": 0.09316770186335403,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011960833525338412,
+      "loss": 2.0156,
+      "step": 735
+    },
+    {
+      "epoch": 0.09329446064139942,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011960572267148176,
+      "loss": 2.3718,
+      "step": 736
+    },
+    {
+      "epoch": 0.0934212194194448,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011960310143368936,
+      "loss": 2.9012,
+      "step": 737
+    },
+    {
+      "epoch": 0.09354797819749018,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011960047154038753,
+      "loss": 2.3472,
+      "step": 738
+    },
+    {
+      "epoch": 0.09367473697553555,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011959783299195821,
+      "loss": 2.6858,
+      "step": 739
+    },
+    {
+      "epoch": 0.09380149575358093,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011959518578878457,
+      "loss": 2.0731,
+      "step": 740
+    },
+    {
+      "epoch": 0.09392825453162632,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011959252993125104,
+      "loss": 2.0515,
+      "step": 741
+    },
+    {
+      "epoch": 0.0940550133096717,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011958986541974326,
+      "loss": 2.0835,
+      "step": 742
+    },
+    {
+      "epoch": 0.09418177208771708,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00011958719225464821,
+      "loss": 2.1471,
+      "step": 743
+    },
+    {
+      "epoch": 0.09430853086576245,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011958451043635406,
+      "loss": 1.9637,
+      "step": 744
+    },
+    {
+      "epoch": 0.09443528964380783,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011958181996525029,
+      "loss": 2.4131,
+      "step": 745
+    },
+    {
+      "epoch": 0.09456204842185321,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011957912084172758,
+      "loss": 1.9366,
+      "step": 746
+    },
+    {
+      "epoch": 0.0946888071998986,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001195764130661779,
+      "loss": 2.0421,
+      "step": 747
+    },
+    {
+      "epoch": 0.09481556597794397,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0001195736966389945,
+      "loss": 2.4487,
+      "step": 748
+    },
+    {
+      "epoch": 0.09494232475598935,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011957097156057179,
+      "loss": 2.1194,
+      "step": 749
+    },
+    {
+      "epoch": 0.09506908353403473,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011956823783130558,
+      "loss": 2.0232,
+      "step": 750
+    },
+    {
+      "epoch": 0.09519584231208011,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011956549545159281,
+      "loss": 1.7714,
+      "step": 751
+    },
+    {
+      "epoch": 0.0953226010901255,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011956274442183175,
+      "loss": 1.8993,
+      "step": 752
+    },
+    {
+      "epoch": 0.09544935986817087,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001195599847424219,
+      "loss": 2.0853,
+      "step": 753
+    },
+    {
+      "epoch": 0.09557611864621625,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011955721641376398,
+      "loss": 2.4571,
+      "step": 754
+    },
+    {
+      "epoch": 0.09570287742426163,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011955443943626006,
+      "loss": 2.002,
+      "step": 755
+    },
+    {
+      "epoch": 0.09582963620230701,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011955165381031339,
+      "loss": 1.9114,
+      "step": 756
+    },
+    {
+      "epoch": 0.09595639498035238,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011954885953632848,
+      "loss": 1.9698,
+      "step": 757
+    },
+    {
+      "epoch": 0.09608315375839777,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011954605661471113,
+      "loss": 2.376,
+      "step": 758
+    },
+    {
+      "epoch": 0.09620991253644315,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011954324504586837,
+      "loss": 2.3287,
+      "step": 759
+    },
+    {
+      "epoch": 0.09633667131448853,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001195404248302085,
+      "loss": 2.1746,
+      "step": 760
+    },
+    {
+      "epoch": 0.09646343009253391,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011953759596814105,
+      "loss": 1.9198,
+      "step": 761
+    },
+    {
+      "epoch": 0.09659018887057928,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011953475846007686,
+      "loss": 2.4026,
+      "step": 762
+    },
+    {
+      "epoch": 0.09671694764862467,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011953191230642796,
+      "loss": 1.7982,
+      "step": 763
+    },
+    {
+      "epoch": 0.09684370642667005,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011952905750760767,
+      "loss": 1.6029,
+      "step": 764
+    },
+    {
+      "epoch": 0.09697046520471543,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011952619406403057,
+      "loss": 2.3202,
+      "step": 765
+    },
+    {
+      "epoch": 0.0970972239827608,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0001195233219761125,
+      "loss": 1.938,
+      "step": 766
+    },
+    {
+      "epoch": 0.09722398276080618,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011952044124427051,
+      "loss": 2.004,
+      "step": 767
+    },
+    {
+      "epoch": 0.09735074153885157,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011951755186892294,
+      "loss": 1.8755,
+      "step": 768
+    },
+    {
+      "epoch": 0.09747750031689695,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011951465385048943,
+      "loss": 1.9519,
+      "step": 769
+    },
+    {
+      "epoch": 0.09760425909494233,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011951174718939076,
+      "loss": 2.5218,
+      "step": 770
+    },
+    {
+      "epoch": 0.0977310178729877,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0001195088318860491,
+      "loss": 1.939,
+      "step": 771
+    },
+    {
+      "epoch": 0.09785777665103308,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011950590794088774,
+      "loss": 1.885,
+      "step": 772
+    },
+    {
+      "epoch": 0.09798453542907846,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011950297535433134,
+      "loss": 1.7499,
+      "step": 773
+    },
+    {
+      "epoch": 0.09811129420712385,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011950003412680576,
+      "loss": 1.4875,
+      "step": 774
+    },
+    {
+      "epoch": 0.09823805298516922,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00011949708425873811,
+      "loss": 2.3029,
+      "step": 775
+    },
+    {
+      "epoch": 0.0983648117632146,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011949412575055678,
+      "loss": 2.1795,
+      "step": 776
+    },
+    {
+      "epoch": 0.09849157054125998,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0001194911586026914,
+      "loss": 1.8516,
+      "step": 777
+    },
+    {
+      "epoch": 0.09861832931930536,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011948818281557285,
+      "loss": 2.0553,
+      "step": 778
+    },
+    {
+      "epoch": 0.09874508809735075,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011948519838963327,
+      "loss": 2.2203,
+      "step": 779
+    },
+    {
+      "epoch": 0.09887184687539612,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.00011948220532530605,
+      "loss": 2.6721,
+      "step": 780
+    },
+    {
+      "epoch": 0.0989986056534415,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011947920362302586,
+      "loss": 2.2112,
+      "step": 781
+    },
+    {
+      "epoch": 0.09912536443148688,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011947619328322861,
+      "loss": 1.5344,
+      "step": 782
+    },
+    {
+      "epoch": 0.09925212320953226,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011947317430635142,
+      "loss": 1.8941,
+      "step": 783
+    },
+    {
+      "epoch": 0.09937888198757763,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011947014669283273,
+      "loss": 2.3235,
+      "step": 784
+    },
+    {
+      "epoch": 0.09950564076562302,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0001194671104431122,
+      "loss": 2.0323,
+      "step": 785
+    },
+    {
+      "epoch": 0.0996323995436684,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011946406555763074,
+      "loss": 2.2627,
+      "step": 786
+    },
+    {
+      "epoch": 0.09975915832171378,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011946101203683057,
+      "loss": 2.0878,
+      "step": 787
+    },
+    {
+      "epoch": 0.09988591709975916,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00011945794988115509,
+      "loss": 2.3721,
+      "step": 788
+    },
+    {
+      "epoch": 0.10001267587780453,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011945487909104896,
+      "loss": 1.7971,
+      "step": 789
+    },
+    {
+      "epoch": 0.10013943465584992,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011945179966695816,
+      "loss": 2.027,
+      "step": 790
+    },
+    {
+      "epoch": 0.1002661934338953,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011944871160932986,
+      "loss": 1.7246,
+      "step": 791
+    },
+    {
+      "epoch": 0.10039295221194068,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0001194456149186125,
+      "loss": 1.9774,
+      "step": 792
+    },
+    {
+      "epoch": 0.10051971098998605,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00011944250959525579,
+      "loss": 1.9793,
+      "step": 793
+    },
+    {
+      "epoch": 0.10064646976803143,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011943939563971068,
+      "loss": 1.6943,
+      "step": 794
+    },
+    {
+      "epoch": 0.10077322854607682,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011943627305242937,
+      "loss": 1.7865,
+      "step": 795
+    },
+    {
+      "epoch": 0.1008999873241222,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011943314183386531,
+      "loss": 2.0512,
+      "step": 796
+    },
+    {
+      "epoch": 0.10102674610216758,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011943000198447325,
+      "loss": 2.1047,
+      "step": 797
+    },
+    {
+      "epoch": 0.10115350488021295,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011942685350470912,
+      "loss": 1.9063,
+      "step": 798
+    },
+    {
+      "epoch": 0.10128026365825833,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011942369639503015,
+      "loss": 2.6003,
+      "step": 799
+    },
+    {
+      "epoch": 0.10140702243630371,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011942053065589483,
+      "loss": 1.9722,
+      "step": 800
+    },
+    {
+      "epoch": 0.1015337812143491,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011941735628776285,
+      "loss": 2.2075,
+      "step": 801
+    },
+    {
+      "epoch": 0.10166053999239448,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011941417329109522,
+      "loss": 2.4068,
+      "step": 802
+    },
+    {
+      "epoch": 0.10178729877043985,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011941098166635416,
+      "loss": 2.1338,
+      "step": 803
+    },
+    {
+      "epoch": 0.10191405754848523,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00011940778141400316,
+      "loss": 2.2051,
+      "step": 804
+    },
+    {
+      "epoch": 0.10204081632653061,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011940457253450694,
+      "loss": 1.949,
+      "step": 805
+    },
+    {
+      "epoch": 0.102167575104576,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0001194013550283315,
+      "loss": 2.0288,
+      "step": 806
+    },
+    {
+      "epoch": 0.10229433388262137,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011939812889594411,
+      "loss": 1.9459,
+      "step": 807
+    },
+    {
+      "epoch": 0.10242109266066675,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011939489413781323,
+      "loss": 2.3006,
+      "step": 808
+    },
+    {
+      "epoch": 0.10254785143871213,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011939165075440863,
+      "loss": 2.0992,
+      "step": 809
+    },
+    {
+      "epoch": 0.10267461021675751,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001193883987462013,
+      "loss": 2.0763,
+      "step": 810
+    },
+    {
+      "epoch": 0.1028013689948029,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011938513811366349,
+      "loss": 1.8806,
+      "step": 811
+    },
+    {
+      "epoch": 0.10292812777284827,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011938186885726873,
+      "loss": 2.4763,
+      "step": 812
+    },
+    {
+      "epoch": 0.10305488655089365,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00011937859097749175,
+      "loss": 1.8215,
+      "step": 813
+    },
+    {
+      "epoch": 0.10318164532893903,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011937530447480858,
+      "loss": 2.2605,
+      "step": 814
+    },
+    {
+      "epoch": 0.10330840410698441,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011937200934969648,
+      "loss": 1.9445,
+      "step": 815
+    },
+    {
+      "epoch": 0.10343516288502978,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011936870560263394,
+      "loss": 1.9748,
+      "step": 816
+    },
+    {
+      "epoch": 0.10356192166307517,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011936539323410077,
+      "loss": 2.0225,
+      "step": 817
+    },
+    {
+      "epoch": 0.10368868044112055,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011936207224457796,
+      "loss": 2.5079,
+      "step": 818
+    },
+    {
+      "epoch": 0.10381543921916593,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011935874263454779,
+      "loss": 2.2673,
+      "step": 819
+    },
+    {
+      "epoch": 0.10394219799721131,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011935540440449378,
+      "loss": 1.8439,
+      "step": 820
+    },
+    {
+      "epoch": 0.10406895677525668,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0001193520575549007,
+      "loss": 2.134,
+      "step": 821
+    },
+    {
+      "epoch": 0.10419571555330207,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011934870208625458,
+      "loss": 2.3663,
+      "step": 822
+    },
+    {
+      "epoch": 0.10432247433134745,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001193453379990427,
+      "loss": 2.3123,
+      "step": 823
+    },
+    {
+      "epoch": 0.10444923310939283,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011934196529375362,
+      "loss": 1.8325,
+      "step": 824
+    },
+    {
+      "epoch": 0.1045759918874382,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011933858397087707,
+      "loss": 2.1381,
+      "step": 825
+    },
+    {
+      "epoch": 0.10470275066548358,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001193351940309041,
+      "loss": 1.7612,
+      "step": 826
+    },
+    {
+      "epoch": 0.10482950944352896,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.000119331795474327,
+      "loss": 1.5592,
+      "step": 827
+    },
+    {
+      "epoch": 0.10495626822157435,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011932838830163932,
+      "loss": 2.3928,
+      "step": 828
+    },
+    {
+      "epoch": 0.10508302699961973,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011932497251333582,
+      "loss": 2.0957,
+      "step": 829
+    },
+    {
+      "epoch": 0.1052097857776651,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011932154810991255,
+      "loss": 2.2185,
+      "step": 830
+    },
+    {
+      "epoch": 0.10533654455571048,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0001193181150918668,
+      "loss": 2.3615,
+      "step": 831
+    },
+    {
+      "epoch": 0.10546330333375586,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0001193146734596971,
+      "loss": 2.3545,
+      "step": 832
+    },
+    {
+      "epoch": 0.10559006211180125,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011931122321390325,
+      "loss": 1.8813,
+      "step": 833
+    },
+    {
+      "epoch": 0.10571682088984662,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0001193077643549863,
+      "loss": 2.3247,
+      "step": 834
+    },
+    {
+      "epoch": 0.105843579667892,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011930429688344852,
+      "loss": 1.9465,
+      "step": 835
+    },
+    {
+      "epoch": 0.10597033844593738,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011930082079979346,
+      "loss": 1.6975,
+      "step": 836
+    },
+    {
+      "epoch": 0.10609709722398276,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00011929733610452594,
+      "loss": 2.9427,
+      "step": 837
+    },
+    {
+      "epoch": 0.10622385600202815,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011929384279815197,
+      "loss": 2.4923,
+      "step": 838
+    },
+    {
+      "epoch": 0.10635061478007352,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011929034088117887,
+      "loss": 2.6193,
+      "step": 839
+    },
+    {
+      "epoch": 0.1064773735581189,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00011928683035411516,
+      "loss": 2.2488,
+      "step": 840
+    },
+    {
+      "epoch": 0.10660413233616428,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011928331121747065,
+      "loss": 2.3346,
+      "step": 841
+    },
+    {
+      "epoch": 0.10673089111420966,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011927978347175639,
+      "loss": 1.835,
+      "step": 842
+    },
+    {
+      "epoch": 0.10685764989225503,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011927624711748467,
+      "loss": 1.8929,
+      "step": 843
+    },
+    {
+      "epoch": 0.10698440867030042,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011927270215516903,
+      "loss": 2.0617,
+      "step": 844
+    },
+    {
+      "epoch": 0.1071111674483458,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011926914858532429,
+      "loss": 2.0454,
+      "step": 845
+    },
+    {
+      "epoch": 0.10723792622639118,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011926558640846647,
+      "loss": 2.1259,
+      "step": 846
+    },
+    {
+      "epoch": 0.10736468500443656,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.00011926201562511287,
+      "loss": 2.4442,
+      "step": 847
+    },
+    {
+      "epoch": 0.10749144378248193,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011925843623578205,
+      "loss": 1.5956,
+      "step": 848
+    },
+    {
+      "epoch": 0.10761820256052732,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011925484824099378,
+      "loss": 2.1079,
+      "step": 849
+    },
+    {
+      "epoch": 0.1077449613385727,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011925125164126916,
+      "loss": 2.1141,
+      "step": 850
+    },
+    {
+      "epoch": 0.10787172011661808,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0001192476464371304,
+      "loss": 2.0957,
+      "step": 851
+    },
+    {
+      "epoch": 0.10799847889466345,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011924403262910113,
+      "loss": 2.2518,
+      "step": 852
+    },
+    {
+      "epoch": 0.10812523767270883,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011924041021770611,
+      "loss": 2.2177,
+      "step": 853
+    },
+    {
+      "epoch": 0.10825199645075421,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011923677920347135,
+      "loss": 2.1927,
+      "step": 854
+    },
+    {
+      "epoch": 0.1083787552287996,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011923313958692419,
+      "loss": 2.0256,
+      "step": 855
+    },
+    {
+      "epoch": 0.10850551400684498,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011922949136859316,
+      "loss": 2.3203,
+      "step": 856
+    },
+    {
+      "epoch": 0.10863227278489035,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011922583454900804,
+      "loss": 2.2034,
+      "step": 857
+    },
+    {
+      "epoch": 0.10875903156293573,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011922216912869987,
+      "loss": 2.2134,
+      "step": 858
+    },
+    {
+      "epoch": 0.10888579034098111,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011921849510820093,
+      "loss": 2.4265,
+      "step": 859
+    },
+    {
+      "epoch": 0.1090125491190265,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00011921481248804481,
+      "loss": 1.8787,
+      "step": 860
+    },
+    {
+      "epoch": 0.10913930789707187,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011921112126876625,
+      "loss": 1.6549,
+      "step": 861
+    },
+    {
+      "epoch": 0.10926606667511725,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011920742145090128,
+      "loss": 2.3385,
+      "step": 862
+    },
+    {
+      "epoch": 0.10939282545316263,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0001192037130349872,
+      "loss": 2.4479,
+      "step": 863
+    },
+    {
+      "epoch": 0.10951958423120801,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011919999602156256,
+      "loss": 2.0568,
+      "step": 864
+    },
+    {
+      "epoch": 0.1096463430092534,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011919627041116709,
+      "loss": 2.3132,
+      "step": 865
+    },
+    {
+      "epoch": 0.10977310178729877,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0001191925362043419,
+      "loss": 2.0651,
+      "step": 866
+    },
+    {
+      "epoch": 0.10989986056534415,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011918879340162918,
+      "loss": 1.8188,
+      "step": 867
+    },
+    {
+      "epoch": 0.11002661934338953,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011918504200357251,
+      "loss": 1.7025,
+      "step": 868
+    },
+    {
+      "epoch": 0.11015337812143491,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011918128201071667,
+      "loss": 1.9744,
+      "step": 869
+    },
+    {
+      "epoch": 0.11028013689948028,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011917751342360765,
+      "loss": 1.9545,
+      "step": 870
+    },
+    {
+      "epoch": 0.11040689567752567,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011917373624279276,
+      "loss": 2.4713,
+      "step": 871
+    },
+    {
+      "epoch": 0.11053365445557105,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011916995046882045,
+      "loss": 2.1163,
+      "step": 872
+    },
+    {
+      "epoch": 0.11066041323361643,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011916615610224058,
+      "loss": 2.5615,
+      "step": 873
+    },
+    {
+      "epoch": 0.11078717201166181,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011916235314360408,
+      "loss": 1.7737,
+      "step": 874
+    },
+    {
+      "epoch": 0.11091393078970718,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011915854159346326,
+      "loss": 2.237,
+      "step": 875
+    },
+    {
+      "epoch": 0.11104068956775257,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011915472145237163,
+      "loss": 1.9866,
+      "step": 876
+    },
+    {
+      "epoch": 0.11116744834579795,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00011915089272088392,
+      "loss": 2.194,
+      "step": 877
+    },
+    {
+      "epoch": 0.11129420712384333,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00011914705539955616,
+      "loss": 2.4161,
+      "step": 878
+    },
+    {
+      "epoch": 0.1114209659018887,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011914320948894561,
+      "loss": 2.0445,
+      "step": 879
+    },
+    {
+      "epoch": 0.11154772467993408,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011913935498961073,
+      "loss": 2.358,
+      "step": 880
+    },
+    {
+      "epoch": 0.11167448345797946,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011913549190211129,
+      "loss": 1.9631,
+      "step": 881
+    },
+    {
+      "epoch": 0.11180124223602485,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001191316202270083,
+      "loss": 2.1943,
+      "step": 882
+    },
+    {
+      "epoch": 0.11192800101407023,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011912773996486399,
+      "loss": 1.7102,
+      "step": 883
+    },
+    {
+      "epoch": 0.1120547597921156,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011912385111624182,
+      "loss": 2.1962,
+      "step": 884
+    },
+    {
+      "epoch": 0.11218151857016098,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011911995368170656,
+      "loss": 1.858,
+      "step": 885
+    },
+    {
+      "epoch": 0.11230827734820636,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011911604766182418,
+      "loss": 1.8814,
+      "step": 886
+    },
+    {
+      "epoch": 0.11243503612625175,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011911213305716191,
+      "loss": 2.0915,
+      "step": 887
+    },
+    {
+      "epoch": 0.11256179490429712,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011910820986828823,
+      "loss": 2.1956,
+      "step": 888
+    },
+    {
+      "epoch": 0.1126885536823425,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011910427809577285,
+      "loss": 1.9686,
+      "step": 889
+    },
+    {
+      "epoch": 0.11281531246038788,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011910033774018675,
+      "loss": 2.0356,
+      "step": 890
+    },
+    {
+      "epoch": 0.11294207123843326,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011909638880210214,
+      "loss": 2.3523,
+      "step": 891
+    },
+    {
+      "epoch": 0.11306883001647865,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001190924312820925,
+      "loss": 2.0312,
+      "step": 892
+    },
+    {
+      "epoch": 0.11319558879452402,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011908846518073248,
+      "loss": 1.9166,
+      "step": 893
+    },
+    {
+      "epoch": 0.1133223475725694,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001190844904985981,
+      "loss": 2.4145,
+      "step": 894
+    },
+    {
+      "epoch": 0.11344910635061478,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011908050723626653,
+      "loss": 2.2743,
+      "step": 895
+    },
+    {
+      "epoch": 0.11357586512866016,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011907651539431621,
+      "loss": 1.7663,
+      "step": 896
+    },
+    {
+      "epoch": 0.11370262390670553,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011907251497332685,
+      "loss": 2.7327,
+      "step": 897
+    },
+    {
+      "epoch": 0.11382938268475092,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.00011906850597387938,
+      "loss": 2.132,
+      "step": 898
+    },
+    {
+      "epoch": 0.1139561414627963,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011906448839655597,
+      "loss": 1.7172,
+      "step": 899
+    },
+    {
+      "epoch": 0.11408290024084168,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011906046224194008,
+      "loss": 1.7345,
+      "step": 900
+    },
+    {
+      "epoch": 0.11420965901888706,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011905642751061635,
+      "loss": 1.8894,
+      "step": 901
+    },
+    {
+      "epoch": 0.11433641779693243,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011905238420317071,
+      "loss": 2.0753,
+      "step": 902
+    },
+    {
+      "epoch": 0.11446317657497782,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011904833232019036,
+      "loss": 1.8567,
+      "step": 903
+    },
+    {
+      "epoch": 0.1145899353530232,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011904427186226365,
+      "loss": 1.9942,
+      "step": 904
+    },
+    {
+      "epoch": 0.11471669413106858,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011904020282998028,
+      "loss": 2.0263,
+      "step": 905
+    },
+    {
+      "epoch": 0.11484345290911395,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011903612522393114,
+      "loss": 2.0286,
+      "step": 906
+    },
+    {
+      "epoch": 0.11497021168715933,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011903203904470837,
+      "loss": 2.0145,
+      "step": 907
+    },
+    {
+      "epoch": 0.11509697046520471,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011902794429290535,
+      "loss": 1.6808,
+      "step": 908
+    },
+    {
+      "epoch": 0.1152237292432501,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011902384096911677,
+      "loss": 2.4509,
+      "step": 909
+    },
+    {
+      "epoch": 0.11535048802129548,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011901972907393845,
+      "loss": 2.5562,
+      "step": 910
+    },
+    {
+      "epoch": 0.11547724679934085,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011901560860796754,
+      "loss": 2.1583,
+      "step": 911
+    },
+    {
+      "epoch": 0.11560400557738623,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011901147957180243,
+      "loss": 2.1251,
+      "step": 912
+    },
+    {
+      "epoch": 0.11573076435543161,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011900734196604268,
+      "loss": 1.7442,
+      "step": 913
+    },
+    {
+      "epoch": 0.115857523133477,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011900319579128921,
+      "loss": 2.1894,
+      "step": 914
+    },
+    {
+      "epoch": 0.11598428191152237,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001189990410481441,
+      "loss": 1.6838,
+      "step": 915
+    },
+    {
+      "epoch": 0.11611104068956775,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011899487773721069,
+      "loss": 1.7357,
+      "step": 916
+    },
+    {
+      "epoch": 0.11623779946761313,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011899070585909357,
+      "loss": 2.0866,
+      "step": 917
+    },
+    {
+      "epoch": 0.11636455824565851,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001189865254143986,
+      "loss": 2.235,
+      "step": 918
+    },
+    {
+      "epoch": 0.1164913170237039,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011898233640373284,
+      "loss": 2.0509,
+      "step": 919
+    },
+    {
+      "epoch": 0.11661807580174927,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0001189781388277046,
+      "loss": 2.0683,
+      "step": 920
+    },
+    {
+      "epoch": 0.11674483457979465,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011897393268692349,
+      "loss": 1.9045,
+      "step": 921
+    },
+    {
+      "epoch": 0.11687159335784003,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0001189697179820003,
+      "loss": 2.1663,
+      "step": 922
+    },
+    {
+      "epoch": 0.11699835213588541,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011896549471354708,
+      "loss": 1.9907,
+      "step": 923
+    },
+    {
+      "epoch": 0.11712511091393078,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011896126288217714,
+      "loss": 2.1921,
+      "step": 924
+    },
+    {
+      "epoch": 0.11725186969197617,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011895702248850502,
+      "loss": 2.297,
+      "step": 925
+    },
+    {
+      "epoch": 0.11737862847002155,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0001189527735331465,
+      "loss": 2.1541,
+      "step": 926
+    },
+    {
+      "epoch": 0.11750538724806693,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011894851601671861,
+      "loss": 2.194,
+      "step": 927
+    },
+    {
+      "epoch": 0.11763214602611231,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011894424993983962,
+      "loss": 1.8901,
+      "step": 928
+    },
+    {
+      "epoch": 0.11775890480415768,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011893997530312906,
+      "loss": 1.9794,
+      "step": 929
+    },
+    {
+      "epoch": 0.11788566358220307,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00011893569210720768,
+      "loss": 3.0634,
+      "step": 930
+    },
+    {
+      "epoch": 0.11801242236024845,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00011893140035269749,
+      "loss": 2.7166,
+      "step": 931
+    },
+    {
+      "epoch": 0.11813918113829383,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011892710004022173,
+      "loss": 2.2892,
+      "step": 932
+    },
+    {
+      "epoch": 0.11826593991633921,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011892279117040488,
+      "loss": 2.1269,
+      "step": 933
+    },
+    {
+      "epoch": 0.11839269869438458,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011891847374387267,
+      "loss": 2.269,
+      "step": 934
+    },
+    {
+      "epoch": 0.11851945747242996,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0001189141477612521,
+      "loss": 1.9353,
+      "step": 935
+    },
+    {
+      "epoch": 0.11864621625047535,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011890981322317133,
+      "loss": 1.5698,
+      "step": 936
+    },
+    {
+      "epoch": 0.11877297502852073,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011890547013025986,
+      "loss": 2.4581,
+      "step": 937
+    },
+    {
+      "epoch": 0.1188997338065661,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0001189011184831484,
+      "loss": 2.5244,
+      "step": 938
+    },
+    {
+      "epoch": 0.11902649258461148,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011889675828246886,
+      "loss": 2.262,
+      "step": 939
+    },
+    {
+      "epoch": 0.11915325136265686,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011889238952885444,
+      "loss": 2.3245,
+      "step": 940
+    },
+    {
+      "epoch": 0.11928001014070225,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011888801222293957,
+      "loss": 2.4705,
+      "step": 941
+    },
+    {
+      "epoch": 0.11940676891874763,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011888362636535991,
+      "loss": 1.9687,
+      "step": 942
+    },
+    {
+      "epoch": 0.119533527696793,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011887923195675237,
+      "loss": 2.3092,
+      "step": 943
+    },
+    {
+      "epoch": 0.11966028647483838,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011887482899775511,
+      "loss": 1.6603,
+      "step": 944
+    },
+    {
+      "epoch": 0.11978704525288376,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011887041748900754,
+      "loss": 1.8861,
+      "step": 945
+    },
+    {
+      "epoch": 0.11991380403092915,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.00011886599743115027,
+      "loss": 2.3836,
+      "step": 946
+    },
+    {
+      "epoch": 0.12004056280897452,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011886156882482515,
+      "loss": 2.3982,
+      "step": 947
+    },
+    {
+      "epoch": 0.1201673215870199,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011885713167067536,
+      "loss": 1.7543,
+      "step": 948
+    },
+    {
+      "epoch": 0.12029408036506528,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011885268596934522,
+      "loss": 2.0612,
+      "step": 949
+    },
+    {
+      "epoch": 0.12042083914311066,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011884823172148033,
+      "loss": 2.6011,
+      "step": 950
+    },
+    {
+      "epoch": 0.12054759792115605,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011884376892772756,
+      "loss": 2.3846,
+      "step": 951
+    },
+    {
+      "epoch": 0.12067435669920142,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011883929758873495,
+      "loss": 2.112,
+      "step": 952
+    },
+    {
+      "epoch": 0.1208011154772468,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011883481770515186,
+      "loss": 2.3332,
+      "step": 953
+    },
+    {
+      "epoch": 0.12092787425529218,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011883032927762887,
+      "loss": 2.358,
+      "step": 954
+    },
+    {
+      "epoch": 0.12105463303333756,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011882583230681773,
+      "loss": 2.8022,
+      "step": 955
+    },
+    {
+      "epoch": 0.12118139181138293,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011882132679337154,
+      "loss": 2.041,
+      "step": 956
+    },
+    {
+      "epoch": 0.12130815058942832,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011881681273794454,
+      "loss": 2.3209,
+      "step": 957
+    },
+    {
+      "epoch": 0.1214349093674737,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0001188122901411923,
+      "loss": 2.1664,
+      "step": 958
+    },
+    {
+      "epoch": 0.12156166814551908,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011880775900377155,
+      "loss": 2.1146,
+      "step": 959
+    },
+    {
+      "epoch": 0.12168842692356446,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011880321932634031,
+      "loss": 2.1281,
+      "step": 960
+    },
+    {
+      "epoch": 0.12181518570160983,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011879867110955784,
+      "loss": 2.0122,
+      "step": 961
+    },
+    {
+      "epoch": 0.12194194447965521,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011879411435408465,
+      "loss": 2.1311,
+      "step": 962
+    },
+    {
+      "epoch": 0.1220687032577006,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011878954906058239,
+      "loss": 2.6127,
+      "step": 963
+    },
+    {
+      "epoch": 0.12219546203574598,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0001187849752297141,
+      "loss": 2.0978,
+      "step": 964
+    },
+    {
+      "epoch": 0.12232222081379135,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011878039286214396,
+      "loss": 1.7482,
+      "step": 965
+    },
+    {
+      "epoch": 0.12244897959183673,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011877580195853745,
+      "loss": 1.8181,
+      "step": 966
+    },
+    {
+      "epoch": 0.12257573836988211,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011877120251956118,
+      "loss": 2.082,
+      "step": 967
+    },
+    {
+      "epoch": 0.1227024971479275,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011876659454588316,
+      "loss": 2.0985,
+      "step": 968
+    },
+    {
+      "epoch": 0.12282925592597288,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0001187619780381725,
+      "loss": 2.2291,
+      "step": 969
+    },
+    {
+      "epoch": 0.12295601470401825,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00011875735299709962,
+      "loss": 2.54,
+      "step": 970
+    },
+    {
+      "epoch": 0.12308277348206363,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011875271942333619,
+      "loss": 2.2419,
+      "step": 971
+    },
+    {
+      "epoch": 0.12320953226010901,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011874807731755508,
+      "loss": 2.433,
+      "step": 972
+    },
+    {
+      "epoch": 0.1233362910381544,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011874342668043039,
+      "loss": 2.8132,
+      "step": 973
+    },
+    {
+      "epoch": 0.12346304981619977,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001187387675126375,
+      "loss": 1.6897,
+      "step": 974
+    },
+    {
+      "epoch": 0.12358980859424515,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011873409981485302,
+      "loss": 2.1097,
+      "step": 975
+    },
+    {
+      "epoch": 0.12371656737229053,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011872942358775475,
+      "loss": 2.0381,
+      "step": 976
+    },
+    {
+      "epoch": 0.12384332615033591,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011872473883202182,
+      "loss": 1.7534,
+      "step": 977
+    },
+    {
+      "epoch": 0.1239700849283813,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001187200455483345,
+      "loss": 2.208,
+      "step": 978
+    },
+    {
+      "epoch": 0.12409684370642667,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011871534373737438,
+      "loss": 1.5478,
+      "step": 979
+    },
+    {
+      "epoch": 0.12422360248447205,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011871063339982424,
+      "loss": 2.1046,
+      "step": 980
+    },
+    {
+      "epoch": 0.12435036126251743,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001187059145363681,
+      "loss": 2.331,
+      "step": 981
+    },
+    {
+      "epoch": 0.12447712004056281,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011870118714769123,
+      "loss": 2.9785,
+      "step": 982
+    },
+    {
+      "epoch": 0.12460387881860818,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011869645123448015,
+      "loss": 2.5089,
+      "step": 983
+    },
+    {
+      "epoch": 0.12473063759665357,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011869170679742261,
+      "loss": 2.3834,
+      "step": 984
+    },
+    {
+      "epoch": 0.12485739637469895,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011868695383720758,
+      "loss": 2.3377,
+      "step": 985
+    },
+    {
+      "epoch": 0.12498415515274433,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011868219235452527,
+      "loss": 1.8265,
+      "step": 986
+    },
+    {
+      "epoch": 0.1251109139307897,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011867742235006717,
+      "loss": 2.054,
+      "step": 987
+    },
+    {
+      "epoch": 0.1252376727088351,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011867264382452595,
+      "loss": 1.718,
+      "step": 988
+    },
+    {
+      "epoch": 0.12536443148688048,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011866785677859555,
+      "loss": 1.9682,
+      "step": 989
+    },
+    {
+      "epoch": 0.12549119026492583,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011866306121297115,
+      "loss": 1.7012,
+      "step": 990
+    },
+    {
+      "epoch": 0.12561794904297122,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011865825712834912,
+      "loss": 2.1602,
+      "step": 991
+    },
+    {
+      "epoch": 0.1257447078210166,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011865344452542716,
+      "loss": 1.7084,
+      "step": 992
+    },
+    {
+      "epoch": 0.12587146659906198,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00011864862340490413,
+      "loss": 2.1667,
+      "step": 993
+    },
+    {
+      "epoch": 0.12599822537710736,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011864379376748013,
+      "loss": 2.05,
+      "step": 994
+    },
+    {
+      "epoch": 0.12612498415515275,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011863895561385653,
+      "loss": 2.0009,
+      "step": 995
+    },
+    {
+      "epoch": 0.12625174293319813,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011863410894473594,
+      "loss": 1.8643,
+      "step": 996
+    },
+    {
+      "epoch": 0.1263785017112435,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011862925376082218,
+      "loss": 1.9145,
+      "step": 997
+    },
+    {
+      "epoch": 0.1265052604892889,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011862439006282028,
+      "loss": 1.9072,
+      "step": 998
+    },
+    {
+      "epoch": 0.12663201926733425,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011861951785143657,
+      "loss": 2.0977,
+      "step": 999
+    },
+    {
+      "epoch": 0.12675877804537963,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011861463712737859,
+      "loss": 2.2561,
+      "step": 1000
+    },
+    {
+      "epoch": 0.12688553682342502,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011860974789135512,
+      "loss": 2.7327,
+      "step": 1001
+    },
+    {
+      "epoch": 0.1270122956014704,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011860485014407617,
+      "loss": 2.2819,
+      "step": 1002
+    },
+    {
+      "epoch": 0.12713905437951578,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011859994388625296,
+      "loss": 2.5569,
+      "step": 1003
+    },
+    {
+      "epoch": 0.12726581315756116,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011859502911859801,
+      "loss": 2.0616,
+      "step": 1004
+    },
+    {
+      "epoch": 0.12739257193560655,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.000118590105841825,
+      "loss": 1.8455,
+      "step": 1005
+    },
+    {
+      "epoch": 0.12751933071365193,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011858517405664891,
+      "loss": 2.0145,
+      "step": 1006
+    },
+    {
+      "epoch": 0.1276460894916973,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011858023376378592,
+      "loss": 2.0677,
+      "step": 1007
+    },
+    {
+      "epoch": 0.12777284826974267,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011857528496395348,
+      "loss": 2.1184,
+      "step": 1008
+    },
+    {
+      "epoch": 0.12789960704778805,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011857032765787021,
+      "loss": 2.2325,
+      "step": 1009
+    },
+    {
+      "epoch": 0.12802636582583343,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011856536184625603,
+      "loss": 2.4459,
+      "step": 1010
+    },
+    {
+      "epoch": 0.12815312460387882,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011856038752983207,
+      "loss": 1.7825,
+      "step": 1011
+    },
+    {
+      "epoch": 0.1282798833819242,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011855540470932067,
+      "loss": 2.0643,
+      "step": 1012
+    },
+    {
+      "epoch": 0.12840664215996958,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011855041338544546,
+      "loss": 1.8856,
+      "step": 1013
+    },
+    {
+      "epoch": 0.12853340093801496,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011854541355893127,
+      "loss": 1.6081,
+      "step": 1014
+    },
+    {
+      "epoch": 0.12866015971606035,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011854040523050417,
+      "loss": 1.9381,
+      "step": 1015
+    },
+    {
+      "epoch": 0.12878691849410573,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011853538840089145,
+      "loss": 2.2497,
+      "step": 1016
+    },
+    {
+      "epoch": 0.12891367727215108,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011853036307082166,
+      "loss": 2.2958,
+      "step": 1017
+    },
+    {
+      "epoch": 0.12904043605019647,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011852532924102459,
+      "loss": 1.8932,
+      "step": 1018
+    },
+    {
+      "epoch": 0.12916719482824185,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011852028691223123,
+      "loss": 2.3335,
+      "step": 1019
+    },
+    {
+      "epoch": 0.12929395360628723,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011851523608517384,
+      "loss": 1.9809,
+      "step": 1020
+    },
+    {
+      "epoch": 0.12942071238433261,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011851017676058585,
+      "loss": 1.6886,
+      "step": 1021
+    },
+    {
+      "epoch": 0.129547471162378,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011850510893920202,
+      "loss": 2.5225,
+      "step": 1022
+    },
+    {
+      "epoch": 0.12967422994042338,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011850003262175829,
+      "loss": 2.3927,
+      "step": 1023
+    },
+    {
+      "epoch": 0.12980098871846876,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011849494780899181,
+      "loss": 2.3379,
+      "step": 1024
+    },
+    {
+      "epoch": 0.12992774749651415,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000118489854501641,
+      "loss": 2.1308,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1300545062745595,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011848475270044554,
+      "loss": 2.0078,
+      "step": 1026
+    },
+    {
+      "epoch": 0.13018126505260488,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011847964240614627,
+      "loss": 2.0975,
+      "step": 1027
+    },
+    {
+      "epoch": 0.13030802383065027,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011847452361948531,
+      "loss": 2.3797,
+      "step": 1028
+    },
+    {
+      "epoch": 0.13043478260869565,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011846939634120604,
+      "loss": 2.2901,
+      "step": 1029
+    },
+    {
+      "epoch": 0.13056154138674103,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011846426057205297,
+      "loss": 2.0321,
+      "step": 1030
+    },
+    {
+      "epoch": 0.13068830016478641,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011845911631277197,
+      "loss": 2.3257,
+      "step": 1031
+    },
+    {
+      "epoch": 0.1308150589428318,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011845396356411007,
+      "loss": 2.2145,
+      "step": 1032
+    },
+    {
+      "epoch": 0.13094181772087718,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011844880232681553,
+      "loss": 1.8059,
+      "step": 1033
+    },
+    {
+      "epoch": 0.13106857649892256,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011844363260163788,
+      "loss": 2.2591,
+      "step": 1034
+    },
+    {
+      "epoch": 0.13119533527696792,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011843845438932787,
+      "loss": 1.7012,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1313220940550133,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011843326769063743,
+      "loss": 2.4046,
+      "step": 1036
+    },
+    {
+      "epoch": 0.13144885283305868,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011842807250631985,
+      "loss": 1.6827,
+      "step": 1037
+    },
+    {
+      "epoch": 0.13157561161110407,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011842286883712948,
+      "loss": 1.6645,
+      "step": 1038
+    },
+    {
+      "epoch": 0.13170237038914945,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011841765668382204,
+      "loss": 2.2063,
+      "step": 1039
+    },
+    {
+      "epoch": 0.13182912916719483,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011841243604715444,
+      "loss": 2.4301,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1319558879452402,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011840720692788479,
+      "loss": 1.7588,
+      "step": 1041
+    },
+    {
+      "epoch": 0.1320826467232856,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011840196932677247,
+      "loss": 2.3297,
+      "step": 1042
+    },
+    {
+      "epoch": 0.13220940550133098,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0001183967232445781,
+      "loss": 2.1185,
+      "step": 1043
+    },
+    {
+      "epoch": 0.13233616427937633,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011839146868206348,
+      "loss": 1.351,
+      "step": 1044
+    },
+    {
+      "epoch": 0.13246292305742172,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011838620563999168,
+      "loss": 1.7055,
+      "step": 1045
+    },
+    {
+      "epoch": 0.1325896818354671,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011838093411912701,
+      "loss": 1.7632,
+      "step": 1046
+    },
+    {
+      "epoch": 0.13271644061351248,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011837565412023498,
+      "loss": 2.445,
+      "step": 1047
+    },
+    {
+      "epoch": 0.13284319939155786,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011837036564408236,
+      "loss": 1.59,
+      "step": 1048
+    },
+    {
+      "epoch": 0.13296995816960325,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001183650686914371,
+      "loss": 2.0756,
+      "step": 1049
+    },
+    {
+      "epoch": 0.13309671694764863,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011835976326306847,
+      "loss": 2.0833,
+      "step": 1050
+    },
+    {
+      "epoch": 0.133223475725694,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011835444935974691,
+      "loss": 2.2792,
+      "step": 1051
+    },
+    {
+      "epoch": 0.1333502345037394,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011834912698224406,
+      "loss": 1.9627,
+      "step": 1052
+    },
+    {
+      "epoch": 0.13347699328178475,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011834379613133287,
+      "loss": 1.5645,
+      "step": 1053
+    },
+    {
+      "epoch": 0.13360375205983013,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011833845680778748,
+      "loss": 1.8105,
+      "step": 1054
+    },
+    {
+      "epoch": 0.13373051083787552,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011833310901238326,
+      "loss": 2.5772,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1338572696159209,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011832775274589678,
+      "loss": 2.4272,
+      "step": 1056
+    },
+    {
+      "epoch": 0.13398402839396628,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011832238800910591,
+      "loss": 1.9244,
+      "step": 1057
+    },
+    {
+      "epoch": 0.13411078717201166,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011831701480278969,
+      "loss": 2.0246,
+      "step": 1058
+    },
+    {
+      "epoch": 0.13423754595005705,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011831163312772844,
+      "loss": 2.3409,
+      "step": 1059
+    },
+    {
+      "epoch": 0.13436430472810243,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011830624298470365,
+      "loss": 2.0585,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1344910635061478,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011830084437449809,
+      "loss": 2.0481,
+      "step": 1061
+    },
+    {
+      "epoch": 0.13461782228419317,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011829543729789573,
+      "loss": 2.2935,
+      "step": 1062
+    },
+    {
+      "epoch": 0.13474458106223855,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011829002175568179,
+      "loss": 2.4912,
+      "step": 1063
+    },
+    {
+      "epoch": 0.13487133984028393,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011828459774864272,
+      "loss": 2.2551,
+      "step": 1064
+    },
+    {
+      "epoch": 0.13499809861832932,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011827916527756617,
+      "loss": 1.6272,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1351248573963747,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011827372434324102,
+      "loss": 2.2448,
+      "step": 1066
+    },
+    {
+      "epoch": 0.13525161617442008,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011826827494645745,
+      "loss": 1.815,
+      "step": 1067
+    },
+    {
+      "epoch": 0.13537837495246546,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011826281708800679,
+      "loss": 2.3024,
+      "step": 1068
+    },
+    {
+      "epoch": 0.13550513373051085,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011825735076868163,
+      "loss": 1.9046,
+      "step": 1069
+    },
+    {
+      "epoch": 0.13563189250855623,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011825187598927576,
+      "loss": 1.8616,
+      "step": 1070
+    },
+    {
+      "epoch": 0.13575865128660158,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011824639275058424,
+      "loss": 2.5407,
+      "step": 1071
+    },
+    {
+      "epoch": 0.13588541006464697,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011824090105340336,
+      "loss": 1.4736,
+      "step": 1072
+    },
+    {
+      "epoch": 0.13601216884269235,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011823540089853061,
+      "loss": 1.7291,
+      "step": 1073
+    },
+    {
+      "epoch": 0.13613892762073773,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001182298922867647,
+      "loss": 1.4393,
+      "step": 1074
+    },
+    {
+      "epoch": 0.13626568639878311,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011822437521890559,
+      "loss": 1.6238,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1363924451768285,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011821884969575446,
+      "loss": 2.4057,
+      "step": 1076
+    },
+    {
+      "epoch": 0.13651920395487388,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011821331571811375,
+      "loss": 2.2121,
+      "step": 1077
+    },
+    {
+      "epoch": 0.13664596273291926,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011820777328678706,
+      "loss": 1.858,
+      "step": 1078
+    },
+    {
+      "epoch": 0.13677272151096465,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001182022224025793,
+      "loss": 2.3449,
+      "step": 1079
+    },
+    {
+      "epoch": 0.13689948028901,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011819666306629652,
+      "loss": 1.7973,
+      "step": 1080
+    },
+    {
+      "epoch": 0.13702623906705538,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011819109527874608,
+      "loss": 2.4057,
+      "step": 1081
+    },
+    {
+      "epoch": 0.13715299784510077,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011818551904073652,
+      "loss": 1.5188,
+      "step": 1082
+    },
+    {
+      "epoch": 0.13727975662314615,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011817993435307758,
+      "loss": 2.4068,
+      "step": 1083
+    },
+    {
+      "epoch": 0.13740651540119153,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011817434121658032,
+      "loss": 1.9747,
+      "step": 1084
+    },
+    {
+      "epoch": 0.13753327417923691,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011816873963205692,
+      "loss": 1.7333,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1376600329572823,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011816312960032089,
+      "loss": 1.8735,
+      "step": 1086
+    },
+    {
+      "epoch": 0.13778679173532768,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011815751112218687,
+      "loss": 2.4663,
+      "step": 1087
+    },
+    {
+      "epoch": 0.13791355051337306,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011815188419847078,
+      "loss": 1.8528,
+      "step": 1088
+    },
+    {
+      "epoch": 0.13804030929141842,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011814624882998975,
+      "loss": 1.8944,
+      "step": 1089
+    },
+    {
+      "epoch": 0.1381670680694638,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011814060501756216,
+      "loss": 1.4727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.13829382684750918,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0001181349527620076,
+      "loss": 1.9502,
+      "step": 1091
+    },
+    {
+      "epoch": 0.13842058562555457,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011812929206414688,
+      "loss": 1.8005,
+      "step": 1092
+    },
+    {
+      "epoch": 0.13854734440359995,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011812362292480204,
+      "loss": 1.675,
+      "step": 1093
+    },
+    {
+      "epoch": 0.13867410318164533,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00011811794534479633,
+      "loss": 2.5253,
+      "step": 1094
+    },
+    {
+      "epoch": 0.1388008619596907,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011811225932495428,
+      "loss": 1.7797,
+      "step": 1095
+    },
+    {
+      "epoch": 0.1389276207377361,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001181065648661016,
+      "loss": 1.9547,
+      "step": 1096
+    },
+    {
+      "epoch": 0.13905437951578148,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0001181008619690652,
+      "loss": 2.0714,
+      "step": 1097
+    },
+    {
+      "epoch": 0.13918113829382683,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011809515063467329,
+      "loss": 1.884,
+      "step": 1098
+    },
+    {
+      "epoch": 0.13930789707187222,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011808943086375524,
+      "loss": 1.9434,
+      "step": 1099
+    },
+    {
+      "epoch": 0.1394346558499176,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011808370265714169,
+      "loss": 1.551,
+      "step": 1100
+    },
+    {
+      "epoch": 0.13956141462796298,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011807796601566446,
+      "loss": 1.7796,
+      "step": 1101
+    },
+    {
+      "epoch": 0.13968817340600836,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011807222094015664,
+      "loss": 1.8309,
+      "step": 1102
+    },
+    {
+      "epoch": 0.13981493218405375,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011806646743145252,
+      "loss": 1.9375,
+      "step": 1103
+    },
+    {
+      "epoch": 0.13994169096209913,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001180607054903876,
+      "loss": 2.7575,
+      "step": 1104
+    },
+    {
+      "epoch": 0.1400684497401445,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011805493511779867,
+      "loss": 2.1476,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1401952085181899,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011804915631452366,
+      "loss": 2.0541,
+      "step": 1106
+    },
+    {
+      "epoch": 0.14032196729623525,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011804336908140176,
+      "loss": 2.0684,
+      "step": 1107
+    },
+    {
+      "epoch": 0.14044872607428063,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0001180375734192734,
+      "loss": 1.6989,
+      "step": 1108
+    },
+    {
+      "epoch": 0.14057548485232602,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011803176932898024,
+      "loss": 2.156,
+      "step": 1109
+    },
+    {
+      "epoch": 0.1407022436303714,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0001180259568113651,
+      "loss": 1.7985,
+      "step": 1110
+    },
+    {
+      "epoch": 0.14082900240841678,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011802013586727213,
+      "loss": 1.9365,
+      "step": 1111
+    },
+    {
+      "epoch": 0.14095576118646216,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0001180143064975466,
+      "loss": 1.6625,
+      "step": 1112
+    },
+    {
+      "epoch": 0.14108251996450755,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011800846870303501,
+      "loss": 2.0967,
+      "step": 1113
+    },
+    {
+      "epoch": 0.14120927874255293,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00011800262248458521,
+      "loss": 2.0942,
+      "step": 1114
+    },
+    {
+      "epoch": 0.1413360375205983,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011799676784304612,
+      "loss": 2.1922,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1414627962986437,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011799090477926795,
+      "loss": 1.8842,
+      "step": 1116
+    },
+    {
+      "epoch": 0.14158955507668905,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011798503329410214,
+      "loss": 2.5449,
+      "step": 1117
+    },
+    {
+      "epoch": 0.14171631385473443,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011797915338840135,
+      "loss": 2.113,
+      "step": 1118
+    },
+    {
+      "epoch": 0.14184307263277982,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011797326506301943,
+      "loss": 1.9245,
+      "step": 1119
+    },
+    {
+      "epoch": 0.1419698314108252,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011796736831881152,
+      "loss": 1.8401,
+      "step": 1120
+    },
+    {
+      "epoch": 0.14209659018887058,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011796146315663389,
+      "loss": 1.7462,
+      "step": 1121
+    },
+    {
+      "epoch": 0.14222334896691596,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0001179555495773441,
+      "loss": 1.9618,
+      "step": 1122
+    },
+    {
+      "epoch": 0.14235010774496135,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011794962758180092,
+      "loss": 1.9509,
+      "step": 1123
+    },
+    {
+      "epoch": 0.14247686652300673,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011794369717086435,
+      "loss": 2.2101,
+      "step": 1124
+    },
+    {
+      "epoch": 0.1426036253010521,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0001179377583453956,
+      "loss": 2.23,
+      "step": 1125
+    },
+    {
+      "epoch": 0.14273038407909747,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011793181110625706,
+      "loss": 2.0891,
+      "step": 1126
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011792585545431243,
+      "loss": 2.3124,
+      "step": 1127
+    },
+    {
+      "epoch": 0.14298390163518823,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011791989139042655,
+      "loss": 2.303,
+      "step": 1128
+    },
+    {
+      "epoch": 0.14311066041323361,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00011791391891546554,
+      "loss": 1.9508,
+      "step": 1129
+    },
+    {
+      "epoch": 0.143237419191279,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0001179079380302967,
+      "loss": 1.7895,
+      "step": 1130
+    },
+    {
+      "epoch": 0.14336417796932438,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00011790194873578857,
+      "loss": 1.5249,
+      "step": 1131
+    },
+    {
+      "epoch": 0.14349093674736976,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011789595103281093,
+      "loss": 1.668,
+      "step": 1132
+    },
+    {
+      "epoch": 0.14361769552541515,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011788994492223476,
+      "loss": 2.4371,
+      "step": 1133
+    },
+    {
+      "epoch": 0.14374445430346053,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011788393040493223,
+      "loss": 1.8612,
+      "step": 1134
+    },
+    {
+      "epoch": 0.14387121308150588,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011787790748177679,
+      "loss": 1.727,
+      "step": 1135
+    },
+    {
+      "epoch": 0.14399797185955127,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011787187615364307,
+      "loss": 2.2304,
+      "step": 1136
+    },
+    {
+      "epoch": 0.14412473063759665,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011786583642140695,
+      "loss": 2.2087,
+      "step": 1137
+    },
+    {
+      "epoch": 0.14425148941564203,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011785978828594547,
+      "loss": 2.4493,
+      "step": 1138
+    },
+    {
+      "epoch": 0.14437824819368741,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011785373174813702,
+      "loss": 2.0195,
+      "step": 1139
+    },
+    {
+      "epoch": 0.1445050069717328,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011784766680886104,
+      "loss": 1.9201,
+      "step": 1140
+    },
+    {
+      "epoch": 0.14463176574977818,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011784159346899832,
+      "loss": 2.634,
+      "step": 1141
+    },
+    {
+      "epoch": 0.14475852452782356,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011783551172943083,
+      "loss": 1.7474,
+      "step": 1142
+    },
+    {
+      "epoch": 0.14488528330586894,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011782942159104171,
+      "loss": 2.0539,
+      "step": 1143
+    },
+    {
+      "epoch": 0.1450120420839143,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011782332305471541,
+      "loss": 1.8865,
+      "step": 1144
+    },
+    {
+      "epoch": 0.14513880086195968,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011781721612133753,
+      "loss": 1.7505,
+      "step": 1145
+    },
+    {
+      "epoch": 0.14526555964000507,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011781110079179493,
+      "loss": 1.9142,
+      "step": 1146
+    },
+    {
+      "epoch": 0.14539231841805045,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011780497706697568,
+      "loss": 1.8523,
+      "step": 1147
+    },
+    {
+      "epoch": 0.14551907719609583,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011779884494776902,
+      "loss": 2.0329,
+      "step": 1148
+    },
+    {
+      "epoch": 0.1456458359741412,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011779270443506548,
+      "loss": 1.6421,
+      "step": 1149
+    },
+    {
+      "epoch": 0.1457725947521866,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0001177865555297568,
+      "loss": 2.0415,
+      "step": 1150
+    },
+    {
+      "epoch": 0.14589935353023198,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011778039823273588,
+      "loss": 1.8415,
+      "step": 1151
+    },
+    {
+      "epoch": 0.14602611230827736,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00011777423254489689,
+      "loss": 2.1517,
+      "step": 1152
+    },
+    {
+      "epoch": 0.14615287108632272,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011776805846713521,
+      "loss": 1.7325,
+      "step": 1153
+    },
+    {
+      "epoch": 0.1462796298643681,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011776187600034743,
+      "loss": 2.2662,
+      "step": 1154
+    },
+    {
+      "epoch": 0.14640638864241348,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011775568514543137,
+      "loss": 2.2717,
+      "step": 1155
+    },
+    {
+      "epoch": 0.14653314742045886,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011774948590328605,
+      "loss": 1.9863,
+      "step": 1156
+    },
+    {
+      "epoch": 0.14665990619850425,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011774327827481174,
+      "loss": 1.915,
+      "step": 1157
+    },
+    {
+      "epoch": 0.14678666497654963,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011773706226090986,
+      "loss": 2.0438,
+      "step": 1158
+    },
+    {
+      "epoch": 0.146913423754595,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011773083786248314,
+      "loss": 1.9095,
+      "step": 1159
+    },
+    {
+      "epoch": 0.1470401825326404,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011772460508043546,
+      "loss": 2.2996,
+      "step": 1160
+    },
+    {
+      "epoch": 0.14716694131068578,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011771836391567194,
+      "loss": 2.0049,
+      "step": 1161
+    },
+    {
+      "epoch": 0.14729370008873113,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011771211436909895,
+      "loss": 1.7218,
+      "step": 1162
+    },
+    {
+      "epoch": 0.14742045886677652,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011770585644162399,
+      "loss": 1.4467,
+      "step": 1163
+    },
+    {
+      "epoch": 0.1475472176448219,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011769959013415586,
+      "loss": 1.9326,
+      "step": 1164
+    },
+    {
+      "epoch": 0.14767397642286728,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00011769331544760455,
+      "loss": 2.2604,
+      "step": 1165
+    },
+    {
+      "epoch": 0.14780073520091266,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011768703238288125,
+      "loss": 2.1964,
+      "step": 1166
+    },
+    {
+      "epoch": 0.14792749397895805,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011768074094089838,
+      "loss": 1.799,
+      "step": 1167
+    },
+    {
+      "epoch": 0.14805425275700343,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001176744411225696,
+      "loss": 1.8015,
+      "step": 1168
+    },
+    {
+      "epoch": 0.1481810115350488,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011766813292880974,
+      "loss": 2.1253,
+      "step": 1169
+    },
+    {
+      "epoch": 0.1483077703130942,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011766181636053489,
+      "loss": 2.204,
+      "step": 1170
+    },
+    {
+      "epoch": 0.14843452909113955,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011765549141866233,
+      "loss": 1.8661,
+      "step": 1171
+    },
+    {
+      "epoch": 0.14856128786918493,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00011764915810411054,
+      "loss": 2.2793,
+      "step": 1172
+    },
+    {
+      "epoch": 0.14868804664723032,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011764281641779927,
+      "loss": 3.2794,
+      "step": 1173
+    },
+    {
+      "epoch": 0.1488148054252757,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011763646636064944,
+      "loss": 1.6779,
+      "step": 1174
+    },
+    {
+      "epoch": 0.14894156420332108,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001176301079335832,
+      "loss": 2.0053,
+      "step": 1175
+    },
+    {
+      "epoch": 0.14906832298136646,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011762374113752392,
+      "loss": 2.0889,
+      "step": 1176
+    },
+    {
+      "epoch": 0.14919508175941185,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011761736597339618,
+      "loss": 2.628,
+      "step": 1177
+    },
+    {
+      "epoch": 0.14932184053745723,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011761098244212576,
+      "loss": 1.6363,
+      "step": 1178
+    },
+    {
+      "epoch": 0.1494485993155026,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0001176045905446397,
+      "loss": 1.8584,
+      "step": 1179
+    },
+    {
+      "epoch": 0.14957535809354797,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00011759819028186619,
+      "loss": 2.1749,
+      "step": 1180
+    },
+    {
+      "epoch": 0.14970211687159335,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011759178165473469,
+      "loss": 1.6844,
+      "step": 1181
+    },
+    {
+      "epoch": 0.14982887564963873,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011758536466417587,
+      "loss": 2.3646,
+      "step": 1182
+    },
+    {
+      "epoch": 0.14995563442768411,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011757893931112156,
+      "loss": 2.1912,
+      "step": 1183
+    },
+    {
+      "epoch": 0.1500823932057295,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001175725055965049,
+      "loss": 2.5788,
+      "step": 1184
+    },
+    {
+      "epoch": 0.15020915198377488,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011756606352126013,
+      "loss": 2.6087,
+      "step": 1185
+    },
+    {
+      "epoch": 0.15033591076182026,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0001175596130863228,
+      "loss": 1.5276,
+      "step": 1186
+    },
+    {
+      "epoch": 0.15046266953986565,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011755315429262962,
+      "loss": 1.7347,
+      "step": 1187
+    },
+    {
+      "epoch": 0.15058942831791103,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011754668714111856,
+      "loss": 1.7463,
+      "step": 1188
+    },
+    {
+      "epoch": 0.15071618709595638,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0001175402116327287,
+      "loss": 2.1952,
+      "step": 1189
+    },
+    {
+      "epoch": 0.15084294587400177,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001175337277684005,
+      "loss": 2.3937,
+      "step": 1190
+    },
+    {
+      "epoch": 0.15096970465204715,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011752723554907549,
+      "loss": 2.3407,
+      "step": 1191
+    },
+    {
+      "epoch": 0.15109646343009253,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011752073497569647,
+      "loss": 2.4135,
+      "step": 1192
+    },
+    {
+      "epoch": 0.15122322220813791,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011751422604920744,
+      "loss": 2.067,
+      "step": 1193
+    },
+    {
+      "epoch": 0.1513499809861833,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011750770877055364,
+      "loss": 1.8038,
+      "step": 1194
+    },
+    {
+      "epoch": 0.15147673976422868,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011750118314068149,
+      "loss": 2.1027,
+      "step": 1195
+    },
+    {
+      "epoch": 0.15160349854227406,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011749464916053865,
+      "loss": 2.0465,
+      "step": 1196
+    },
+    {
+      "epoch": 0.15173025732031944,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011748810683107395,
+      "loss": 2.0565,
+      "step": 1197
+    },
+    {
+      "epoch": 0.1518570160983648,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001174815561532375,
+      "loss": 1.9503,
+      "step": 1198
+    },
+    {
+      "epoch": 0.15198377487641018,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011747499712798056,
+      "loss": 2.2056,
+      "step": 1199
+    },
+    {
+      "epoch": 0.15211053365445557,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011746842975625561,
+      "loss": 1.8435,
+      "step": 1200
+    },
+    {
+      "epoch": 0.15223729243250095,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011746185403901639,
+      "loss": 2.2389,
+      "step": 1201
+    },
+    {
+      "epoch": 0.15236405121054633,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011745526997721782,
+      "loss": 2.3989,
+      "step": 1202
+    },
+    {
+      "epoch": 0.1524908099885917,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000117448677571816,
+      "loss": 2.0868,
+      "step": 1203
+    },
+    {
+      "epoch": 0.1526175687666371,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011744207682376831,
+      "loss": 1.8994,
+      "step": 1204
+    },
+    {
+      "epoch": 0.15274432754468248,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011743546773403327,
+      "loss": 2.3933,
+      "step": 1205
+    },
+    {
+      "epoch": 0.15287108632272786,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011742885030357067,
+      "loss": 2.0144,
+      "step": 1206
+    },
+    {
+      "epoch": 0.15299784510077322,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011742222453334148,
+      "loss": 1.8523,
+      "step": 1207
+    },
+    {
+      "epoch": 0.1531246038788186,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011741559042430789,
+      "loss": 1.6526,
+      "step": 1208
+    },
+    {
+      "epoch": 0.15325136265686398,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0001174089479774333,
+      "loss": 2.4686,
+      "step": 1209
+    },
+    {
+      "epoch": 0.15337812143490936,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011740229719368231,
+      "loss": 2.2871,
+      "step": 1210
+    },
+    {
+      "epoch": 0.15350488021295475,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011739563807402075,
+      "loss": 2.117,
+      "step": 1211
+    },
+    {
+      "epoch": 0.15363163899100013,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011738897061941566,
+      "loss": 1.9351,
+      "step": 1212
+    },
+    {
+      "epoch": 0.1537583977690455,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011738229483083528,
+      "loss": 2.3357,
+      "step": 1213
+    },
+    {
+      "epoch": 0.1538851565470909,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011737561070924903,
+      "loss": 2.2089,
+      "step": 1214
+    },
+    {
+      "epoch": 0.15401191532513628,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011736891825562761,
+      "loss": 1.9727,
+      "step": 1215
+    },
+    {
+      "epoch": 0.15413867410318163,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011736221747094288,
+      "loss": 2.0784,
+      "step": 1216
+    },
+    {
+      "epoch": 0.15426543288122702,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011735550835616793,
+      "loss": 2.2285,
+      "step": 1217
+    },
+    {
+      "epoch": 0.1543921916592724,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011734879091227703,
+      "loss": 2.1003,
+      "step": 1218
+    },
+    {
+      "epoch": 0.15451895043731778,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001173420651402457,
+      "loss": 2.2601,
+      "step": 1219
+    },
+    {
+      "epoch": 0.15464570921536316,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011733533104105062,
+      "loss": 1.6037,
+      "step": 1220
+    },
+    {
+      "epoch": 0.15477246799340855,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011732858861566977,
+      "loss": 1.8671,
+      "step": 1221
+    },
+    {
+      "epoch": 0.15489922677145393,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011732183786508223,
+      "loss": 1.929,
+      "step": 1222
+    },
+    {
+      "epoch": 0.1550259855494993,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011731507879026835,
+      "loss": 2.0703,
+      "step": 1223
+    },
+    {
+      "epoch": 0.1551527443275447,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011730831139220967,
+      "loss": 2.0561,
+      "step": 1224
+    },
+    {
+      "epoch": 0.15527950310559005,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011730153567188896,
+      "loss": 1.7941,
+      "step": 1225
+    },
+    {
+      "epoch": 0.15540626188363543,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011729475163029019,
+      "loss": 1.7607,
+      "step": 1226
+    },
+    {
+      "epoch": 0.15553302066168082,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0001172879592683985,
+      "loss": 1.5151,
+      "step": 1227
+    },
+    {
+      "epoch": 0.1556597794397262,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011728115858720032,
+      "loss": 2.0432,
+      "step": 1228
+    },
+    {
+      "epoch": 0.15578653821777158,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0001172743495876832,
+      "loss": 1.8862,
+      "step": 1229
+    },
+    {
+      "epoch": 0.15591329699581696,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011726753227083593,
+      "loss": 2.0715,
+      "step": 1230
+    },
+    {
+      "epoch": 0.15604005577386235,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011726070663764854,
+      "loss": 1.6279,
+      "step": 1231
+    },
+    {
+      "epoch": 0.15616681455190773,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011725387268911224,
+      "loss": 2.1064,
+      "step": 1232
+    },
+    {
+      "epoch": 0.1562935733299531,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011724703042621944,
+      "loss": 2.0368,
+      "step": 1233
+    },
+    {
+      "epoch": 0.15642033210799847,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011724017984996376,
+      "loss": 2.0223,
+      "step": 1234
+    },
+    {
+      "epoch": 0.15654709088604385,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011723332096134006,
+      "loss": 1.7416,
+      "step": 1235
+    },
+    {
+      "epoch": 0.15667384966408923,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011722645376134437,
+      "loss": 2.2762,
+      "step": 1236
+    },
+    {
+      "epoch": 0.15680060844213461,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011721957825097394,
+      "loss": 1.9597,
+      "step": 1237
+    },
+    {
+      "epoch": 0.15692736722018,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0001172126944312272,
+      "loss": 2.9127,
+      "step": 1238
+    },
+    {
+      "epoch": 0.15705412599822538,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011720580230310385,
+      "loss": 2.1764,
+      "step": 1239
+    },
+    {
+      "epoch": 0.15718088477627076,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011719890186760474,
+      "loss": 1.9176,
+      "step": 1240
+    },
+    {
+      "epoch": 0.15730764355431615,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011719199312573196,
+      "loss": 1.9535,
+      "step": 1241
+    },
+    {
+      "epoch": 0.15743440233236153,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011718507607848876,
+      "loss": 2.4568,
+      "step": 1242
+    },
+    {
+      "epoch": 0.15756116111040688,
+      "grad_norm": 3.859375,
+      "learning_rate": 0.00011717815072687965,
+      "loss": 1.9271,
+      "step": 1243
+    },
+    {
+      "epoch": 0.15768791988845227,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011717121707191033,
+      "loss": 2.1812,
+      "step": 1244
+    },
+    {
+      "epoch": 0.15781467866649765,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011716427511458766,
+      "loss": 1.8522,
+      "step": 1245
+    },
+    {
+      "epoch": 0.15794143744454303,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011715732485591981,
+      "loss": 1.8697,
+      "step": 1246
+    },
+    {
+      "epoch": 0.15806819622258841,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011715036629691603,
+      "loss": 1.8822,
+      "step": 1247
+    },
+    {
+      "epoch": 0.1581949550006338,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011714339943858689,
+      "loss": 2.06,
+      "step": 1248
+    },
+    {
+      "epoch": 0.15832171377867918,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011713642428194406,
+      "loss": 1.8493,
+      "step": 1249
+    },
+    {
+      "epoch": 0.15844847255672456,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011712944082800048,
+      "loss": 2.1765,
+      "step": 1250
+    },
+    {
+      "epoch": 0.15857523133476994,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001171224490777703,
+      "loss": 2.0089,
+      "step": 1251
+    },
+    {
+      "epoch": 0.1587019901128153,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011711544903226884,
+      "loss": 1.5771,
+      "step": 1252
+    },
+    {
+      "epoch": 0.15882874889086068,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011710844069251264,
+      "loss": 1.9593,
+      "step": 1253
+    },
+    {
+      "epoch": 0.15895550766890607,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011710142405951946,
+      "loss": 1.9038,
+      "step": 1254
+    },
+    {
+      "epoch": 0.15908226644695145,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011709439913430823,
+      "loss": 2.2597,
+      "step": 1255
+    },
+    {
+      "epoch": 0.15920902522499683,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011708736591789913,
+      "loss": 1.9644,
+      "step": 1256
+    },
+    {
+      "epoch": 0.1593357840030422,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011708032441131348,
+      "loss": 1.6797,
+      "step": 1257
+    },
+    {
+      "epoch": 0.1594625427810876,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011707327461557386,
+      "loss": 2.3476,
+      "step": 1258
+    },
+    {
+      "epoch": 0.15958930155913298,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011706621653170406,
+      "loss": 2.1398,
+      "step": 1259
+    },
+    {
+      "epoch": 0.15971606033717836,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.000117059150160729,
+      "loss": 1.8348,
+      "step": 1260
+    },
+    {
+      "epoch": 0.15984281911522372,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011705207550367488,
+      "loss": 2.1193,
+      "step": 1261
+    },
+    {
+      "epoch": 0.1599695778932691,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011704499256156908,
+      "loss": 1.8472,
+      "step": 1262
+    },
+    {
+      "epoch": 0.16009633667131448,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011703790133544016,
+      "loss": 1.8676,
+      "step": 1263
+    },
+    {
+      "epoch": 0.16022309544935986,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00011703080182631792,
+      "loss": 1.7294,
+      "step": 1264
+    },
+    {
+      "epoch": 0.16034985422740525,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011702369403523333,
+      "loss": 1.8304,
+      "step": 1265
+    },
+    {
+      "epoch": 0.16047661300545063,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011701657796321858,
+      "loss": 1.5325,
+      "step": 1266
+    },
+    {
+      "epoch": 0.160603371783496,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011700945361130706,
+      "loss": 1.9624,
+      "step": 1267
+    },
+    {
+      "epoch": 0.1607301305615414,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011700232098053337,
+      "loss": 1.5178,
+      "step": 1268
+    },
+    {
+      "epoch": 0.16085688933958678,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011699518007193332,
+      "loss": 2.1261,
+      "step": 1269
+    },
+    {
+      "epoch": 0.16098364811763213,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011698803088654388,
+      "loss": 2.026,
+      "step": 1270
+    },
+    {
+      "epoch": 0.16111040689567752,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00011698087342540325,
+      "loss": 2.0495,
+      "step": 1271
+    },
+    {
+      "epoch": 0.1612371656737229,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011697370768955082,
+      "loss": 1.6341,
+      "step": 1272
+    },
+    {
+      "epoch": 0.16136392445176828,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011696653368002722,
+      "loss": 1.685,
+      "step": 1273
+    },
+    {
+      "epoch": 0.16149068322981366,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011695935139787422,
+      "loss": 2.2273,
+      "step": 1274
+    },
+    {
+      "epoch": 0.16161744200785905,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011695216084413486,
+      "loss": 2.1221,
+      "step": 1275
+    },
+    {
+      "epoch": 0.16174420078590443,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011694496201985333,
+      "loss": 1.8019,
+      "step": 1276
+    },
+    {
+      "epoch": 0.1618709595639498,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011693775492607504,
+      "loss": 1.783,
+      "step": 1277
+    },
+    {
+      "epoch": 0.1619977183419952,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0001169305395638466,
+      "loss": 2.3844,
+      "step": 1278
+    },
+    {
+      "epoch": 0.16212447712004055,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011692331593421579,
+      "loss": 1.5885,
+      "step": 1279
+    },
+    {
+      "epoch": 0.16225123589808593,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011691608403823165,
+      "loss": 2.1312,
+      "step": 1280
+    },
+    {
+      "epoch": 0.16237799467613132,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0001169088438769444,
+      "loss": 2.1352,
+      "step": 1281
+    },
+    {
+      "epoch": 0.1625047534541767,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011690159545140541,
+      "loss": 2.2286,
+      "step": 1282
+    },
+    {
+      "epoch": 0.16263151223222208,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00011689433876266731,
+      "loss": 2.3093,
+      "step": 1283
+    },
+    {
+      "epoch": 0.16275827101026746,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011688707381178394,
+      "loss": 1.849,
+      "step": 1284
+    },
+    {
+      "epoch": 0.16288502978831285,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011687980059981026,
+      "loss": 1.8192,
+      "step": 1285
+    },
+    {
+      "epoch": 0.16301178856635823,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011687251912780249,
+      "loss": 2.2033,
+      "step": 1286
+    },
+    {
+      "epoch": 0.1631385473444036,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011686522939681806,
+      "loss": 2.23,
+      "step": 1287
+    },
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011685793140791558,
+      "loss": 2.2655,
+      "step": 1288
+    },
+    {
+      "epoch": 0.16339206490049435,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011685062516215483,
+      "loss": 2.3736,
+      "step": 1289
+    },
+    {
+      "epoch": 0.16351882367853973,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011684331066059682,
+      "loss": 1.6952,
+      "step": 1290
+    },
+    {
+      "epoch": 0.16364558245658511,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011683598790430379,
+      "loss": 2.6982,
+      "step": 1291
+    },
+    {
+      "epoch": 0.1637723412346305,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011682865689433912,
+      "loss": 1.724,
+      "step": 1292
+    },
+    {
+      "epoch": 0.16389910001267588,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011682131763176739,
+      "loss": 1.9218,
+      "step": 1293
+    },
+    {
+      "epoch": 0.16402585879072126,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011681397011765444,
+      "loss": 1.9737,
+      "step": 1294
+    },
+    {
+      "epoch": 0.16415261756876665,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011680661435306727,
+      "loss": 1.5576,
+      "step": 1295
+    },
+    {
+      "epoch": 0.16427937634681203,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00011679925033907403,
+      "loss": 1.9893,
+      "step": 1296
+    },
+    {
+      "epoch": 0.16440613512485738,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00011679187807674417,
+      "loss": 2.1787,
+      "step": 1297
+    },
+    {
+      "epoch": 0.16453289390290277,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011678449756714826,
+      "loss": 2.3746,
+      "step": 1298
+    },
+    {
+      "epoch": 0.16465965268094815,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011677710881135807,
+      "loss": 1.5504,
+      "step": 1299
+    },
+    {
+      "epoch": 0.16478641145899353,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011676971181044664,
+      "loss": 2.0711,
+      "step": 1300
+    },
+    {
+      "epoch": 0.16491317023703891,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011676230656548811,
+      "loss": 2.4971,
+      "step": 1301
+    },
+    {
+      "epoch": 0.1650399290150843,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011675489307755788,
+      "loss": 1.7373,
+      "step": 1302
+    },
+    {
+      "epoch": 0.16516668779312968,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00011674747134773254,
+      "loss": 1.9925,
+      "step": 1303
+    },
+    {
+      "epoch": 0.16529344657117506,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011674004137708985,
+      "loss": 2.0737,
+      "step": 1304
+    },
+    {
+      "epoch": 0.16542020534922044,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011673260316670879,
+      "loss": 2.0186,
+      "step": 1305
+    },
+    {
+      "epoch": 0.1655469641272658,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011672515671766953,
+      "loss": 2.2546,
+      "step": 1306
+    },
+    {
+      "epoch": 0.16567372290531118,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011671770203105345,
+      "loss": 1.7104,
+      "step": 1307
+    },
+    {
+      "epoch": 0.16580048168335657,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011671023910794308,
+      "loss": 1.8675,
+      "step": 1308
+    },
+    {
+      "epoch": 0.16592724046140195,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001167027679494222,
+      "loss": 1.9288,
+      "step": 1309
+    },
+    {
+      "epoch": 0.16605399923944733,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011669528855657578,
+      "loss": 1.9568,
+      "step": 1310
+    },
+    {
+      "epoch": 0.1661807580174927,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011668780093048994,
+      "loss": 2.4522,
+      "step": 1311
+    },
+    {
+      "epoch": 0.1663075167955381,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011668030507225206,
+      "loss": 2.0694,
+      "step": 1312
+    },
+    {
+      "epoch": 0.16643427557358348,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011667280098295066,
+      "loss": 1.6021,
+      "step": 1313
+    },
+    {
+      "epoch": 0.16656103435162886,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011666528866367548,
+      "loss": 2.3724,
+      "step": 1314
+    },
+    {
+      "epoch": 0.16668779312967422,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011665776811551743,
+      "loss": 2.2543,
+      "step": 1315
+    },
+    {
+      "epoch": 0.1668145519077196,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011665023933956869,
+      "loss": 1.9289,
+      "step": 1316
+    },
+    {
+      "epoch": 0.16694131068576498,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011664270233692253,
+      "loss": 1.7157,
+      "step": 1317
+    },
+    {
+      "epoch": 0.16706806946381036,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0001166351571086735,
+      "loss": 2.54,
+      "step": 1318
+    },
+    {
+      "epoch": 0.16719482824185575,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0001166276036559173,
+      "loss": 2.19,
+      "step": 1319
+    },
+    {
+      "epoch": 0.16732158701990113,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011662004197975083,
+      "loss": 1.7336,
+      "step": 1320
+    },
+    {
+      "epoch": 0.1674483457979465,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0001166124720812722,
+      "loss": 2.266,
+      "step": 1321
+    },
+    {
+      "epoch": 0.1675751045759919,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0001166048939615807,
+      "loss": 2.0758,
+      "step": 1322
+    },
+    {
+      "epoch": 0.16770186335403728,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011659730762177682,
+      "loss": 1.9477,
+      "step": 1323
+    },
+    {
+      "epoch": 0.16782862213208263,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011658971306296224,
+      "loss": 1.9067,
+      "step": 1324
+    },
+    {
+      "epoch": 0.16795538091012802,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011658211028623983,
+      "loss": 2.1705,
+      "step": 1325
+    },
+    {
+      "epoch": 0.1680821396881734,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011657449929271366,
+      "loss": 2.155,
+      "step": 1326
+    },
+    {
+      "epoch": 0.16820889846621878,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011656688008348902,
+      "loss": 1.9701,
+      "step": 1327
+    },
+    {
+      "epoch": 0.16833565724426416,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011655925265967231,
+      "loss": 1.7513,
+      "step": 1328
+    },
+    {
+      "epoch": 0.16846241602230955,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011655161702237121,
+      "loss": 1.9852,
+      "step": 1329
+    },
+    {
+      "epoch": 0.16858917480035493,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011654397317269457,
+      "loss": 2.1549,
+      "step": 1330
+    },
+    {
+      "epoch": 0.1687159335784003,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0001165363211117524,
+      "loss": 1.8626,
+      "step": 1331
+    },
+    {
+      "epoch": 0.1688426923564457,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011652866084065594,
+      "loss": 2.2102,
+      "step": 1332
+    },
+    {
+      "epoch": 0.16896945113449105,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011652099236051761,
+      "loss": 2.1832,
+      "step": 1333
+    },
+    {
+      "epoch": 0.16909620991253643,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.000116513315672451,
+      "loss": 2.1535,
+      "step": 1334
+    },
+    {
+      "epoch": 0.16922296869058182,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011650563077757095,
+      "loss": 1.6856,
+      "step": 1335
+    },
+    {
+      "epoch": 0.1693497274686272,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0001164979376769934,
+      "loss": 2.3066,
+      "step": 1336
+    },
+    {
+      "epoch": 0.16947648624667258,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011649023637183559,
+      "loss": 1.8957,
+      "step": 1337
+    },
+    {
+      "epoch": 0.16960324502471796,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011648252686321586,
+      "loss": 2.3303,
+      "step": 1338
+    },
+    {
+      "epoch": 0.16973000380276335,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0001164748091522538,
+      "loss": 2.186,
+      "step": 1339
+    },
+    {
+      "epoch": 0.16985676258080873,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011646708324007013,
+      "loss": 1.4996,
+      "step": 1340
+    },
+    {
+      "epoch": 0.1699835213588541,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011645934912778685,
+      "loss": 1.6884,
+      "step": 1341
+    },
+    {
+      "epoch": 0.17011028013689947,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011645160681652707,
+      "loss": 2.3821,
+      "step": 1342
+    },
+    {
+      "epoch": 0.17023703891494485,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011644385630741514,
+      "loss": 2.4489,
+      "step": 1343
+    },
+    {
+      "epoch": 0.17036379769299023,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011643609760157658,
+      "loss": 2.0797,
+      "step": 1344
+    },
+    {
+      "epoch": 0.17049055647103561,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011642833070013807,
+      "loss": 1.7555,
+      "step": 1345
+    },
+    {
+      "epoch": 0.170617315249081,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011642055560422756,
+      "loss": 1.7697,
+      "step": 1346
+    },
+    {
+      "epoch": 0.17074407402712638,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011641277231497411,
+      "loss": 2.1339,
+      "step": 1347
+    },
+    {
+      "epoch": 0.17087083280517176,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.000116404980833508,
+      "loss": 1.6965,
+      "step": 1348
+    },
+    {
+      "epoch": 0.17099759158321715,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00011639718116096075,
+      "loss": 1.9666,
+      "step": 1349
+    },
+    {
+      "epoch": 0.17112435036126253,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011638937329846495,
+      "loss": 1.7666,
+      "step": 1350
+    },
+    {
+      "epoch": 0.17125110913930788,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001163815572471545,
+      "loss": 2.5994,
+      "step": 1351
+    },
+    {
+      "epoch": 0.17137786791735327,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011637373300816444,
+      "loss": 1.911,
+      "step": 1352
+    },
+    {
+      "epoch": 0.17150462669539865,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011636590058263097,
+      "loss": 1.7797,
+      "step": 1353
+    },
+    {
+      "epoch": 0.17163138547344403,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011635805997169153,
+      "loss": 1.6793,
+      "step": 1354
+    },
+    {
+      "epoch": 0.17175814425148941,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011635021117648471,
+      "loss": 2.3738,
+      "step": 1355
+    },
+    {
+      "epoch": 0.1718849030295348,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011634235419815033,
+      "loss": 1.9983,
+      "step": 1356
+    },
+    {
+      "epoch": 0.17201166180758018,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011633448903782936,
+      "loss": 2.091,
+      "step": 1357
+    },
+    {
+      "epoch": 0.17213842058562556,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011632661569666396,
+      "loss": 2.1014,
+      "step": 1358
+    },
+    {
+      "epoch": 0.17226517936367094,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011631873417579752,
+      "loss": 2.3214,
+      "step": 1359
+    },
+    {
+      "epoch": 0.17239193814171633,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011631084447637455,
+      "loss": 1.8834,
+      "step": 1360
+    },
+    {
+      "epoch": 0.17251869691976168,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001163029465995408,
+      "loss": 1.3993,
+      "step": 1361
+    },
+    {
+      "epoch": 0.17264545569780707,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001162950405464432,
+      "loss": 2.1255,
+      "step": 1362
+    },
+    {
+      "epoch": 0.17277221447585245,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011628712631822985,
+      "loss": 2.4376,
+      "step": 1363
+    },
+    {
+      "epoch": 0.17289897325389783,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011627920391605006,
+      "loss": 2.189,
+      "step": 1364
+    },
+    {
+      "epoch": 0.1730257320319432,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011627127334105431,
+      "loss": 1.7081,
+      "step": 1365
+    },
+    {
+      "epoch": 0.1731524908099886,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011626333459439426,
+      "loss": 1.8611,
+      "step": 1366
+    },
+    {
+      "epoch": 0.17327924958803398,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011625538767722278,
+      "loss": 1.7965,
+      "step": 1367
+    },
+    {
+      "epoch": 0.17340600836607936,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011624743259069391,
+      "loss": 2.0881,
+      "step": 1368
+    },
+    {
+      "epoch": 0.17353276714412474,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011623946933596287,
+      "loss": 2.0016,
+      "step": 1369
+    },
+    {
+      "epoch": 0.1736595259221701,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011623149791418611,
+      "loss": 2.2606,
+      "step": 1370
+    },
+    {
+      "epoch": 0.17378628470021548,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011622351832652118,
+      "loss": 2.2285,
+      "step": 1371
+    },
+    {
+      "epoch": 0.17391304347826086,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011621553057412692,
+      "loss": 1.9222,
+      "step": 1372
+    },
+    {
+      "epoch": 0.17403980225630625,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011620753465816328,
+      "loss": 2.5336,
+      "step": 1373
+    },
+    {
+      "epoch": 0.17416656103435163,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011619953057979142,
+      "loss": 2.153,
+      "step": 1374
+    },
+    {
+      "epoch": 0.174293319812397,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011619151834017369,
+      "loss": 2.2346,
+      "step": 1375
+    },
+    {
+      "epoch": 0.1744200785904424,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011618349794047361,
+      "loss": 1.9778,
+      "step": 1376
+    },
+    {
+      "epoch": 0.17454683736848778,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011617546938185591,
+      "loss": 2.1729,
+      "step": 1377
+    },
+    {
+      "epoch": 0.17467359614653316,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001161674326654865,
+      "loss": 2.1126,
+      "step": 1378
+    },
+    {
+      "epoch": 0.17480035492457852,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011615938779253243,
+      "loss": 2.2715,
+      "step": 1379
+    },
+    {
+      "epoch": 0.1749271137026239,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00011615133476416198,
+      "loss": 2.2649,
+      "step": 1380
+    },
+    {
+      "epoch": 0.17505387248066928,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011614327358154461,
+      "loss": 1.9336,
+      "step": 1381
+    },
+    {
+      "epoch": 0.17518063125871466,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011613520424585097,
+      "loss": 2.2498,
+      "step": 1382
+    },
+    {
+      "epoch": 0.17530739003676005,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00011612712675825288,
+      "loss": 1.8766,
+      "step": 1383
+    },
+    {
+      "epoch": 0.17543414881480543,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011611904111992333,
+      "loss": 2.2918,
+      "step": 1384
+    },
+    {
+      "epoch": 0.1755609075928508,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011611094733203652,
+      "loss": 2.3701,
+      "step": 1385
+    },
+    {
+      "epoch": 0.1756876663708962,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011610284539576782,
+      "loss": 2.2539,
+      "step": 1386
+    },
+    {
+      "epoch": 0.17581442514894158,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011609473531229376,
+      "loss": 2.128,
+      "step": 1387
+    },
+    {
+      "epoch": 0.17594118392698693,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011608661708279212,
+      "loss": 1.9965,
+      "step": 1388
+    },
+    {
+      "epoch": 0.17606794270503232,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0001160784907084418,
+      "loss": 2.0458,
+      "step": 1389
+    },
+    {
+      "epoch": 0.1761947014830777,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011607035619042292,
+      "loss": 1.7919,
+      "step": 1390
+    },
+    {
+      "epoch": 0.17632146026112308,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011606221352991676,
+      "loss": 2.1364,
+      "step": 1391
+    },
+    {
+      "epoch": 0.17644821903916846,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011605406272810577,
+      "loss": 1.684,
+      "step": 1392
+    },
+    {
+      "epoch": 0.17657497781721385,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011604590378617363,
+      "loss": 2.0512,
+      "step": 1393
+    },
+    {
+      "epoch": 0.17670173659525923,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011603773670530517,
+      "loss": 2.3032,
+      "step": 1394
+    },
+    {
+      "epoch": 0.1768284953733046,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011602956148668639,
+      "loss": 2.1799,
+      "step": 1395
+    },
+    {
+      "epoch": 0.17695525415135,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011602137813150451,
+      "loss": 1.6765,
+      "step": 1396
+    },
+    {
+      "epoch": 0.17708201292939535,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001160131866409479,
+      "loss": 1.8718,
+      "step": 1397
+    },
+    {
+      "epoch": 0.17720877170744073,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0001160049870162061,
+      "loss": 2.271,
+      "step": 1398
+    },
+    {
+      "epoch": 0.17733553048548611,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011599677925846988,
+      "loss": 1.7783,
+      "step": 1399
+    },
+    {
+      "epoch": 0.1774622892635315,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011598856336893115,
+      "loss": 1.9059,
+      "step": 1400
+    },
+    {
+      "epoch": 0.17758904804157688,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011598033934878304,
+      "loss": 2.0421,
+      "step": 1401
+    },
+    {
+      "epoch": 0.17771580681962226,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011597210719921979,
+      "loss": 1.9537,
+      "step": 1402
+    },
+    {
+      "epoch": 0.17784256559766765,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00011596386692143689,
+      "loss": 2.1808,
+      "step": 1403
+    },
+    {
+      "epoch": 0.17796932437571303,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011595561851663099,
+      "loss": 2.0229,
+      "step": 1404
+    },
+    {
+      "epoch": 0.1780960831537584,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011594736198599992,
+      "loss": 1.9081,
+      "step": 1405
+    },
+    {
+      "epoch": 0.17822284193180377,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00011593909733074268,
+      "loss": 2.2144,
+      "step": 1406
+    },
+    {
+      "epoch": 0.17834960070984915,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00011593082455205944,
+      "loss": 1.8181,
+      "step": 1407
+    },
+    {
+      "epoch": 0.17847635948789453,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011592254365115158,
+      "loss": 2.2321,
+      "step": 1408
+    },
+    {
+      "epoch": 0.17860311826593991,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011591425462922164,
+      "loss": 1.9074,
+      "step": 1409
+    },
+    {
+      "epoch": 0.1787298770439853,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011590595748747335,
+      "loss": 2.5156,
+      "step": 1410
+    },
+    {
+      "epoch": 0.17885663582203068,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011589765222711163,
+      "loss": 2.0608,
+      "step": 1411
+    },
+    {
+      "epoch": 0.17898339460007606,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011588933884934252,
+      "loss": 2.0114,
+      "step": 1412
+    },
+    {
+      "epoch": 0.17911015337812144,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00011588101735537331,
+      "loss": 2.1753,
+      "step": 1413
+    },
+    {
+      "epoch": 0.17923691215616683,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011587268774641244,
+      "loss": 1.8167,
+      "step": 1414
+    },
+    {
+      "epoch": 0.17936367093421218,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011586435002366953,
+      "loss": 2.1786,
+      "step": 1415
+    },
+    {
+      "epoch": 0.17949042971225757,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011585600418835535,
+      "loss": 2.0829,
+      "step": 1416
+    },
+    {
+      "epoch": 0.17961718849030295,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011584765024168191,
+      "loss": 1.8147,
+      "step": 1417
+    },
+    {
+      "epoch": 0.17974394726834833,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011583928818486235,
+      "loss": 1.9458,
+      "step": 1418
+    },
+    {
+      "epoch": 0.1798707060463937,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011583091801911099,
+      "loss": 1.966,
+      "step": 1419
+    },
+    {
+      "epoch": 0.1799974648244391,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011582253974564335,
+      "loss": 2.1057,
+      "step": 1420
+    },
+    {
+      "epoch": 0.18012422360248448,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0001158141533656761,
+      "loss": 1.9527,
+      "step": 1421
+    },
+    {
+      "epoch": 0.18025098238052986,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011580575888042713,
+      "loss": 1.7309,
+      "step": 1422
+    },
+    {
+      "epoch": 0.18037774115857524,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011579735629111546,
+      "loss": 2.1355,
+      "step": 1423
+    },
+    {
+      "epoch": 0.1805044999366206,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011578894559896131,
+      "loss": 1.8338,
+      "step": 1424
+    },
+    {
+      "epoch": 0.18063125871466598,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011578052680518607,
+      "loss": 2.2209,
+      "step": 1425
+    },
+    {
+      "epoch": 0.18075801749271136,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011577209991101231,
+      "loss": 2.1022,
+      "step": 1426
+    },
+    {
+      "epoch": 0.18088477627075675,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0001157636649176638,
+      "loss": 1.9303,
+      "step": 1427
+    },
+    {
+      "epoch": 0.18101153504880213,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011575522182636541,
+      "loss": 2.0104,
+      "step": 1428
+    },
+    {
+      "epoch": 0.1811382938268475,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001157467706383433,
+      "loss": 2.8187,
+      "step": 1429
+    },
+    {
+      "epoch": 0.1812650526048929,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001157383113548247,
+      "loss": 2.3011,
+      "step": 1430
+    },
+    {
+      "epoch": 0.18139181138293828,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011572984397703807,
+      "loss": 2.119,
+      "step": 1431
+    },
+    {
+      "epoch": 0.18151857016098366,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011572136850621305,
+      "loss": 2.0259,
+      "step": 1432
+    },
+    {
+      "epoch": 0.18164532893902902,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011571288494358043,
+      "loss": 1.8872,
+      "step": 1433
+    },
+    {
+      "epoch": 0.1817720877170744,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011570439329037217,
+      "loss": 1.6205,
+      "step": 1434
+    },
+    {
+      "epoch": 0.18189884649511978,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011569589354782145,
+      "loss": 2.2939,
+      "step": 1435
+    },
+    {
+      "epoch": 0.18202560527316516,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011568738571716257,
+      "loss": 1.9209,
+      "step": 1436
+    },
+    {
+      "epoch": 0.18215236405121055,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011567886979963104,
+      "loss": 1.8242,
+      "step": 1437
+    },
+    {
+      "epoch": 0.18227912282925593,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011567034579646353,
+      "loss": 2.2958,
+      "step": 1438
+    },
+    {
+      "epoch": 0.1824058816073013,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011566181370889791,
+      "loss": 2.1523,
+      "step": 1439
+    },
+    {
+      "epoch": 0.1825326403853467,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00011565327353817316,
+      "loss": 2.3284,
+      "step": 1440
+    },
+    {
+      "epoch": 0.18265939916339208,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011564472528552952,
+      "loss": 2.4607,
+      "step": 1441
+    },
+    {
+      "epoch": 0.18278615794143743,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011563616895220833,
+      "loss": 2.0969,
+      "step": 1442
+    },
+    {
+      "epoch": 0.18291291671948282,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00011562760453945214,
+      "loss": 1.8607,
+      "step": 1443
+    },
+    {
+      "epoch": 0.1830396754975282,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011561903204850467,
+      "loss": 1.9982,
+      "step": 1444
+    },
+    {
+      "epoch": 0.18316643427557358,
+      "grad_norm": 3.625,
+      "learning_rate": 0.0001156104514806108,
+      "loss": 1.7434,
+      "step": 1445
+    },
+    {
+      "epoch": 0.18329319305361896,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011560186283701662,
+      "loss": 1.6313,
+      "step": 1446
+    },
+    {
+      "epoch": 0.18341995183166435,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011559326611896935,
+      "loss": 2.0457,
+      "step": 1447
+    },
+    {
+      "epoch": 0.18354671060970973,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011558466132771737,
+      "loss": 1.7957,
+      "step": 1448
+    },
+    {
+      "epoch": 0.1836734693877551,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011557604846451031,
+      "loss": 2.0291,
+      "step": 1449
+    },
+    {
+      "epoch": 0.1838002281658005,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011556742753059887,
+      "loss": 1.8652,
+      "step": 1450
+    },
+    {
+      "epoch": 0.18392698694384585,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011555879852723502,
+      "loss": 2.059,
+      "step": 1451
+    },
+    {
+      "epoch": 0.18405374572189123,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011555016145567183,
+      "loss": 2.1226,
+      "step": 1452
+    },
+    {
+      "epoch": 0.18418050449993661,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011554151631716358,
+      "loss": 1.847,
+      "step": 1453
+    },
+    {
+      "epoch": 0.184307263277982,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0001155328631129657,
+      "loss": 1.758,
+      "step": 1454
+    },
+    {
+      "epoch": 0.18443402205602738,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011552420184433481,
+      "loss": 2.505,
+      "step": 1455
+    },
+    {
+      "epoch": 0.18456078083407276,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011551553251252867,
+      "loss": 2.4045,
+      "step": 1456
+    },
+    {
+      "epoch": 0.18468753961211815,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011550685511880626,
+      "loss": 1.5924,
+      "step": 1457
+    },
+    {
+      "epoch": 0.18481429839016353,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00011549816966442769,
+      "loss": 1.8448,
+      "step": 1458
+    },
+    {
+      "epoch": 0.1849410571682089,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011548947615065427,
+      "loss": 1.8081,
+      "step": 1459
+    },
+    {
+      "epoch": 0.18506781594625427,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011548077457874843,
+      "loss": 1.8125,
+      "step": 1460
+    },
+    {
+      "epoch": 0.18519457472429965,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011547206494997383,
+      "loss": 1.4639,
+      "step": 1461
+    },
+    {
+      "epoch": 0.18532133350234503,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011546334726559528,
+      "loss": 2.3092,
+      "step": 1462
+    },
+    {
+      "epoch": 0.18544809228039041,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011545462152687875,
+      "loss": 2.0931,
+      "step": 1463
+    },
+    {
+      "epoch": 0.1855748510584358,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011544588773509136,
+      "loss": 2.2285,
+      "step": 1464
+    },
+    {
+      "epoch": 0.18570160983648118,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011543714589150145,
+      "loss": 1.9125,
+      "step": 1465
+    },
+    {
+      "epoch": 0.18582836861452656,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011542839599737849,
+      "loss": 2.0016,
+      "step": 1466
+    },
+    {
+      "epoch": 0.18595512739257195,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011541963805399313,
+      "loss": 1.9566,
+      "step": 1467
+    },
+    {
+      "epoch": 0.18608188617061733,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011541087206261721,
+      "loss": 1.6469,
+      "step": 1468
+    },
+    {
+      "epoch": 0.18620864494866268,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011540209802452371,
+      "loss": 2.1394,
+      "step": 1469
+    },
+    {
+      "epoch": 0.18633540372670807,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011539331594098678,
+      "loss": 2.2794,
+      "step": 1470
+    },
+    {
+      "epoch": 0.18646216250475345,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011538452581328174,
+      "loss": 1.5623,
+      "step": 1471
+    },
+    {
+      "epoch": 0.18658892128279883,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011537572764268512,
+      "loss": 1.7997,
+      "step": 1472
+    },
+    {
+      "epoch": 0.1867156800608442,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011536692143047453,
+      "loss": 1.8238,
+      "step": 1473
+    },
+    {
+      "epoch": 0.1868424388388896,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.00011535810717792885,
+      "loss": 1.4472,
+      "step": 1474
+    },
+    {
+      "epoch": 0.18696919761693498,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011534928488632804,
+      "loss": 1.8083,
+      "step": 1475
+    },
+    {
+      "epoch": 0.18709595639498036,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011534045455695329,
+      "loss": 1.9584,
+      "step": 1476
+    },
+    {
+      "epoch": 0.18722271517302574,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011533161619108694,
+      "loss": 2.1273,
+      "step": 1477
+    },
+    {
+      "epoch": 0.1873494739510711,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011532276979001245,
+      "loss": 2.4269,
+      "step": 1478
+    },
+    {
+      "epoch": 0.18747623272911648,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00011531391535501451,
+      "loss": 2.4586,
+      "step": 1479
+    },
+    {
+      "epoch": 0.18760299150716186,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011530505288737896,
+      "loss": 2.3621,
+      "step": 1480
+    },
+    {
+      "epoch": 0.18772975028520725,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0001152961823883928,
+      "loss": 1.7885,
+      "step": 1481
+    },
+    {
+      "epoch": 0.18785650906325263,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011528730385934418,
+      "loss": 1.8499,
+      "step": 1482
+    },
+    {
+      "epoch": 0.187983267841298,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011527841730152244,
+      "loss": 2.3266,
+      "step": 1483
+    },
+    {
+      "epoch": 0.1881100266193434,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001152695227162181,
+      "loss": 2.0633,
+      "step": 1484
+    },
+    {
+      "epoch": 0.18823678539738878,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011526062010472277,
+      "loss": 2.0153,
+      "step": 1485
+    },
+    {
+      "epoch": 0.18836354417543416,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011525170946832934,
+      "loss": 1.9151,
+      "step": 1486
+    },
+    {
+      "epoch": 0.18849030295347952,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00011524279080833176,
+      "loss": 2.252,
+      "step": 1487
+    },
+    {
+      "epoch": 0.1886170617315249,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00011523386412602521,
+      "loss": 2.2172,
+      "step": 1488
+    },
+    {
+      "epoch": 0.18874382050957028,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011522492942270601,
+      "loss": 2.259,
+      "step": 1489
+    },
+    {
+      "epoch": 0.18887057928761566,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011521598669967166,
+      "loss": 1.9296,
+      "step": 1490
+    },
+    {
+      "epoch": 0.18899733806566105,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011520703595822081,
+      "loss": 2.1687,
+      "step": 1491
+    },
+    {
+      "epoch": 0.18912409684370643,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011519807719965326,
+      "loss": 2.2101,
+      "step": 1492
+    },
+    {
+      "epoch": 0.1892508556217518,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011518911042527003,
+      "loss": 1.9152,
+      "step": 1493
+    },
+    {
+      "epoch": 0.1893776143997972,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011518013563637323,
+      "loss": 2.6482,
+      "step": 1494
+    },
+    {
+      "epoch": 0.18950437317784258,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011517115283426617,
+      "loss": 1.6926,
+      "step": 1495
+    },
+    {
+      "epoch": 0.18963113195588793,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011516216202025337,
+      "loss": 2.1468,
+      "step": 1496
+    },
+    {
+      "epoch": 0.18975789073393332,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0001151531631956404,
+      "loss": 1.8959,
+      "step": 1497
+    },
+    {
+      "epoch": 0.1898846495119787,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011514415636173414,
+      "loss": 2.1068,
+      "step": 1498
+    },
+    {
+      "epoch": 0.19001140829002408,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011513514151984249,
+      "loss": 2.229,
+      "step": 1499
+    },
+    {
+      "epoch": 0.19013816706806946,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011512611867127461,
+      "loss": 1.7983,
+      "step": 1500
+    },
+    {
+      "epoch": 0.19026492584611485,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011511708781734078,
+      "loss": 1.9443,
+      "step": 1501
+    },
+    {
+      "epoch": 0.19039168462416023,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011510804895935245,
+      "loss": 2.0794,
+      "step": 1502
+    },
+    {
+      "epoch": 0.1905184434022056,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011509900209862224,
+      "loss": 1.7023,
+      "step": 1503
+    },
+    {
+      "epoch": 0.190645202180251,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011508994723646392,
+      "loss": 2.4678,
+      "step": 1504
+    },
+    {
+      "epoch": 0.19077196095829635,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011508088437419244,
+      "loss": 1.8963,
+      "step": 1505
+    },
+    {
+      "epoch": 0.19089871973634173,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011507181351312389,
+      "loss": 1.9281,
+      "step": 1506
+    },
+    {
+      "epoch": 0.19102547851438711,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011506273465457555,
+      "loss": 1.7779,
+      "step": 1507
+    },
+    {
+      "epoch": 0.1911522372924325,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011505364779986583,
+      "loss": 2.4982,
+      "step": 1508
+    },
+    {
+      "epoch": 0.19127899607047788,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0001150445529503143,
+      "loss": 1.731,
+      "step": 1509
+    },
+    {
+      "epoch": 0.19140575484852326,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011503545010724173,
+      "loss": 2.2086,
+      "step": 1510
+    },
+    {
+      "epoch": 0.19153251362656865,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011502633927197002,
+      "loss": 2.1011,
+      "step": 1511
+    },
+    {
+      "epoch": 0.19165927240461403,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011501722044582224,
+      "loss": 2.2627,
+      "step": 1512
+    },
+    {
+      "epoch": 0.1917860311826594,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011500809363012261,
+      "loss": 1.9455,
+      "step": 1513
+    },
+    {
+      "epoch": 0.19191278996070477,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011499895882619653,
+      "loss": 2.2059,
+      "step": 1514
+    },
+    {
+      "epoch": 0.19203954873875015,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00011498981603537054,
+      "loss": 1.7699,
+      "step": 1515
+    },
+    {
+      "epoch": 0.19216630751679553,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011498066525897234,
+      "loss": 1.5404,
+      "step": 1516
+    },
+    {
+      "epoch": 0.19229306629484091,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011497150649833082,
+      "loss": 2.1719,
+      "step": 1517
+    },
+    {
+      "epoch": 0.1924198250728863,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011496233975477598,
+      "loss": 2.3721,
+      "step": 1518
+    },
+    {
+      "epoch": 0.19254658385093168,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011495316502963902,
+      "loss": 1.9466,
+      "step": 1519
+    },
+    {
+      "epoch": 0.19267334262897706,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0001149439823242523,
+      "loss": 2.0047,
+      "step": 1520
+    },
+    {
+      "epoch": 0.19280010140702245,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0001149347916399493,
+      "loss": 2.1183,
+      "step": 1521
+    },
+    {
+      "epoch": 0.19292686018506783,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011492559297806468,
+      "loss": 1.6834,
+      "step": 1522
+    },
+    {
+      "epoch": 0.19305361896311318,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011491638633993428,
+      "loss": 1.7123,
+      "step": 1523
+    },
+    {
+      "epoch": 0.19318037774115857,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011490717172689509,
+      "loss": 2.0399,
+      "step": 1524
+    },
+    {
+      "epoch": 0.19330713651920395,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00011489794914028521,
+      "loss": 2.8563,
+      "step": 1525
+    },
+    {
+      "epoch": 0.19343389529724933,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011488871858144398,
+      "loss": 1.9132,
+      "step": 1526
+    },
+    {
+      "epoch": 0.1935606540752947,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011487948005171181,
+      "loss": 1.7047,
+      "step": 1527
+    },
+    {
+      "epoch": 0.1936874128533401,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011487023355243036,
+      "loss": 2.7341,
+      "step": 1528
+    },
+    {
+      "epoch": 0.19381417163138548,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011486097908494233,
+      "loss": 2.0278,
+      "step": 1529
+    },
+    {
+      "epoch": 0.19394093040943086,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011485171665059171,
+      "loss": 2.1076,
+      "step": 1530
+    },
+    {
+      "epoch": 0.19406768918747624,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011484244625072356,
+      "loss": 2.0605,
+      "step": 1531
+    },
+    {
+      "epoch": 0.1941944479655216,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011483316788668412,
+      "loss": 2.3451,
+      "step": 1532
+    },
+    {
+      "epoch": 0.19432120674356698,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011482388155982077,
+      "loss": 2.3693,
+      "step": 1533
+    },
+    {
+      "epoch": 0.19444796552161236,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011481458727148208,
+      "loss": 2.3917,
+      "step": 1534
+    },
+    {
+      "epoch": 0.19457472429965775,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011480528502301776,
+      "loss": 2.214,
+      "step": 1535
+    },
+    {
+      "epoch": 0.19470148307770313,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011479597481577866,
+      "loss": 2.3455,
+      "step": 1536
+    },
+    {
+      "epoch": 0.1948282418557485,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011478665665111682,
+      "loss": 1.9683,
+      "step": 1537
+    },
+    {
+      "epoch": 0.1949550006337939,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001147773305303854,
+      "loss": 1.5155,
+      "step": 1538
+    },
+    {
+      "epoch": 0.19508175941183928,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011476799645493872,
+      "loss": 1.5717,
+      "step": 1539
+    },
+    {
+      "epoch": 0.19520851818988466,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0001147586544261323,
+      "loss": 1.7679,
+      "step": 1540
+    },
+    {
+      "epoch": 0.19533527696793002,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011474930444532275,
+      "loss": 2.1292,
+      "step": 1541
+    },
+    {
+      "epoch": 0.1954620357459754,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011473994651386788,
+      "loss": 2.2694,
+      "step": 1542
+    },
+    {
+      "epoch": 0.19558879452402078,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011473058063312664,
+      "loss": 1.8529,
+      "step": 1543
+    },
+    {
+      "epoch": 0.19571555330206616,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011472120680445914,
+      "loss": 1.6976,
+      "step": 1544
+    },
+    {
+      "epoch": 0.19584231208011155,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011471182502922664,
+      "loss": 2.3781,
+      "step": 1545
+    },
+    {
+      "epoch": 0.19596907085815693,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011470243530879152,
+      "loss": 2.1917,
+      "step": 1546
+    },
+    {
+      "epoch": 0.1960958296362023,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011469303764451738,
+      "loss": 2.7729,
+      "step": 1547
+    },
+    {
+      "epoch": 0.1962225884142477,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011468363203776895,
+      "loss": 1.8288,
+      "step": 1548
+    },
+    {
+      "epoch": 0.19634934719229308,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011467421848991207,
+      "loss": 2.2218,
+      "step": 1549
+    },
+    {
+      "epoch": 0.19647610597033843,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011466479700231378,
+      "loss": 2.0362,
+      "step": 1550
+    },
+    {
+      "epoch": 0.19660286474838382,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011465536757634227,
+      "loss": 2.1633,
+      "step": 1551
+    },
+    {
+      "epoch": 0.1967296235264292,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011464593021336686,
+      "loss": 1.5545,
+      "step": 1552
+    },
+    {
+      "epoch": 0.19685638230447458,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.00011463648491475807,
+      "loss": 1.7115,
+      "step": 1553
+    },
+    {
+      "epoch": 0.19698314108251996,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011462703168188747,
+      "loss": 1.7945,
+      "step": 1554
+    },
+    {
+      "epoch": 0.19710989986056535,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011461757051612792,
+      "loss": 2.0922,
+      "step": 1555
+    },
+    {
+      "epoch": 0.19723665863861073,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011460810141885331,
+      "loss": 1.7561,
+      "step": 1556
+    },
+    {
+      "epoch": 0.1973634174166561,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011459862439143877,
+      "loss": 2.034,
+      "step": 1557
+    },
+    {
+      "epoch": 0.1974901761947015,
+      "grad_norm": 13.75,
+      "learning_rate": 0.00011458913943526053,
+      "loss": 1.5943,
+      "step": 1558
+    },
+    {
+      "epoch": 0.19761693497274685,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011457964655169599,
+      "loss": 2.0073,
+      "step": 1559
+    },
+    {
+      "epoch": 0.19774369375079223,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00011457014574212369,
+      "loss": 1.6193,
+      "step": 1560
+    },
+    {
+      "epoch": 0.19787045252883761,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011456063700792333,
+      "loss": 2.6768,
+      "step": 1561
+    },
+    {
+      "epoch": 0.197997211306883,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011455112035047578,
+      "loss": 2.4781,
+      "step": 1562
+    },
+    {
+      "epoch": 0.19812397008492838,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011454159577116302,
+      "loss": 1.853,
+      "step": 1563
+    },
+    {
+      "epoch": 0.19825072886297376,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011453206327136821,
+      "loss": 2.5363,
+      "step": 1564
+    },
+    {
+      "epoch": 0.19837748764101915,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011452252285247564,
+      "loss": 1.9284,
+      "step": 1565
+    },
+    {
+      "epoch": 0.19850424641906453,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011451297451587077,
+      "loss": 1.5369,
+      "step": 1566
+    },
+    {
+      "epoch": 0.1986310051971099,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011450341826294022,
+      "loss": 1.8416,
+      "step": 1567
+    },
+    {
+      "epoch": 0.19875776397515527,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001144938540950717,
+      "loss": 2.1704,
+      "step": 1568
+    },
+    {
+      "epoch": 0.19888452275320065,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011448428201365414,
+      "loss": 1.6665,
+      "step": 1569
+    },
+    {
+      "epoch": 0.19901128153124603,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011447470202007757,
+      "loss": 2.2956,
+      "step": 1570
+    },
+    {
+      "epoch": 0.19913804030929141,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001144651141157332,
+      "loss": 1.7257,
+      "step": 1571
+    },
+    {
+      "epoch": 0.1992647990873368,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011445551830201338,
+      "loss": 2.2848,
+      "step": 1572
+    },
+    {
+      "epoch": 0.19939155786538218,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001144459145803116,
+      "loss": 2.0587,
+      "step": 1573
+    },
+    {
+      "epoch": 0.19951831664342756,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001144363029520225,
+      "loss": 1.9333,
+      "step": 1574
+    },
+    {
+      "epoch": 0.19964507542147295,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011442668341854188,
+      "loss": 1.9749,
+      "step": 1575
+    },
+    {
+      "epoch": 0.19977183419951833,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011441705598126667,
+      "loss": 2.2388,
+      "step": 1576
+    },
+    {
+      "epoch": 0.19989859297756368,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011440742064159498,
+      "loss": 1.9092,
+      "step": 1577
+    },
+    {
+      "epoch": 0.20002535175560907,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011439777740092602,
+      "loss": 2.3095,
+      "step": 1578
+    },
+    {
+      "epoch": 0.20015211053365445,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011438812626066017,
+      "loss": 2.1366,
+      "step": 1579
+    },
+    {
+      "epoch": 0.20027886931169983,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011437846722219901,
+      "loss": 2.3277,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2004056280897452,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011436880028694515,
+      "loss": 1.8407,
+      "step": 1581
+    },
+    {
+      "epoch": 0.2005323868677906,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011435912545630244,
+      "loss": 2.1008,
+      "step": 1582
+    },
+    {
+      "epoch": 0.20065914564583598,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011434944273167588,
+      "loss": 2.2148,
+      "step": 1583
+    },
+    {
+      "epoch": 0.20078590442388136,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011433975211447154,
+      "loss": 1.9835,
+      "step": 1584
+    },
+    {
+      "epoch": 0.20091266320192674,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0001143300536060967,
+      "loss": 2.3527,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2010394219799721,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011432034720795979,
+      "loss": 1.6061,
+      "step": 1586
+    },
+    {
+      "epoch": 0.20116618075801748,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011431063292147033,
+      "loss": 2.3286,
+      "step": 1587
+    },
+    {
+      "epoch": 0.20129293953606286,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00011430091074803904,
+      "loss": 1.8174,
+      "step": 1588
+    },
+    {
+      "epoch": 0.20141969831410825,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011429118068907776,
+      "loss": 1.5983,
+      "step": 1589
+    },
+    {
+      "epoch": 0.20154645709215363,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011428144274599949,
+      "loss": 2.337,
+      "step": 1590
+    },
+    {
+      "epoch": 0.201673215870199,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011427169692021836,
+      "loss": 1.8694,
+      "step": 1591
+    },
+    {
+      "epoch": 0.2017999746482444,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011426194321314964,
+      "loss": 1.8436,
+      "step": 1592
+    },
+    {
+      "epoch": 0.20192673342628978,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011425218162620978,
+      "loss": 1.8827,
+      "step": 1593
+    },
+    {
+      "epoch": 0.20205349220433516,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011424241216081631,
+      "loss": 1.6589,
+      "step": 1594
+    },
+    {
+      "epoch": 0.20218025098238052,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011423263481838797,
+      "loss": 2.0369,
+      "step": 1595
+    },
+    {
+      "epoch": 0.2023070097604259,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011422284960034462,
+      "loss": 1.694,
+      "step": 1596
+    },
+    {
+      "epoch": 0.20243376853847128,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011421305650810725,
+      "loss": 1.9539,
+      "step": 1597
+    },
+    {
+      "epoch": 0.20256052731651666,
+      "grad_norm": 1.0,
+      "learning_rate": 0.000114203255543098,
+      "loss": 1.9343,
+      "step": 1598
+    },
+    {
+      "epoch": 0.20268728609456205,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011419344670674017,
+      "loss": 2.5027,
+      "step": 1599
+    },
+    {
+      "epoch": 0.20281404487260743,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011418363000045817,
+      "loss": 1.9408,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2029408036506528,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001141738054256776,
+      "loss": 2.428,
+      "step": 1601
+    },
+    {
+      "epoch": 0.2030675624286982,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011416397298382518,
+      "loss": 2.2939,
+      "step": 1602
+    },
+    {
+      "epoch": 0.20319432120674358,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011415413267632874,
+      "loss": 2.4483,
+      "step": 1603
+    },
+    {
+      "epoch": 0.20332107998478896,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011414428450461729,
+      "loss": 1.6756,
+      "step": 1604
+    },
+    {
+      "epoch": 0.20344783876283432,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011413442847012098,
+      "loss": 1.9263,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2035745975408797,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011412456457427108,
+      "loss": 1.4446,
+      "step": 1606
+    },
+    {
+      "epoch": 0.20370135631892508,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011411469281850004,
+      "loss": 1.5815,
+      "step": 1607
+    },
+    {
+      "epoch": 0.20382811509697046,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001141048132042414,
+      "loss": 2.1134,
+      "step": 1608
+    },
+    {
+      "epoch": 0.20395487387501585,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011409492573292988,
+      "loss": 1.6897,
+      "step": 1609
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00011408503040600136,
+      "loss": 2.0431,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2042083914311066,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011407512722489278,
+      "loss": 2.1309,
+      "step": 1611
+    },
+    {
+      "epoch": 0.204335150209152,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0001140652161910423,
+      "loss": 1.8774,
+      "step": 1612
+    },
+    {
+      "epoch": 0.20446190898719738,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011405529730588918,
+      "loss": 1.9765,
+      "step": 1613
+    },
+    {
+      "epoch": 0.20458866776524273,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00011404537057087386,
+      "loss": 2.7709,
+      "step": 1614
+    },
+    {
+      "epoch": 0.20471542654328811,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011403543598743785,
+      "loss": 1.9313,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2048421853213335,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011402549355702387,
+      "loss": 2.0697,
+      "step": 1616
+    },
+    {
+      "epoch": 0.20496894409937888,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011401554328107571,
+      "loss": 2.0153,
+      "step": 1617
+    },
+    {
+      "epoch": 0.20509570287742426,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011400558516103842,
+      "loss": 1.8693,
+      "step": 1618
+    },
+    {
+      "epoch": 0.20522246165546965,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.00011399561919835803,
+      "loss": 2.1455,
+      "step": 1619
+    },
+    {
+      "epoch": 0.20534922043351503,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011398564539448184,
+      "loss": 2.1132,
+      "step": 1620
+    },
+    {
+      "epoch": 0.2054759792115604,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011397566375085821,
+      "loss": 1.6704,
+      "step": 1621
+    },
+    {
+      "epoch": 0.2056027379896058,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011396567426893666,
+      "loss": 2.2182,
+      "step": 1622
+    },
+    {
+      "epoch": 0.20572949676765115,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011395567695016787,
+      "loss": 1.7067,
+      "step": 1623
+    },
+    {
+      "epoch": 0.20585625554569653,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011394567179600364,
+      "loss": 2.0757,
+      "step": 1624
+    },
+    {
+      "epoch": 0.20598301432374191,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011393565880789691,
+      "loss": 1.8869,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2061097731017873,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011392563798730175,
+      "loss": 2.1026,
+      "step": 1626
+    },
+    {
+      "epoch": 0.20623653187983268,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011391560933567339,
+      "loss": 1.4794,
+      "step": 1627
+    },
+    {
+      "epoch": 0.20636329065787806,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011390557285446814,
+      "loss": 2.0066,
+      "step": 1628
+    },
+    {
+      "epoch": 0.20649004943592345,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011389552854514354,
+      "loss": 1.9564,
+      "step": 1629
+    },
+    {
+      "epoch": 0.20661680821396883,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0001138854764091582,
+      "loss": 2.2551,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2067435669920142,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011387541644797187,
+      "loss": 2.1688,
+      "step": 1631
+    },
+    {
+      "epoch": 0.20687032577005957,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011386534866304547,
+      "loss": 2.0605,
+      "step": 1632
+    },
+    {
+      "epoch": 0.20699708454810495,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011385527305584101,
+      "loss": 1.798,
+      "step": 1633
+    },
+    {
+      "epoch": 0.20712384332615033,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00011384518962782168,
+      "loss": 1.927,
+      "step": 1634
+    },
+    {
+      "epoch": 0.2072506021041957,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011383509838045177,
+      "loss": 1.6926,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2073773608822411,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011382499931519671,
+      "loss": 1.6868,
+      "step": 1636
+    },
+    {
+      "epoch": 0.20750411966028648,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011381489243352312,
+      "loss": 2.2065,
+      "step": 1637
+    },
+    {
+      "epoch": 0.20763087843833186,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011380477773689868,
+      "loss": 2.0217,
+      "step": 1638
+    },
+    {
+      "epoch": 0.20775763721637724,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011379465522679227,
+      "loss": 2.1434,
+      "step": 1639
+    },
+    {
+      "epoch": 0.20788439599442263,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011378452490467382,
+      "loss": 1.7276,
+      "step": 1640
+    },
+    {
+      "epoch": 0.20801115477246798,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011377438677201449,
+      "loss": 1.9917,
+      "step": 1641
+    },
+    {
+      "epoch": 0.20813791355051336,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011376424083028652,
+      "loss": 1.5198,
+      "step": 1642
+    },
+    {
+      "epoch": 0.20826467232855875,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011375408708096327,
+      "loss": 1.389,
+      "step": 1643
+    },
+    {
+      "epoch": 0.20839143110660413,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.00011374392552551931,
+      "loss": 1.9101,
+      "step": 1644
+    },
+    {
+      "epoch": 0.2085181898846495,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011373375616543023,
+      "loss": 2.1235,
+      "step": 1645
+    },
+    {
+      "epoch": 0.2086449486626949,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011372357900217286,
+      "loss": 2.0348,
+      "step": 1646
+    },
+    {
+      "epoch": 0.20877170744074028,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011371339403722512,
+      "loss": 1.723,
+      "step": 1647
+    },
+    {
+      "epoch": 0.20889846621878566,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011370320127206602,
+      "loss": 2.155,
+      "step": 1648
+    },
+    {
+      "epoch": 0.20902522499683104,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011369300070817578,
+      "loss": 1.8449,
+      "step": 1649
+    },
+    {
+      "epoch": 0.2091519837748764,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011368279234703571,
+      "loss": 2.4111,
+      "step": 1650
+    },
+    {
+      "epoch": 0.20927874255292178,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011367257619012826,
+      "loss": 2.1259,
+      "step": 1651
+    },
+    {
+      "epoch": 0.20940550133096716,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.000113662352238937,
+      "loss": 2.2397,
+      "step": 1652
+    },
+    {
+      "epoch": 0.20953226010901255,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011365212049494665,
+      "loss": 2.3617,
+      "step": 1653
+    },
+    {
+      "epoch": 0.20965901888705793,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011364188095964306,
+      "loss": 2.0238,
+      "step": 1654
+    },
+    {
+      "epoch": 0.2097857776651033,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011363163363451321,
+      "loss": 2.0522,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2099125364431487,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0001136213785210452,
+      "loss": 2.2515,
+      "step": 1656
+    },
+    {
+      "epoch": 0.21003929522119408,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011361111562072825,
+      "loss": 2.1909,
+      "step": 1657
+    },
+    {
+      "epoch": 0.21016605399923946,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00011360084493505276,
+      "loss": 2.1522,
+      "step": 1658
+    },
+    {
+      "epoch": 0.21029281277728482,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0001135905664655102,
+      "loss": 2.0019,
+      "step": 1659
+    },
+    {
+      "epoch": 0.2104195715553302,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011358028021359325,
+      "loss": 2.1471,
+      "step": 1660
+    },
+    {
+      "epoch": 0.21054633033337558,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0001135699861807956,
+      "loss": 2.2812,
+      "step": 1661
+    },
+    {
+      "epoch": 0.21067308911142096,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011355968436861222,
+      "loss": 2.0823,
+      "step": 1662
+    },
+    {
+      "epoch": 0.21079984788946635,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00011354937477853906,
+      "loss": 1.8609,
+      "step": 1663
+    },
+    {
+      "epoch": 0.21092660666751173,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001135390574120733,
+      "loss": 1.9208,
+      "step": 1664
+    },
+    {
+      "epoch": 0.2110533654455571,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011352873227071321,
+      "loss": 1.7732,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2111801242236025,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011351839935595823,
+      "loss": 1.785,
+      "step": 1666
+    },
+    {
+      "epoch": 0.21130688300164788,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011350805866930884,
+      "loss": 1.8062,
+      "step": 1667
+    },
+    {
+      "epoch": 0.21143364177969323,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011349771021226676,
+      "loss": 1.9102,
+      "step": 1668
+    },
+    {
+      "epoch": 0.21156040055773861,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011348735398633474,
+      "loss": 1.7619,
+      "step": 1669
+    },
+    {
+      "epoch": 0.211687159335784,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011347698999301675,
+      "loss": 2.0346,
+      "step": 1670
+    },
+    {
+      "epoch": 0.21181391811382938,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011346661823381778,
+      "loss": 2.471,
+      "step": 1671
+    },
+    {
+      "epoch": 0.21194067689187476,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011345623871024406,
+      "loss": 1.9056,
+      "step": 1672
+    },
+    {
+      "epoch": 0.21206743566992015,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011344585142380286,
+      "loss": 2.5207,
+      "step": 1673
+    },
+    {
+      "epoch": 0.21219419444796553,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011343545637600264,
+      "loss": 1.7837,
+      "step": 1674
+    },
+    {
+      "epoch": 0.2123209532260109,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011342505356835294,
+      "loss": 2.2347,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2124477120040563,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011341464300236444,
+      "loss": 2.0306,
+      "step": 1676
+    },
+    {
+      "epoch": 0.21257447078210165,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011340422467954899,
+      "loss": 1.9878,
+      "step": 1677
+    },
+    {
+      "epoch": 0.21270122956014703,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001133937986014195,
+      "loss": 2.069,
+      "step": 1678
+    },
+    {
+      "epoch": 0.21282798833819241,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011338336476949003,
+      "loss": 2.3773,
+      "step": 1679
+    },
+    {
+      "epoch": 0.2129547471162378,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011337292318527578,
+      "loss": 1.6187,
+      "step": 1680
+    },
+    {
+      "epoch": 0.21308150589428318,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011336247385029309,
+      "loss": 2.2853,
+      "step": 1681
+    },
+    {
+      "epoch": 0.21320826467232856,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011335201676605939,
+      "loss": 1.738,
+      "step": 1682
+    },
+    {
+      "epoch": 0.21333502345037395,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00011334155193409322,
+      "loss": 1.663,
+      "step": 1683
+    },
+    {
+      "epoch": 0.21346178222841933,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0001133310793559143,
+      "loss": 1.7941,
+      "step": 1684
+    },
+    {
+      "epoch": 0.2135885410064647,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011332059903304346,
+      "loss": 2.1791,
+      "step": 1685
+    },
+    {
+      "epoch": 0.21371529978451007,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0001133101109670026,
+      "loss": 1.8736,
+      "step": 1686
+    },
+    {
+      "epoch": 0.21384205856255545,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00011329961515931483,
+      "loss": 2.4165,
+      "step": 1687
+    },
+    {
+      "epoch": 0.21396881734060083,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011328911161150433,
+      "loss": 1.7651,
+      "step": 1688
+    },
+    {
+      "epoch": 0.2140955761186462,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011327860032509642,
+      "loss": 2.1969,
+      "step": 1689
+    },
+    {
+      "epoch": 0.2142223348966916,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011326808130161752,
+      "loss": 1.6884,
+      "step": 1690
+    },
+    {
+      "epoch": 0.21434909367473698,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011325755454259522,
+      "loss": 1.9444,
+      "step": 1691
+    },
+    {
+      "epoch": 0.21447585245278236,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0001132470200495582,
+      "loss": 1.7434,
+      "step": 1692
+    },
+    {
+      "epoch": 0.21460261123082774,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011323647782403624,
+      "loss": 2.0652,
+      "step": 1693
+    },
+    {
+      "epoch": 0.21472937000887313,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011322592786756029,
+      "loss": 2.2021,
+      "step": 1694
+    },
+    {
+      "epoch": 0.21485612878691848,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011321537018166243,
+      "loss": 2.3423,
+      "step": 1695
+    },
+    {
+      "epoch": 0.21498288756496386,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011320480476787582,
+      "loss": 1.959,
+      "step": 1696
+    },
+    {
+      "epoch": 0.21510964634300925,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011319423162773475,
+      "loss": 2.2949,
+      "step": 1697
+    },
+    {
+      "epoch": 0.21523640512105463,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011318365076277463,
+      "loss": 2.2568,
+      "step": 1698
+    },
+    {
+      "epoch": 0.2153631638991,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011317306217453204,
+      "loss": 1.9458,
+      "step": 1699
+    },
+    {
+      "epoch": 0.2154899226771454,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011316246586454464,
+      "loss": 2.1855,
+      "step": 1700
+    },
+    {
+      "epoch": 0.21561668145519078,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0001131518618343512,
+      "loss": 2.1287,
+      "step": 1701
+    },
+    {
+      "epoch": 0.21574344023323616,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011314125008549163,
+      "loss": 1.9651,
+      "step": 1702
+    },
+    {
+      "epoch": 0.21587019901128154,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011313063061950695,
+      "loss": 2.084,
+      "step": 1703
+    },
+    {
+      "epoch": 0.2159969577893269,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011312000343793935,
+      "loss": 2.3129,
+      "step": 1704
+    },
+    {
+      "epoch": 0.21612371656737228,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011310936854233206,
+      "loss": 1.8155,
+      "step": 1705
+    },
+    {
+      "epoch": 0.21625047534541766,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00011309872593422948,
+      "loss": 2.1351,
+      "step": 1706
+    },
+    {
+      "epoch": 0.21637723412346305,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011308807561517712,
+      "loss": 1.6095,
+      "step": 1707
+    },
+    {
+      "epoch": 0.21650399290150843,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011307741758672162,
+      "loss": 1.5984,
+      "step": 1708
+    },
+    {
+      "epoch": 0.2166307516795538,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011306675185041071,
+      "loss": 1.7248,
+      "step": 1709
+    },
+    {
+      "epoch": 0.2167575104575992,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00011305607840779326,
+      "loss": 1.7754,
+      "step": 1710
+    },
+    {
+      "epoch": 0.21688426923564458,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001130453972604193,
+      "loss": 1.8595,
+      "step": 1711
+    },
+    {
+      "epoch": 0.21701102801368996,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011303470840983989,
+      "loss": 1.9527,
+      "step": 1712
+    },
+    {
+      "epoch": 0.21713778679173532,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011302401185760727,
+      "loss": 1.8134,
+      "step": 1713
+    },
+    {
+      "epoch": 0.2172645455697807,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011301330760527477,
+      "loss": 1.413,
+      "step": 1714
+    },
+    {
+      "epoch": 0.21739130434782608,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011300259565439689,
+      "loss": 1.6031,
+      "step": 1715
+    },
+    {
+      "epoch": 0.21751806312587146,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00011299187600652916,
+      "loss": 1.8899,
+      "step": 1716
+    },
+    {
+      "epoch": 0.21764482190391685,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011298114866322833,
+      "loss": 2.4563,
+      "step": 1717
+    },
+    {
+      "epoch": 0.21777158068196223,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011297041362605218,
+      "loss": 2.176,
+      "step": 1718
+    },
+    {
+      "epoch": 0.2178983394600076,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011295967089655963,
+      "loss": 1.972,
+      "step": 1719
+    },
+    {
+      "epoch": 0.218025098238053,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011294892047631078,
+      "loss": 1.934,
+      "step": 1720
+    },
+    {
+      "epoch": 0.21815185701609838,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011293816236686675,
+      "loss": 2.1529,
+      "step": 1721
+    },
+    {
+      "epoch": 0.21827861579414373,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011292739656978984,
+      "loss": 1.9657,
+      "step": 1722
+    },
+    {
+      "epoch": 0.21840537457218911,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011291662308664347,
+      "loss": 1.852,
+      "step": 1723
+    },
+    {
+      "epoch": 0.2185321333502345,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011290584191899211,
+      "loss": 1.8048,
+      "step": 1724
+    },
+    {
+      "epoch": 0.21865889212827988,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011289505306840142,
+      "loss": 2.0214,
+      "step": 1725
+    },
+    {
+      "epoch": 0.21878565090632526,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011288425653643815,
+      "loss": 2.2263,
+      "step": 1726
+    },
+    {
+      "epoch": 0.21891240968437065,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011287345232467017,
+      "loss": 2.1432,
+      "step": 1727
+    },
+    {
+      "epoch": 0.21903916846241603,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011286264043466643,
+      "loss": 2.4881,
+      "step": 1728
+    },
+    {
+      "epoch": 0.2191659272404614,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011285182086799705,
+      "loss": 2.3732,
+      "step": 1729
+    },
+    {
+      "epoch": 0.2192926860185068,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011284099362623322,
+      "loss": 1.6829,
+      "step": 1730
+    },
+    {
+      "epoch": 0.21941944479655215,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00011283015871094727,
+      "loss": 1.1974,
+      "step": 1731
+    },
+    {
+      "epoch": 0.21954620357459753,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011281931612371263,
+      "loss": 1.457,
+      "step": 1732
+    },
+    {
+      "epoch": 0.21967296235264291,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011280846586610387,
+      "loss": 2.1439,
+      "step": 1733
+    },
+    {
+      "epoch": 0.2197997211306883,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011279760793969663,
+      "loss": 1.6494,
+      "step": 1734
+    },
+    {
+      "epoch": 0.21992647990873368,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0001127867423460677,
+      "loss": 1.8797,
+      "step": 1735
+    },
+    {
+      "epoch": 0.22005323868677906,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011277586908679498,
+      "loss": 2.0296,
+      "step": 1736
+    },
+    {
+      "epoch": 0.22017999746482445,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011276498816345745,
+      "loss": 2.1084,
+      "step": 1737
+    },
+    {
+      "epoch": 0.22030675624286983,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011275409957763525,
+      "loss": 1.8608,
+      "step": 1738
+    },
+    {
+      "epoch": 0.2204335150209152,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0001127432033309096,
+      "loss": 2.0492,
+      "step": 1739
+    },
+    {
+      "epoch": 0.22056027379896057,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011273229942486287,
+      "loss": 1.9129,
+      "step": 1740
+    },
+    {
+      "epoch": 0.22068703257700595,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011272138786107848,
+      "loss": 2.1094,
+      "step": 1741
+    },
+    {
+      "epoch": 0.22081379135505133,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011271046864114101,
+      "loss": 2.0271,
+      "step": 1742
+    },
+    {
+      "epoch": 0.2209405501330967,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011269954176663614,
+      "loss": 1.8049,
+      "step": 1743
+    },
+    {
+      "epoch": 0.2210673089111421,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011268860723915066,
+      "loss": 1.8328,
+      "step": 1744
+    },
+    {
+      "epoch": 0.22119406768918748,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0001126776650602725,
+      "loss": 2.2105,
+      "step": 1745
+    },
+    {
+      "epoch": 0.22132082646723286,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011266671523159062,
+      "loss": 2.2894,
+      "step": 1746
+    },
+    {
+      "epoch": 0.22144758524527824,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011265575775469518,
+      "loss": 1.9724,
+      "step": 1747
+    },
+    {
+      "epoch": 0.22157434402332363,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0001126447926311774,
+      "loss": 2.0877,
+      "step": 1748
+    },
+    {
+      "epoch": 0.22170110280136898,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011263381986262965,
+      "loss": 1.4707,
+      "step": 1749
+    },
+    {
+      "epoch": 0.22182786157941436,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011262283945064534,
+      "loss": 2.0119,
+      "step": 1750
+    },
+    {
+      "epoch": 0.22195462035745975,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011261185139681909,
+      "loss": 2.0175,
+      "step": 1751
+    },
+    {
+      "epoch": 0.22208137913550513,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011260085570274655,
+      "loss": 1.8443,
+      "step": 1752
+    },
+    {
+      "epoch": 0.2222081379135505,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0001125898523700245,
+      "loss": 1.6954,
+      "step": 1753
+    },
+    {
+      "epoch": 0.2223348966915959,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011257884140025083,
+      "loss": 2.309,
+      "step": 1754
+    },
+    {
+      "epoch": 0.22246165546964128,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011256782279502456,
+      "loss": 2.0581,
+      "step": 1755
+    },
+    {
+      "epoch": 0.22258841424768666,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011255679655594578,
+      "loss": 1.6548,
+      "step": 1756
+    },
+    {
+      "epoch": 0.22271517302573204,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011254576268461574,
+      "loss": 1.8822,
+      "step": 1757
+    },
+    {
+      "epoch": 0.2228419318037774,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011253472118263674,
+      "loss": 1.6084,
+      "step": 1758
+    },
+    {
+      "epoch": 0.22296869058182278,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011252367205161224,
+      "loss": 2.2172,
+      "step": 1759
+    },
+    {
+      "epoch": 0.22309544935986816,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011251261529314678,
+      "loss": 2.0781,
+      "step": 1760
+    },
+    {
+      "epoch": 0.22322220813791355,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011250155090884601,
+      "loss": 1.52,
+      "step": 1761
+    },
+    {
+      "epoch": 0.22334896691595893,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011249047890031668,
+      "loss": 2.3089,
+      "step": 1762
+    },
+    {
+      "epoch": 0.2234757256940043,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011247939926916666,
+      "loss": 2.0268,
+      "step": 1763
+    },
+    {
+      "epoch": 0.2236024844720497,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011246831201700493,
+      "loss": 2.0295,
+      "step": 1764
+    },
+    {
+      "epoch": 0.22372924325009508,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011245721714544159,
+      "loss": 1.8774,
+      "step": 1765
+    },
+    {
+      "epoch": 0.22385600202814046,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011244611465608779,
+      "loss": 1.9199,
+      "step": 1766
+    },
+    {
+      "epoch": 0.22398276080618582,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011243500455055586,
+      "loss": 1.6384,
+      "step": 1767
+    },
+    {
+      "epoch": 0.2241095195842312,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011242388683045916,
+      "loss": 2.2951,
+      "step": 1768
+    },
+    {
+      "epoch": 0.22423627836227658,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011241276149741223,
+      "loss": 2.0019,
+      "step": 1769
+    },
+    {
+      "epoch": 0.22436303714032196,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011240162855303065,
+      "loss": 1.5862,
+      "step": 1770
+    },
+    {
+      "epoch": 0.22448979591836735,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011239048799893118,
+      "loss": 2.2524,
+      "step": 1771
+    },
+    {
+      "epoch": 0.22461655469641273,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011237933983673161,
+      "loss": 2.118,
+      "step": 1772
+    },
+    {
+      "epoch": 0.2247433134744581,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011236818406805086,
+      "loss": 2.0593,
+      "step": 1773
+    },
+    {
+      "epoch": 0.2248700722525035,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011235702069450897,
+      "loss": 2.1958,
+      "step": 1774
+    },
+    {
+      "epoch": 0.22499683103054888,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011234584971772708,
+      "loss": 1.7664,
+      "step": 1775
+    },
+    {
+      "epoch": 0.22512358980859423,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011233467113932743,
+      "loss": 1.7337,
+      "step": 1776
+    },
+    {
+      "epoch": 0.22525034858663961,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011232348496093338,
+      "loss": 1.9444,
+      "step": 1777
+    },
+    {
+      "epoch": 0.225377107364685,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011231229118416934,
+      "loss": 1.7405,
+      "step": 1778
+    },
+    {
+      "epoch": 0.22550386614273038,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011230108981066086,
+      "loss": 1.5865,
+      "step": 1779
+    },
+    {
+      "epoch": 0.22563062492077576,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011228988084203463,
+      "loss": 1.9092,
+      "step": 1780
+    },
+    {
+      "epoch": 0.22575738369882115,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001122786642799184,
+      "loss": 2.2845,
+      "step": 1781
+    },
+    {
+      "epoch": 0.22588414247686653,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011226744012594098,
+      "loss": 1.8124,
+      "step": 1782
+    },
+    {
+      "epoch": 0.2260109012549119,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011225620838173239,
+      "loss": 1.6382,
+      "step": 1783
+    },
+    {
+      "epoch": 0.2261376600329573,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011224496904892367,
+      "loss": 1.8672,
+      "step": 1784
+    },
+    {
+      "epoch": 0.22626441881100265,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.000112233722129147,
+      "loss": 2.5175,
+      "step": 1785
+    },
+    {
+      "epoch": 0.22639117758904803,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011222246762403561,
+      "loss": 1.8184,
+      "step": 1786
+    },
+    {
+      "epoch": 0.22651793636709341,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0001122112055352239,
+      "loss": 1.4041,
+      "step": 1787
+    },
+    {
+      "epoch": 0.2266446951451388,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011219993586434735,
+      "loss": 1.6064,
+      "step": 1788
+    },
+    {
+      "epoch": 0.22677145392318418,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00011218865861304251,
+      "loss": 2.008,
+      "step": 1789
+    },
+    {
+      "epoch": 0.22689821270122956,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011217737378294707,
+      "loss": 1.6627,
+      "step": 1790
+    },
+    {
+      "epoch": 0.22702497147927495,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011216608137569978,
+      "loss": 2.0856,
+      "step": 1791
+    },
+    {
+      "epoch": 0.22715173025732033,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011215478139294054,
+      "loss": 1.4823,
+      "step": 1792
+    },
+    {
+      "epoch": 0.2272784890353657,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011214347383631029,
+      "loss": 1.8086,
+      "step": 1793
+    },
+    {
+      "epoch": 0.22740524781341107,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011213215870745113,
+      "loss": 1.7201,
+      "step": 1794
+    },
+    {
+      "epoch": 0.22753200659145645,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011212083600800622,
+      "loss": 1.9477,
+      "step": 1795
+    },
+    {
+      "epoch": 0.22765876536950183,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011210950573961985,
+      "loss": 2.072,
+      "step": 1796
+    },
+    {
+      "epoch": 0.2277855241475472,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011209816790393737,
+      "loss": 1.9112,
+      "step": 1797
+    },
+    {
+      "epoch": 0.2279122829255926,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011208682250260526,
+      "loss": 2.2568,
+      "step": 1798
+    },
+    {
+      "epoch": 0.22803904170363798,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001120754695372711,
+      "loss": 2.1276,
+      "step": 1799
+    },
+    {
+      "epoch": 0.22816580048168336,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011206410900958355,
+      "loss": 1.9848,
+      "step": 1800
+    },
+    {
+      "epoch": 0.22829255925972874,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011205274092119235,
+      "loss": 2.4462,
+      "step": 1801
+    },
+    {
+      "epoch": 0.22841931803777413,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011204136527374839,
+      "loss": 1.9101,
+      "step": 1802
+    },
+    {
+      "epoch": 0.22854607681581948,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011202998206890364,
+      "loss": 1.7892,
+      "step": 1803
+    },
+    {
+      "epoch": 0.22867283559386486,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011201859130831114,
+      "loss": 2.0911,
+      "step": 1804
+    },
+    {
+      "epoch": 0.22879959437191025,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011200719299362506,
+      "loss": 2.2664,
+      "step": 1805
+    },
+    {
+      "epoch": 0.22892635314995563,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011199578712650062,
+      "loss": 2.2006,
+      "step": 1806
+    },
+    {
+      "epoch": 0.229053111928001,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011198437370859421,
+      "loss": 1.9552,
+      "step": 1807
+    },
+    {
+      "epoch": 0.2291798707060464,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011197295274156326,
+      "loss": 1.9655,
+      "step": 1808
+    },
+    {
+      "epoch": 0.22930662948409178,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011196152422706633,
+      "loss": 2.3358,
+      "step": 1809
+    },
+    {
+      "epoch": 0.22943338826213716,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011195008816676302,
+      "loss": 2.0809,
+      "step": 1810
+    },
+    {
+      "epoch": 0.22956014704018254,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011193864456231411,
+      "loss": 1.9587,
+      "step": 1811
+    },
+    {
+      "epoch": 0.2296869058182279,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011192719341538138,
+      "loss": 2.4708,
+      "step": 1812
+    },
+    {
+      "epoch": 0.22981366459627328,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001119157347276278,
+      "loss": 2.2584,
+      "step": 1813
+    },
+    {
+      "epoch": 0.22994042337431866,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00011190426850071738,
+      "loss": 1.6815,
+      "step": 1814
+    },
+    {
+      "epoch": 0.23006718215236405,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00011189279473631521,
+      "loss": 1.2917,
+      "step": 1815
+    },
+    {
+      "epoch": 0.23019394093040943,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011188131343608753,
+      "loss": 1.8444,
+      "step": 1816
+    },
+    {
+      "epoch": 0.2303206997084548,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011186982460170164,
+      "loss": 2.0416,
+      "step": 1817
+    },
+    {
+      "epoch": 0.2304474584865002,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011185832823482593,
+      "loss": 2.1699,
+      "step": 1818
+    },
+    {
+      "epoch": 0.23057421726454558,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011184682433712987,
+      "loss": 2.0034,
+      "step": 1819
+    },
+    {
+      "epoch": 0.23070097604259096,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0001118353129102841,
+      "loss": 1.6573,
+      "step": 1820
+    },
+    {
+      "epoch": 0.23082773482063632,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011182379395596025,
+      "loss": 2.3483,
+      "step": 1821
+    },
+    {
+      "epoch": 0.2309544935986817,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011181226747583111,
+      "loss": 1.7456,
+      "step": 1822
+    },
+    {
+      "epoch": 0.23108125237672708,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011180073347157054,
+      "loss": 2.1377,
+      "step": 1823
+    },
+    {
+      "epoch": 0.23120801115477246,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011178919194485352,
+      "loss": 1.8963,
+      "step": 1824
+    },
+    {
+      "epoch": 0.23133476993281785,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011177764289735608,
+      "loss": 2.2232,
+      "step": 1825
+    },
+    {
+      "epoch": 0.23146152871086323,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00011176608633075536,
+      "loss": 2.1092,
+      "step": 1826
+    },
+    {
+      "epoch": 0.2315882874889086,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011175452224672961,
+      "loss": 2.3346,
+      "step": 1827
+    },
+    {
+      "epoch": 0.231715046266954,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011174295064695814,
+      "loss": 1.6763,
+      "step": 1828
+    },
+    {
+      "epoch": 0.23184180504499938,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011173137153312137,
+      "loss": 1.6134,
+      "step": 1829
+    },
+    {
+      "epoch": 0.23196856382304473,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011171978490690082,
+      "loss": 1.9884,
+      "step": 1830
+    },
+    {
+      "epoch": 0.23209532260109011,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011170819076997907,
+      "loss": 2.1096,
+      "step": 1831
+    },
+    {
+      "epoch": 0.2322220813791355,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011169658912403984,
+      "loss": 1.6547,
+      "step": 1832
+    },
+    {
+      "epoch": 0.23234884015718088,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011168497997076789,
+      "loss": 1.6564,
+      "step": 1833
+    },
+    {
+      "epoch": 0.23247559893522626,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001116733633118491,
+      "loss": 2.0879,
+      "step": 1834
+    },
+    {
+      "epoch": 0.23260235771327165,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001116617391489704,
+      "loss": 2.3098,
+      "step": 1835
+    },
+    {
+      "epoch": 0.23272911649131703,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011165010748381988,
+      "loss": 2.2079,
+      "step": 1836
+    },
+    {
+      "epoch": 0.2328558752693624,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011163846831808667,
+      "loss": 1.9217,
+      "step": 1837
+    },
+    {
+      "epoch": 0.2329826340474078,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011162682165346099,
+      "loss": 2.422,
+      "step": 1838
+    },
+    {
+      "epoch": 0.23310939282545315,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00011161516749163416,
+      "loss": 1.949,
+      "step": 1839
+    },
+    {
+      "epoch": 0.23323615160349853,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0001116035058342986,
+      "loss": 1.7733,
+      "step": 1840
+    },
+    {
+      "epoch": 0.23336291038154391,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001115918366831478,
+      "loss": 1.9932,
+      "step": 1841
+    },
+    {
+      "epoch": 0.2334896691595893,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011158016003987632,
+      "loss": 1.722,
+      "step": 1842
+    },
+    {
+      "epoch": 0.23361642793763468,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011156847590617985,
+      "loss": 1.8859,
+      "step": 1843
+    },
+    {
+      "epoch": 0.23374318671568006,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00011155678428375517,
+      "loss": 1.756,
+      "step": 1844
+    },
+    {
+      "epoch": 0.23386994549372545,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001115450851743001,
+      "loss": 1.9091,
+      "step": 1845
+    },
+    {
+      "epoch": 0.23399670427177083,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011153337857951357,
+      "loss": 2.2108,
+      "step": 1846
+    },
+    {
+      "epoch": 0.2341234630498162,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011152166450109562,
+      "loss": 1.6843,
+      "step": 1847
+    },
+    {
+      "epoch": 0.23425022182786157,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00011150994294074737,
+      "loss": 2.2695,
+      "step": 1848
+    },
+    {
+      "epoch": 0.23437698060590695,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011149821390017098,
+      "loss": 2.1799,
+      "step": 1849
+    },
+    {
+      "epoch": 0.23450373938395233,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011148647738106973,
+      "loss": 2.1236,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2346304981619977,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011147473338514804,
+      "loss": 2.1795,
+      "step": 1851
+    },
+    {
+      "epoch": 0.2347572569400431,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0001114629819141113,
+      "loss": 2.0387,
+      "step": 1852
+    },
+    {
+      "epoch": 0.23488401571808848,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0001114512229696661,
+      "loss": 1.5641,
+      "step": 1853
+    },
+    {
+      "epoch": 0.23501077449613386,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011143945655352001,
+      "loss": 2.0649,
+      "step": 1854
+    },
+    {
+      "epoch": 0.23513753327417924,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011142768266738177,
+      "loss": 1.3388,
+      "step": 1855
+    },
+    {
+      "epoch": 0.23526429205222463,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011141590131296119,
+      "loss": 2.2702,
+      "step": 1856
+    },
+    {
+      "epoch": 0.23539105083027,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011140411249196911,
+      "loss": 1.8902,
+      "step": 1857
+    },
+    {
+      "epoch": 0.23551780960831536,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.00011139231620611752,
+      "loss": 1.7341,
+      "step": 1858
+    },
+    {
+      "epoch": 0.23564456838636075,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011138051245711944,
+      "loss": 2.2396,
+      "step": 1859
+    },
+    {
+      "epoch": 0.23577132716440613,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011136870124668903,
+      "loss": 1.6936,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2358980859424515,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011135688257654148,
+      "loss": 2.4677,
+      "step": 1861
+    },
+    {
+      "epoch": 0.2360248447204969,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001113450564483931,
+      "loss": 2.1463,
+      "step": 1862
+    },
+    {
+      "epoch": 0.23615160349854228,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011133322286396127,
+      "loss": 1.9601,
+      "step": 1863
+    },
+    {
+      "epoch": 0.23627836227658766,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011132138182496445,
+      "loss": 1.9763,
+      "step": 1864
+    },
+    {
+      "epoch": 0.23640512105463304,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011130953333312217,
+      "loss": 1.8936,
+      "step": 1865
+    },
+    {
+      "epoch": 0.23653187983267843,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00011129767739015509,
+      "loss": 1.906,
+      "step": 1866
+    },
+    {
+      "epoch": 0.23665863861072378,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0001112858139977849,
+      "loss": 2.3812,
+      "step": 1867
+    },
+    {
+      "epoch": 0.23678539738876916,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011127394315773438,
+      "loss": 1.6732,
+      "step": 1868
+    },
+    {
+      "epoch": 0.23691215616681455,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00011126206487172741,
+      "loss": 2.1126,
+      "step": 1869
+    },
+    {
+      "epoch": 0.23703891494485993,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011125017914148895,
+      "loss": 2.0343,
+      "step": 1870
+    },
+    {
+      "epoch": 0.2371656737229053,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.00011123828596874504,
+      "loss": 2.0351,
+      "step": 1871
+    },
+    {
+      "epoch": 0.2372924325009507,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0001112263853552228,
+      "loss": 2.495,
+      "step": 1872
+    },
+    {
+      "epoch": 0.23741919127899608,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011121447730265039,
+      "loss": 2.3829,
+      "step": 1873
+    },
+    {
+      "epoch": 0.23754595005704146,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011120256181275715,
+      "loss": 1.9801,
+      "step": 1874
+    },
+    {
+      "epoch": 0.23767270883508684,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011119063888727336,
+      "loss": 2.0665,
+      "step": 1875
+    },
+    {
+      "epoch": 0.2377994676131322,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011117870852793051,
+      "loss": 2.025,
+      "step": 1876
+    },
+    {
+      "epoch": 0.23792622639117758,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011116677073646113,
+      "loss": 1.9129,
+      "step": 1877
+    },
+    {
+      "epoch": 0.23805298516922296,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011115482551459876,
+      "loss": 2.3149,
+      "step": 1878
+    },
+    {
+      "epoch": 0.23817974394726835,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011114287286407811,
+      "loss": 1.7753,
+      "step": 1879
+    },
+    {
+      "epoch": 0.23830650272531373,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011113091278663492,
+      "loss": 1.9654,
+      "step": 1880
+    },
+    {
+      "epoch": 0.2384332615033591,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011111894528400603,
+      "loss": 2.5581,
+      "step": 1881
+    },
+    {
+      "epoch": 0.2385600202814045,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00011110697035792936,
+      "loss": 2.056,
+      "step": 1882
+    },
+    {
+      "epoch": 0.23868677905944988,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011109498801014388,
+      "loss": 1.9327,
+      "step": 1883
+    },
+    {
+      "epoch": 0.23881353783749526,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011108299824238967,
+      "loss": 2.0871,
+      "step": 1884
+    },
+    {
+      "epoch": 0.23894029661554061,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011107100105640786,
+      "loss": 1.9444,
+      "step": 1885
+    },
+    {
+      "epoch": 0.239067055393586,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011105899645394066,
+      "loss": 1.6214,
+      "step": 1886
+    },
+    {
+      "epoch": 0.23919381417163138,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0001110469844367314,
+      "loss": 1.7278,
+      "step": 1887
+    },
+    {
+      "epoch": 0.23932057294967676,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011103496500652444,
+      "loss": 2.4886,
+      "step": 1888
+    },
+    {
+      "epoch": 0.23944733172772215,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011102293816506523,
+      "loss": 1.8917,
+      "step": 1889
+    },
+    {
+      "epoch": 0.23957409050576753,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011101090391410028,
+      "loss": 1.7682,
+      "step": 1890
+    },
+    {
+      "epoch": 0.2397008492838129,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00011099886225537723,
+      "loss": 1.8318,
+      "step": 1891
+    },
+    {
+      "epoch": 0.2398276080618583,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011098681319064473,
+      "loss": 1.8688,
+      "step": 1892
+    },
+    {
+      "epoch": 0.23995436683990368,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011097475672165254,
+      "loss": 1.766,
+      "step": 1893
+    },
+    {
+      "epoch": 0.24008112561794903,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011096269285015149,
+      "loss": 2.0014,
+      "step": 1894
+    },
+    {
+      "epoch": 0.24020788439599441,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001109506215778935,
+      "loss": 2.0121,
+      "step": 1895
+    },
+    {
+      "epoch": 0.2403346431740398,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011093854290663152,
+      "loss": 2.0996,
+      "step": 1896
+    },
+    {
+      "epoch": 0.24046140195208518,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011092645683811964,
+      "loss": 1.8513,
+      "step": 1897
+    },
+    {
+      "epoch": 0.24058816073013056,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011091436337411296,
+      "loss": 2.5915,
+      "step": 1898
+    },
+    {
+      "epoch": 0.24071491950817595,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011090226251636768,
+      "loss": 2.4251,
+      "step": 1899
+    },
+    {
+      "epoch": 0.24084167828622133,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0001108901542666411,
+      "loss": 1.9666,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2409684370642667,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.00011087803862669156,
+      "loss": 2.2832,
+      "step": 1901
+    },
+    {
+      "epoch": 0.2410951958423121,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00011086591559827847,
+      "loss": 2.1511,
+      "step": 1902
+    },
+    {
+      "epoch": 0.24122195462035745,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011085378518316236,
+      "loss": 2.0961,
+      "step": 1903
+    },
+    {
+      "epoch": 0.24134871339840283,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011084164738310474,
+      "loss": 2.2358,
+      "step": 1904
+    },
+    {
+      "epoch": 0.2414754721764482,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011082950219986828,
+      "loss": 1.6418,
+      "step": 1905
+    },
+    {
+      "epoch": 0.2416022309544936,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011081734963521673,
+      "loss": 1.4844,
+      "step": 1906
+    },
+    {
+      "epoch": 0.24172898973253898,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011080518969091481,
+      "loss": 2.1625,
+      "step": 1907
+    },
+    {
+      "epoch": 0.24185574851058436,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011079302236872842,
+      "loss": 2.0725,
+      "step": 1908
+    },
+    {
+      "epoch": 0.24198250728862974,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00011078084767042447,
+      "loss": 2.3077,
+      "step": 1909
+    },
+    {
+      "epoch": 0.24210926606667513,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011076866559777096,
+      "loss": 1.6039,
+      "step": 1910
+    },
+    {
+      "epoch": 0.2422360248447205,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00011075647615253696,
+      "loss": 1.7217,
+      "step": 1911
+    },
+    {
+      "epoch": 0.24236278362276586,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011074427933649261,
+      "loss": 1.8374,
+      "step": 1912
+    },
+    {
+      "epoch": 0.24248954240081125,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011073207515140915,
+      "loss": 1.5506,
+      "step": 1913
+    },
+    {
+      "epoch": 0.24261630117885663,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011071986359905881,
+      "loss": 2.2718,
+      "step": 1914
+    },
+    {
+      "epoch": 0.242743059956902,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011070764468121498,
+      "loss": 1.898,
+      "step": 1915
+    },
+    {
+      "epoch": 0.2428698187349474,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011069541839965205,
+      "loss": 1.7358,
+      "step": 1916
+    },
+    {
+      "epoch": 0.24299657751299278,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00011068318475614553,
+      "loss": 1.8377,
+      "step": 1917
+    },
+    {
+      "epoch": 0.24312333629103816,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011067094375247199,
+      "loss": 2.1218,
+      "step": 1918
+    },
+    {
+      "epoch": 0.24325009506908354,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011065869539040902,
+      "loss": 1.8304,
+      "step": 1919
+    },
+    {
+      "epoch": 0.24337685384712893,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011064643967173536,
+      "loss": 2.0517,
+      "step": 1920
+    },
+    {
+      "epoch": 0.24350361262517428,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011063417659823075,
+      "loss": 1.7884,
+      "step": 1921
+    },
+    {
+      "epoch": 0.24363037140321966,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011062190617167602,
+      "loss": 2.3266,
+      "step": 1922
+    },
+    {
+      "epoch": 0.24375713018126505,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00011060962839385306,
+      "loss": 2.4072,
+      "step": 1923
+    },
+    {
+      "epoch": 0.24388388895931043,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011059734326654488,
+      "loss": 1.6475,
+      "step": 1924
+    },
+    {
+      "epoch": 0.2440106477373558,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011058505079153546,
+      "loss": 1.6104,
+      "step": 1925
+    },
+    {
+      "epoch": 0.2441374065154012,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011057275097060997,
+      "loss": 1.9628,
+      "step": 1926
+    },
+    {
+      "epoch": 0.24426416529344658,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001105604438055545,
+      "loss": 1.8682,
+      "step": 1927
+    },
+    {
+      "epoch": 0.24439092407149196,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011054812929815636,
+      "loss": 2.0616,
+      "step": 1928
+    },
+    {
+      "epoch": 0.24451768284953734,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00011053580745020381,
+      "loss": 2.1254,
+      "step": 1929
+    },
+    {
+      "epoch": 0.2446444416275827,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011052347826348621,
+      "loss": 2.4461,
+      "step": 1930
+    },
+    {
+      "epoch": 0.24477120040562808,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011051114173979403,
+      "loss": 2.045,
+      "step": 1931
+    },
+    {
+      "epoch": 0.24489795918367346,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011049879788091874,
+      "loss": 2.0409,
+      "step": 1932
+    },
+    {
+      "epoch": 0.24502471796171885,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0001104864466886529,
+      "loss": 2.2023,
+      "step": 1933
+    },
+    {
+      "epoch": 0.24515147673976423,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011047408816479017,
+      "loss": 2.2773,
+      "step": 1934
+    },
+    {
+      "epoch": 0.2452782355178096,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011046172231112523,
+      "loss": 1.8321,
+      "step": 1935
+    },
+    {
+      "epoch": 0.245404994295855,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011044934912945382,
+      "loss": 2.1143,
+      "step": 1936
+    },
+    {
+      "epoch": 0.24553175307390038,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011043696862157279,
+      "loss": 1.5387,
+      "step": 1937
+    },
+    {
+      "epoch": 0.24565851185194576,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00011042458078927999,
+      "loss": 2.0486,
+      "step": 1938
+    },
+    {
+      "epoch": 0.24578527062999111,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001104121856343744,
+      "loss": 1.777,
+      "step": 1939
+    },
+    {
+      "epoch": 0.2459120294080365,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011039978315865603,
+      "loss": 1.4646,
+      "step": 1940
+    },
+    {
+      "epoch": 0.24603878818608188,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011038737336392596,
+      "loss": 2.4365,
+      "step": 1941
+    },
+    {
+      "epoch": 0.24616554696412726,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00011037495625198631,
+      "loss": 2.4839,
+      "step": 1942
+    },
+    {
+      "epoch": 0.24629230574217265,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00011036253182464031,
+      "loss": 2.3482,
+      "step": 1943
+    },
+    {
+      "epoch": 0.24641906452021803,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.00011035010008369219,
+      "loss": 1.9241,
+      "step": 1944
+    },
+    {
+      "epoch": 0.2465458232982634,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0001103376610309473,
+      "loss": 2.4444,
+      "step": 1945
+    },
+    {
+      "epoch": 0.2466725820763088,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011032521466821204,
+      "loss": 1.9348,
+      "step": 1946
+    },
+    {
+      "epoch": 0.24679934085435418,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011031276099729382,
+      "loss": 1.4657,
+      "step": 1947
+    },
+    {
+      "epoch": 0.24692609963239953,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011030030002000118,
+      "loss": 2.4435,
+      "step": 1948
+    },
+    {
+      "epoch": 0.24705285841044491,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011028783173814369,
+      "loss": 1.8672,
+      "step": 1949
+    },
+    {
+      "epoch": 0.2471796171884903,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011027535615353197,
+      "loss": 2.0986,
+      "step": 1950
+    },
+    {
+      "epoch": 0.24730637596653568,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00011026287326797774,
+      "loss": 1.6975,
+      "step": 1951
+    },
+    {
+      "epoch": 0.24743313474458106,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00011025038308329372,
+      "loss": 1.9396,
+      "step": 1952
+    },
+    {
+      "epoch": 0.24755989352262645,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011023788560129374,
+      "loss": 1.6226,
+      "step": 1953
+    },
+    {
+      "epoch": 0.24768665230067183,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011022538082379268,
+      "loss": 2.2463,
+      "step": 1954
+    },
+    {
+      "epoch": 0.2478134110787172,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00011021286875260646,
+      "loss": 2.2083,
+      "step": 1955
+    },
+    {
+      "epoch": 0.2479401698567626,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00011020034938955205,
+      "loss": 1.9937,
+      "step": 1956
+    },
+    {
+      "epoch": 0.24806692863480795,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.00011018782273644757,
+      "loss": 1.8878,
+      "step": 1957
+    },
+    {
+      "epoch": 0.24819368741285333,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00011017528879511206,
+      "loss": 2.2198,
+      "step": 1958
+    },
+    {
+      "epoch": 0.2483204461908987,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00011016274756736572,
+      "loss": 1.9249,
+      "step": 1959
+    },
+    {
+      "epoch": 0.2484472049689441,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00011015019905502979,
+      "loss": 2.5262,
+      "step": 1960
+    },
+    {
+      "epoch": 0.24857396374698948,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011013764325992652,
+      "loss": 1.3796,
+      "step": 1961
+    },
+    {
+      "epoch": 0.24870072252503486,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011012508018387925,
+      "loss": 1.7887,
+      "step": 1962
+    },
+    {
+      "epoch": 0.24882748130308024,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011011250982871242,
+      "loss": 2.5132,
+      "step": 1963
+    },
+    {
+      "epoch": 0.24895424008112563,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011009993219625144,
+      "loss": 2.3026,
+      "step": 1964
+    },
+    {
+      "epoch": 0.249080998859171,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011008734728832286,
+      "loss": 1.6692,
+      "step": 1965
+    },
+    {
+      "epoch": 0.24920775763721636,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00011007475510675421,
+      "loss": 2.0657,
+      "step": 1966
+    },
+    {
+      "epoch": 0.24933451641526175,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011006215565337416,
+      "loss": 1.8933,
+      "step": 1967
+    },
+    {
+      "epoch": 0.24946127519330713,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011004954893001237,
+      "loss": 1.6081,
+      "step": 1968
+    },
+    {
+      "epoch": 0.2495880339713525,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011003693493849956,
+      "loss": 1.7577,
+      "step": 1969
+    },
+    {
+      "epoch": 0.2497147927493979,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011002431368066754,
+      "loss": 2.6812,
+      "step": 1970
+    },
+    {
+      "epoch": 0.24984155152744328,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00011001168515834915,
+      "loss": 2.5858,
+      "step": 1971
+    },
+    {
+      "epoch": 0.24996831030548866,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00010999904937337833,
+      "loss": 2.075,
+      "step": 1972
+    },
+    {
+      "epoch": 0.250095069083534,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00010998640632758997,
+      "loss": 2.0207,
+      "step": 1973
+    },
+    {
+      "epoch": 0.2502218278615794,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010997375602282014,
+      "loss": 1.7996,
+      "step": 1974
+    },
+    {
+      "epoch": 0.2503485866396248,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010996109846090588,
+      "loss": 2.0745,
+      "step": 1975
+    },
+    {
+      "epoch": 0.2504753454176702,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00010994843364368533,
+      "loss": 1.9539,
+      "step": 1976
+    },
+    {
+      "epoch": 0.25060210419571555,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00010993576157299763,
+      "loss": 1.8362,
+      "step": 1977
+    },
+    {
+      "epoch": 0.25072886297376096,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00010992308225068303,
+      "loss": 1.6796,
+      "step": 1978
+    },
+    {
+      "epoch": 0.2508556217518063,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00010991039567858279,
+      "loss": 2.194,
+      "step": 1979
+    },
+    {
+      "epoch": 0.25098238052985167,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00010989770185853926,
+      "loss": 2.204,
+      "step": 1980
+    },
+    {
+      "epoch": 0.2511091393078971,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00010988500079239585,
+      "loss": 1.9912,
+      "step": 1981
+    },
+    {
+      "epoch": 0.25123589808594243,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00010987229248199694,
+      "loss": 1.9439,
+      "step": 1982
+    },
+    {
+      "epoch": 0.25136265686398784,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00010985957692918806,
+      "loss": 1.9681,
+      "step": 1983
+    },
+    {
+      "epoch": 0.2514894156420332,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010984685413581575,
+      "loss": 2.3029,
+      "step": 1984
+    },
+    {
+      "epoch": 0.2516161744200786,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00010983412410372759,
+      "loss": 2.1463,
+      "step": 1985
+    },
+    {
+      "epoch": 0.25174293319812396,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010982138683477223,
+      "loss": 1.6559,
+      "step": 1986
+    },
+    {
+      "epoch": 0.2518696919761694,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00010980864233079935,
+      "loss": 2.3137,
+      "step": 1987
+    },
+    {
+      "epoch": 0.25199645075421473,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010979589059365972,
+      "loss": 1.5084,
+      "step": 1988
+    },
+    {
+      "epoch": 0.2521232095322601,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00010978313162520511,
+      "loss": 1.7619,
+      "step": 1989
+    },
+    {
+      "epoch": 0.2522499683103055,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001097703654272884,
+      "loss": 1.6992,
+      "step": 1990
+    },
+    {
+      "epoch": 0.25237672708835085,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00010975759200176346,
+      "loss": 2.2644,
+      "step": 1991
+    },
+    {
+      "epoch": 0.25250348586639626,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010974481135048524,
+      "loss": 1.8911,
+      "step": 1992
+    },
+    {
+      "epoch": 0.2526302446444416,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010973202347530973,
+      "loss": 1.855,
+      "step": 1993
+    },
+    {
+      "epoch": 0.252757003422487,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.000109719228378094,
+      "loss": 1.9577,
+      "step": 1994
+    },
+    {
+      "epoch": 0.2528837622005324,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010970642606069609,
+      "loss": 2.1349,
+      "step": 1995
+    },
+    {
+      "epoch": 0.2530105209785778,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00010969361652497519,
+      "loss": 2.3517,
+      "step": 1996
+    },
+    {
+      "epoch": 0.25313727975662315,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00010968079977279148,
+      "loss": 2.1919,
+      "step": 1997
+    },
+    {
+      "epoch": 0.2532640385346685,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00010966797580600617,
+      "loss": 2.1692,
+      "step": 1998
+    },
+    {
+      "epoch": 0.2533907973127139,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.00010965514462648159,
+      "loss": 1.4257,
+      "step": 1999
+    },
+    {
+      "epoch": 0.25351755609075927,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00010964230623608104,
+      "loss": 2.3177,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2536443148688047,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00010962946063666892,
+      "loss": 1.971,
+      "step": 2001
+    },
+    {
+      "epoch": 0.25377107364685003,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00010961660783011061,
+      "loss": 2.0185,
+      "step": 2002
+    },
+    {
+      "epoch": 0.25389783242489544,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00010960374781827267,
+      "loss": 2.1336,
+      "step": 2003
+    },
+    {
+      "epoch": 0.2540245912029408,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00010959088060302252,
+      "loss": 1.8294,
+      "step": 2004
+    },
+    {
+      "epoch": 0.2541513499809862,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0001095780061862288,
+      "loss": 1.9698,
+      "step": 2005
+    },
+    {
+      "epoch": 0.25427810875903156,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010956512456976108,
+      "loss": 2.2703,
+      "step": 2006
+    },
+    {
+      "epoch": 0.2544048675370769,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00010955223575549002,
+      "loss": 1.6022,
+      "step": 2007
+    },
+    {
+      "epoch": 0.25453162631512233,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00010953933974528736,
+      "loss": 1.6724,
+      "step": 2008
+    },
+    {
+      "epoch": 0.2546583850931677,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0001095264365410258,
+      "loss": 1.9459,
+      "step": 2009
+    },
+    {
+      "epoch": 0.2547851438712131,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00010951352614457916,
+      "loss": 1.6412,
+      "step": 2010
+    },
+    {
+      "epoch": 0.25491190264925845,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00010950060855782228,
+      "loss": 1.8433,
+      "step": 2011
+    },
+    {
+      "epoch": 0.25503866142730386,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00010948768378263101,
+      "loss": 1.879,
+      "step": 2012
+    },
+    {
+      "epoch": 0.2551654202053492,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001094747518208823,
+      "loss": 1.7705,
+      "step": 2013
+    },
+    {
+      "epoch": 0.2552921789833946,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010946181267445411,
+      "loss": 2.1051,
+      "step": 2014
+    },
+    {
+      "epoch": 0.25541893776144,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010944886634522546,
+      "loss": 1.8151,
+      "step": 2015
+    },
+    {
+      "epoch": 0.25554569653948533,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00010943591283507639,
+      "loss": 2.1353,
+      "step": 2016
+    },
+    {
+      "epoch": 0.25567245531753074,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010942295214588801,
+      "loss": 1.8492,
+      "step": 2017
+    },
+    {
+      "epoch": 0.2557992140955761,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010940998427954244,
+      "loss": 1.6428,
+      "step": 2018
+    },
+    {
+      "epoch": 0.2559259728736215,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00010939700923792288,
+      "loss": 2.0904,
+      "step": 2019
+    },
+    {
+      "epoch": 0.25605273165166686,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00010938402702291358,
+      "loss": 2.3051,
+      "step": 2020
+    },
+    {
+      "epoch": 0.2561794904297123,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00010937103763639975,
+      "loss": 1.9981,
+      "step": 2021
+    },
+    {
+      "epoch": 0.25630624920775763,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0001093580410802677,
+      "loss": 2.4197,
+      "step": 2022
+    },
+    {
+      "epoch": 0.25643300798580304,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00010934503735640484,
+      "loss": 2.0751,
+      "step": 2023
+    },
+    {
+      "epoch": 0.2565597667638484,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00010933202646669951,
+      "loss": 1.8955,
+      "step": 2024
+    },
+    {
+      "epoch": 0.25668652554189375,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00010931900841304114,
+      "loss": 2.0562,
+      "step": 2025
+    },
+    {
+      "epoch": 0.25681328431993916,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00010930598319732021,
+      "loss": 1.7824,
+      "step": 2026
+    },
+    {
+      "epoch": 0.2569400430979845,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00010929295082142825,
+      "loss": 1.6555,
+      "step": 2027
+    },
+    {
+      "epoch": 0.2570668018760299,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010927991128725778,
+      "loss": 1.6762,
+      "step": 2028
+    },
+    {
+      "epoch": 0.2571935606540753,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00010926686459670239,
+      "loss": 2.1007,
+      "step": 2029
+    },
+    {
+      "epoch": 0.2573203194321207,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010925381075165673,
+      "loss": 2.1472,
+      "step": 2030
+    },
+    {
+      "epoch": 0.25744707821016605,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00010924074975401646,
+      "loss": 2.3564,
+      "step": 2031
+    },
+    {
+      "epoch": 0.25757383698821146,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00010922768160567829,
+      "loss": 1.9732,
+      "step": 2032
+    },
+    {
+      "epoch": 0.2577005957662568,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00010921460630853994,
+      "loss": 2.1771,
+      "step": 2033
+    },
+    {
+      "epoch": 0.25782735454430217,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0001092015238645002,
+      "loss": 1.7126,
+      "step": 2034
+    },
+    {
+      "epoch": 0.2579541133223476,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00010918843427545892,
+      "loss": 2.0775,
+      "step": 2035
+    },
+    {
+      "epoch": 0.25808087210039293,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00010917533754331694,
+      "loss": 1.7782,
+      "step": 2036
+    },
+    {
+      "epoch": 0.25820763087843834,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00010916223366997616,
+      "loss": 1.8822,
+      "step": 2037
+    },
+    {
+      "epoch": 0.2583343896564837,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00010914912265733952,
+      "loss": 2.0884,
+      "step": 2038
+    },
+    {
+      "epoch": 0.2584611484345291,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010913600450731095,
+      "loss": 2.1335,
+      "step": 2039
+    },
+    {
+      "epoch": 0.25858790721257446,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010912287922179551,
+      "loss": 2.2726,
+      "step": 2040
+    },
+    {
+      "epoch": 0.2587146659906199,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001091097468026992,
+      "loss": 2.0254,
+      "step": 2041
+    },
+    {
+      "epoch": 0.25884142476866523,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00010909660725192912,
+      "loss": 1.694,
+      "step": 2042
+    },
+    {
+      "epoch": 0.2589681835467106,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0001090834605713934,
+      "loss": 1.6937,
+      "step": 2043
+    },
+    {
+      "epoch": 0.259094942324756,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010907030676300115,
+      "loss": 1.6514,
+      "step": 2044
+    },
+    {
+      "epoch": 0.25922170110280135,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00010905714582866256,
+      "loss": 1.8929,
+      "step": 2045
+    },
+    {
+      "epoch": 0.25934845988084676,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010904397777028887,
+      "loss": 2.0018,
+      "step": 2046
+    },
+    {
+      "epoch": 0.2594752186588921,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00010903080258979233,
+      "loss": 1.289,
+      "step": 2047
+    },
+    {
+      "epoch": 0.2596019774369375,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010901762028908623,
+      "loss": 1.8984,
+      "step": 2048
+    },
+    {
+      "epoch": 0.2597287362149829,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00010900443087008488,
+      "loss": 1.8592,
+      "step": 2049
+    },
+    {
+      "epoch": 0.2598554949930283,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00010899123433470365,
+      "loss": 2.3645,
+      "step": 2050
+    },
+    {
+      "epoch": 0.25998225377107365,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0001089780306848589,
+      "loss": 1.7831,
+      "step": 2051
+    },
+    {
+      "epoch": 0.260109012549119,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001089648199224681,
+      "loss": 2.2172,
+      "step": 2052
+    },
+    {
+      "epoch": 0.2602357713271644,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00010895160204944966,
+      "loss": 1.7063,
+      "step": 2053
+    },
+    {
+      "epoch": 0.26036253010520977,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0001089383770677231,
+      "loss": 1.6328,
+      "step": 2054
+    },
+    {
+      "epoch": 0.2604892888832552,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010892514497920891,
+      "loss": 1.8716,
+      "step": 2055
+    },
+    {
+      "epoch": 0.26061604766130053,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00010891190578582867,
+      "loss": 1.4233,
+      "step": 2056
+    },
+    {
+      "epoch": 0.26074280643934594,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010889865948950494,
+      "loss": 2.43,
+      "step": 2057
+    },
+    {
+      "epoch": 0.2608695652173913,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00010888540609216136,
+      "loss": 2.1284,
+      "step": 2058
+    },
+    {
+      "epoch": 0.2609963239954367,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010887214559572255,
+      "loss": 2.1232,
+      "step": 2059
+    },
+    {
+      "epoch": 0.26112308277348206,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001088588780021142,
+      "loss": 1.8737,
+      "step": 2060
+    },
+    {
+      "epoch": 0.2612498415515274,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00010884560331326304,
+      "loss": 1.8746,
+      "step": 2061
+    },
+    {
+      "epoch": 0.26137660032957283,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00010883232153109678,
+      "loss": 2.2546,
+      "step": 2062
+    },
+    {
+      "epoch": 0.2615033591076182,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00010881903265754419,
+      "loss": 1.6832,
+      "step": 2063
+    },
+    {
+      "epoch": 0.2616301178856636,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001088057366945351,
+      "loss": 2.0841,
+      "step": 2064
+    },
+    {
+      "epoch": 0.26175687666370895,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00010879243364400028,
+      "loss": 2.0414,
+      "step": 2065
+    },
+    {
+      "epoch": 0.26188363544175436,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010877912350787164,
+      "loss": 1.5754,
+      "step": 2066
+    },
+    {
+      "epoch": 0.2620103942197997,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00010876580628808203,
+      "loss": 2.9128,
+      "step": 2067
+    },
+    {
+      "epoch": 0.2621371529978451,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001087524819865654,
+      "loss": 2.2536,
+      "step": 2068
+    },
+    {
+      "epoch": 0.2622639117758905,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00010873915060525666,
+      "loss": 2.1688,
+      "step": 2069
+    },
+    {
+      "epoch": 0.26239067055393583,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0001087258121460918,
+      "loss": 2.2801,
+      "step": 2070
+    },
+    {
+      "epoch": 0.26251742933198124,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00010871246661100782,
+      "loss": 2.2957,
+      "step": 2071
+    },
+    {
+      "epoch": 0.2626441881100266,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00010869911400194273,
+      "loss": 2.5681,
+      "step": 2072
+    },
+    {
+      "epoch": 0.262770946888072,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0001086857543208356,
+      "loss": 1.9326,
+      "step": 2073
+    },
+    {
+      "epoch": 0.26289770566611737,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00010867238756962652,
+      "loss": 2.1175,
+      "step": 2074
+    },
+    {
+      "epoch": 0.2630244644441628,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010865901375025658,
+      "loss": 1.4167,
+      "step": 2075
+    },
+    {
+      "epoch": 0.26315122322220813,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00010864563286466791,
+      "loss": 1.8575,
+      "step": 2076
+    },
+    {
+      "epoch": 0.26327798200025354,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00010863224491480369,
+      "loss": 1.8406,
+      "step": 2077
+    },
+    {
+      "epoch": 0.2634047407782989,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00010861884990260809,
+      "loss": 1.9168,
+      "step": 2078
+    },
+    {
+      "epoch": 0.26353149955634425,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00010860544783002633,
+      "loss": 2.0365,
+      "step": 2079
+    },
+    {
+      "epoch": 0.26365825833438966,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00010859203869900462,
+      "loss": 1.5835,
+      "step": 2080
+    },
+    {
+      "epoch": 0.263785017112435,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00010857862251149028,
+      "loss": 1.8031,
+      "step": 2081
+    },
+    {
+      "epoch": 0.2639117758904804,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.00010856519926943155,
+      "loss": 3.324,
+      "step": 2082
+    },
+    {
+      "epoch": 0.2640385346685258,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.00010855176897477775,
+      "loss": 3.0361,
+      "step": 2083
+    },
+    {
+      "epoch": 0.2641652934465712,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001085383316294792,
+      "loss": 2.0123,
+      "step": 2084
+    },
+    {
+      "epoch": 0.26429205222461655,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0001085248872354873,
+      "loss": 1.7503,
+      "step": 2085
+    },
+    {
+      "epoch": 0.26441881100266196,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00010851143579475443,
+      "loss": 2.4202,
+      "step": 2086
+    },
+    {
+      "epoch": 0.2645455697807073,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00010849797730923396,
+      "loss": 1.9654,
+      "step": 2087
+    },
+    {
+      "epoch": 0.26467232855875267,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00010848451178088033,
+      "loss": 2.1493,
+      "step": 2088
+    },
+    {
+      "epoch": 0.2647990873367981,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010847103921164902,
+      "loss": 1.9769,
+      "step": 2089
+    },
+    {
+      "epoch": 0.26492584611484343,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00010845755960349647,
+      "loss": 1.6657,
+      "step": 2090
+    },
+    {
+      "epoch": 0.26505260489288884,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00010844407295838019,
+      "loss": 2.1509,
+      "step": 2091
+    },
+    {
+      "epoch": 0.2651793636709342,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0001084305792782587,
+      "loss": 2.333,
+      "step": 2092
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010841707856509154,
+      "loss": 2.2181,
+      "step": 2093
+    },
+    {
+      "epoch": 0.26543288122702496,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00010840357082083928,
+      "loss": 1.6493,
+      "step": 2094
+    },
+    {
+      "epoch": 0.2655596400050704,
+      "grad_norm": 6.0,
+      "learning_rate": 0.00010839005604746349,
+      "loss": 2.9562,
+      "step": 2095
+    },
+    {
+      "epoch": 0.26568639878311573,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010837653424692677,
+      "loss": 2.206,
+      "step": 2096
+    },
+    {
+      "epoch": 0.2658131575611611,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010836300542119276,
+      "loss": 2.0237,
+      "step": 2097
+    },
+    {
+      "epoch": 0.2659399163392065,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00010834946957222608,
+      "loss": 2.1401,
+      "step": 2098
+    },
+    {
+      "epoch": 0.26606667511725185,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010833592670199243,
+      "loss": 2.0903,
+      "step": 2099
+    },
+    {
+      "epoch": 0.26619343389529726,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00010832237681245846,
+      "loss": 1.4192,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2663201926733426,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00010830881990559189,
+      "loss": 2.2133,
+      "step": 2101
+    },
+    {
+      "epoch": 0.266446951451388,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010829525598336144,
+      "loss": 2.3092,
+      "step": 2102
+    },
+    {
+      "epoch": 0.2665737102294334,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00010828168504773683,
+      "loss": 2.3911,
+      "step": 2103
+    },
+    {
+      "epoch": 0.2667004690074788,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010826810710068886,
+      "loss": 1.4397,
+      "step": 2104
+    },
+    {
+      "epoch": 0.26682722778552415,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00010825452214418928,
+      "loss": 2.2206,
+      "step": 2105
+    },
+    {
+      "epoch": 0.2669539865635695,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00010824093018021087,
+      "loss": 2.1534,
+      "step": 2106
+    },
+    {
+      "epoch": 0.2670807453416149,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010822733121072747,
+      "loss": 1.9457,
+      "step": 2107
+    },
+    {
+      "epoch": 0.26720750411966027,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00010821372523771392,
+      "loss": 1.9259,
+      "step": 2108
+    },
+    {
+      "epoch": 0.2673342628977057,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00010820011226314606,
+      "loss": 2.0859,
+      "step": 2109
+    },
+    {
+      "epoch": 0.26746102167575103,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010818649228900073,
+      "loss": 1.9406,
+      "step": 2110
+    },
+    {
+      "epoch": 0.26758778045379644,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0001081728653172558,
+      "loss": 1.9499,
+      "step": 2111
+    },
+    {
+      "epoch": 0.2677145392318418,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00010815923134989023,
+      "loss": 2.0755,
+      "step": 2112
+    },
+    {
+      "epoch": 0.2678412980098872,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00010814559038888387,
+      "loss": 2.1358,
+      "step": 2113
+    },
+    {
+      "epoch": 0.26796805678793256,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00010813194243621767,
+      "loss": 2.4974,
+      "step": 2114
+    },
+    {
+      "epoch": 0.2680948155659779,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010811828749387361,
+      "loss": 1.8863,
+      "step": 2115
+    },
+    {
+      "epoch": 0.26822157434402333,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010810462556383459,
+      "loss": 2.3304,
+      "step": 2116
+    },
+    {
+      "epoch": 0.2683483331220687,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0001080909566480846,
+      "loss": 2.0889,
+      "step": 2117
+    },
+    {
+      "epoch": 0.2684750919001141,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00010807728074860866,
+      "loss": 2.0821,
+      "step": 2118
+    },
+    {
+      "epoch": 0.26860185067815945,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00010806359786739273,
+      "loss": 1.9428,
+      "step": 2119
+    },
+    {
+      "epoch": 0.26872860945620486,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00010804990800642386,
+      "loss": 1.7947,
+      "step": 2120
+    },
+    {
+      "epoch": 0.2688553682342502,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00010803621116769004,
+      "loss": 2.5499,
+      "step": 2121
+    },
+    {
+      "epoch": 0.2689821270122956,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00010802250735318035,
+      "loss": 1.9677,
+      "step": 2122
+    },
+    {
+      "epoch": 0.269108885790341,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00010800879656488483,
+      "loss": 2.2482,
+      "step": 2123
+    },
+    {
+      "epoch": 0.26923564456838633,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010799507880479456,
+      "loss": 2.3249,
+      "step": 2124
+    },
+    {
+      "epoch": 0.26936240334643174,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010798135407490158,
+      "loss": 1.7045,
+      "step": 2125
+    },
+    {
+      "epoch": 0.2694891621244771,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00010796762237719904,
+      "loss": 1.5704,
+      "step": 2126
+    },
+    {
+      "epoch": 0.2696159209025225,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010795388371368104,
+      "loss": 2.1678,
+      "step": 2127
+    },
+    {
+      "epoch": 0.26974267968056787,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010794013808634264,
+      "loss": 1.8206,
+      "step": 2128
+    },
+    {
+      "epoch": 0.2698694384586133,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00010792638549718002,
+      "loss": 1.9321,
+      "step": 2129
+    },
+    {
+      "epoch": 0.26999619723665863,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0001079126259481903,
+      "loss": 2.0523,
+      "step": 2130
+    },
+    {
+      "epoch": 0.27012295601470404,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00010789885944137162,
+      "loss": 2.2796,
+      "step": 2131
+    },
+    {
+      "epoch": 0.2702497147927494,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00010788508597872317,
+      "loss": 2.4429,
+      "step": 2132
+    },
+    {
+      "epoch": 0.27037647357079475,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010787130556224507,
+      "loss": 1.6398,
+      "step": 2133
+    },
+    {
+      "epoch": 0.27050323234884016,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00010785751819393857,
+      "loss": 1.6361,
+      "step": 2134
+    },
+    {
+      "epoch": 0.2706299911268855,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001078437238758058,
+      "loss": 1.9116,
+      "step": 2135
+    },
+    {
+      "epoch": 0.2707567499049309,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010782992260984998,
+      "loss": 1.5352,
+      "step": 2136
+    },
+    {
+      "epoch": 0.2708835086829763,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00010781611439807534,
+      "loss": 1.8568,
+      "step": 2137
+    },
+    {
+      "epoch": 0.2710102674610217,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00010780229924248705,
+      "loss": 1.891,
+      "step": 2138
+    },
+    {
+      "epoch": 0.27113702623906705,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010778847714509136,
+      "loss": 1.99,
+      "step": 2139
+    },
+    {
+      "epoch": 0.27126378501711246,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0001077746481078955,
+      "loss": 2.6804,
+      "step": 2140
+    },
+    {
+      "epoch": 0.2713905437951578,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010776081213290772,
+      "loss": 1.7029,
+      "step": 2141
+    },
+    {
+      "epoch": 0.27151730257320317,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00010774696922213725,
+      "loss": 1.8984,
+      "step": 2142
+    },
+    {
+      "epoch": 0.2716440613512486,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00010773311937759436,
+      "loss": 2.014,
+      "step": 2143
+    },
+    {
+      "epoch": 0.27177082012929393,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0001077192626012903,
+      "loss": 2.3618,
+      "step": 2144
+    },
+    {
+      "epoch": 0.27189757890733934,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00010770539889523736,
+      "loss": 2.218,
+      "step": 2145
+    },
+    {
+      "epoch": 0.2720243376853847,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00010769152826144878,
+      "loss": 1.9896,
+      "step": 2146
+    },
+    {
+      "epoch": 0.2721510964634301,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00010767765070193887,
+      "loss": 2.1676,
+      "step": 2147
+    },
+    {
+      "epoch": 0.27227785524147546,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00010766376621872291,
+      "loss": 2.7233,
+      "step": 2148
+    },
+    {
+      "epoch": 0.2724046140195209,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00010764987481381718,
+      "loss": 1.8668,
+      "step": 2149
+    },
+    {
+      "epoch": 0.27253137279756623,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00010763597648923902,
+      "loss": 1.8857,
+      "step": 2150
+    },
+    {
+      "epoch": 0.2726581315756116,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00010762207124700666,
+      "loss": 2.1943,
+      "step": 2151
+    },
+    {
+      "epoch": 0.272784890353657,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00010760815908913948,
+      "loss": 1.7987,
+      "step": 2152
+    },
+    {
+      "epoch": 0.27291164913170235,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00010759424001765774,
+      "loss": 2.397,
+      "step": 2153
+    },
+    {
+      "epoch": 0.27303840790974776,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010758031403458277,
+      "loss": 1.833,
+      "step": 2154
+    },
+    {
+      "epoch": 0.2731651666877931,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010756638114193689,
+      "loss": 1.8482,
+      "step": 2155
+    },
+    {
+      "epoch": 0.2732919254658385,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00010755244134174344,
+      "loss": 1.7616,
+      "step": 2156
+    },
+    {
+      "epoch": 0.2734186842438839,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00010753849463602673,
+      "loss": 1.8068,
+      "step": 2157
+    },
+    {
+      "epoch": 0.2735454430219293,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00010752454102681209,
+      "loss": 2.4359,
+      "step": 2158
+    },
+    {
+      "epoch": 0.27367220179997465,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00010751058051612584,
+      "loss": 2.1111,
+      "step": 2159
+    },
+    {
+      "epoch": 0.27379896057802,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00010749661310599532,
+      "loss": 2.3023,
+      "step": 2160
+    },
+    {
+      "epoch": 0.2739257193560654,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00010748263879844887,
+      "loss": 2.2653,
+      "step": 2161
+    },
+    {
+      "epoch": 0.27405247813411077,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00010746865759551582,
+      "loss": 1.5266,
+      "step": 2162
+    },
+    {
+      "epoch": 0.2741792369121562,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010745466949922653,
+      "loss": 2.2371,
+      "step": 2163
+    },
+    {
+      "epoch": 0.27430599569020153,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001074406745116123,
+      "loss": 2.403,
+      "step": 2164
+    },
+    {
+      "epoch": 0.27443275446824694,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0001074266726347055,
+      "loss": 1.9814,
+      "step": 2165
+    },
+    {
+      "epoch": 0.2745595132462923,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00010741266387053945,
+      "loss": 1.6237,
+      "step": 2166
+    },
+    {
+      "epoch": 0.2746862720243377,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00010739864822114852,
+      "loss": 1.8948,
+      "step": 2167
+    },
+    {
+      "epoch": 0.27481303080238306,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.000107384625688568,
+      "loss": 2.7144,
+      "step": 2168
+    },
+    {
+      "epoch": 0.2749397895804284,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00010737059627483427,
+      "loss": 2.0845,
+      "step": 2169
+    },
+    {
+      "epoch": 0.27506654835847383,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00010735655998198467,
+      "loss": 1.9002,
+      "step": 2170
+    },
+    {
+      "epoch": 0.27506654835847383,
+      "eval_loss": 1.9766894578933716,
+      "eval_runtime": 46.4841,
+      "eval_samples_per_second": 55.417,
+      "eval_steps_per_second": 13.854,
+      "step": 2170
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 8678,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 2170,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.385495432973517e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}