diff --git "a/checkpoint-1750/trainer_state.json" "b/checkpoint-1750/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1750/trainer_state.json"
@@ -0,0 +1,12284 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.11968,
+  "eval_steps": 500,
+  "global_step": 1750,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00064,
+      "grad_norm": 1.1776920557022095,
+      "learning_rate": 0.0,
+      "loss": 0.8675,
+      "step": 1
+    },
+    {
+      "epoch": 0.00128,
+      "grad_norm": 1.203395962715149,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.9145,
+      "step": 2
+    },
+    {
+      "epoch": 0.00192,
+      "grad_norm": 1.2823563814163208,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 1.0141,
+      "step": 3
+    },
+    {
+      "epoch": 0.00256,
+      "grad_norm": 1.1800405979156494,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.8967,
+      "step": 4
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.4020403623580933,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.9143,
+      "step": 5
+    },
+    {
+      "epoch": 0.00384,
+      "grad_norm": 1.121119737625122,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.8989,
+      "step": 6
+    },
+    {
+      "epoch": 0.00448,
+      "grad_norm": 1.4375133514404297,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.0609,
+      "step": 7
+    },
+    {
+      "epoch": 0.00512,
+      "grad_norm": 1.0692578554153442,
+      "learning_rate": 9.333333333333334e-06,
+      "loss": 0.9022,
+      "step": 8
+    },
+    {
+      "epoch": 0.00576,
+      "grad_norm": 0.9640074968338013,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 0.8429,
+      "step": 9
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.837898313999176,
+      "learning_rate": 1.2e-05,
+      "loss": 0.9479,
+      "step": 10
+    },
+    {
+      "epoch": 0.00704,
+      "grad_norm": 0.7400044798851013,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 0.8695,
+      "step": 11
+    },
+    {
+      "epoch": 0.00768,
+      "grad_norm": 0.5966362357139587,
+      "learning_rate": 1.4666666666666668e-05,
+      "loss": 0.8578,
+      "step": 12
+    },
+    {
+      "epoch": 0.00832,
+      "grad_norm": 0.59197998046875,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 0.807,
+      "step": 13
+    },
+    {
+      "epoch": 0.00896,
+      "grad_norm": 0.527433454990387,
+      "learning_rate": 1.7333333333333336e-05,
+      "loss": 0.8634,
+      "step": 14
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.4848785996437073,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 0.8072,
+      "step": 15
+    },
+    {
+      "epoch": 0.01024,
+      "grad_norm": 0.500170111656189,
+      "learning_rate": 2e-05,
+      "loss": 0.7865,
+      "step": 16
+    },
+    {
+      "epoch": 0.01088,
+      "grad_norm": 0.5288170576095581,
+      "learning_rate": 2.1333333333333335e-05,
+      "loss": 0.7776,
+      "step": 17
+    },
+    {
+      "epoch": 0.01152,
+      "grad_norm": 0.5462191700935364,
+      "learning_rate": 2.2666666666666668e-05,
+      "loss": 0.758,
+      "step": 18
+    },
+    {
+      "epoch": 0.01216,
+      "grad_norm": 0.5302035212516785,
+      "learning_rate": 2.4e-05,
+      "loss": 0.8512,
+      "step": 19
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.46077799797058105,
+      "learning_rate": 2.5333333333333337e-05,
+      "loss": 0.849,
+      "step": 20
+    },
+    {
+      "epoch": 0.01344,
+      "grad_norm": 0.4158674478530884,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 0.8552,
+      "step": 21
+    },
+    {
+      "epoch": 0.01408,
+      "grad_norm": 0.40496826171875,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 0.8578,
+      "step": 22
+    },
+    {
+      "epoch": 0.01472,
+      "grad_norm": 0.3750077486038208,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 0.8703,
+      "step": 23
+    },
+    {
+      "epoch": 0.01536,
+      "grad_norm": 0.3390427529811859,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 0.81,
+      "step": 24
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.3427319824695587,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 0.7206,
+      "step": 25
+    },
+    {
+      "epoch": 0.01664,
+      "grad_norm": 0.3468775451183319,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.7187,
+      "step": 26
+    },
+    {
+      "epoch": 0.01728,
+      "grad_norm": 0.37644508481025696,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 0.8508,
+      "step": 27
+    },
+    {
+      "epoch": 0.01792,
+      "grad_norm": 0.39194977283477783,
+      "learning_rate": 3.6e-05,
+      "loss": 0.8958,
+      "step": 28
+    },
+    {
+      "epoch": 0.01856,
+      "grad_norm": 0.38071537017822266,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 0.743,
+      "step": 29
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.38951224088668823,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 0.896,
+      "step": 30
+    },
+    {
+      "epoch": 0.01984,
+      "grad_norm": 0.37390923500061035,
+      "learning_rate": 4e-05,
+      "loss": 0.7967,
+      "step": 31
+    },
+    {
+      "epoch": 0.02048,
+      "grad_norm": 0.3488105833530426,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 0.8419,
+      "step": 32
+    },
+    {
+      "epoch": 0.02112,
+      "grad_norm": 0.34550485014915466,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 0.8013,
+      "step": 33
+    },
+    {
+      "epoch": 0.02176,
+      "grad_norm": 0.3142073154449463,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 0.7542,
+      "step": 34
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.3066573143005371,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 0.7548,
+      "step": 35
+    },
+    {
+      "epoch": 0.02304,
+      "grad_norm": 0.33399665355682373,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.7624,
+      "step": 36
+    },
+    {
+      "epoch": 0.02368,
+      "grad_norm": 0.319749653339386,
+      "learning_rate": 4.8e-05,
+      "loss": 0.8393,
+      "step": 37
+    },
+    {
+      "epoch": 0.02432,
+      "grad_norm": 0.33495351672172546,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 0.8262,
+      "step": 38
+    },
+    {
+      "epoch": 0.02496,
+      "grad_norm": 0.3214278817176819,
+      "learning_rate": 5.0666666666666674e-05,
+      "loss": 0.7511,
+      "step": 39
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.344425231218338,
+      "learning_rate": 5.2000000000000004e-05,
+      "loss": 0.8275,
+      "step": 40
+    },
+    {
+      "epoch": 0.02624,
+      "grad_norm": 0.3231922686100006,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 0.7819,
+      "step": 41
+    },
+    {
+      "epoch": 0.02688,
+      "grad_norm": 0.3128565847873688,
+      "learning_rate": 5.466666666666666e-05,
+      "loss": 0.7356,
+      "step": 42
+    },
+    {
+      "epoch": 0.02752,
+      "grad_norm": 0.3014838695526123,
+      "learning_rate": 5.6000000000000006e-05,
+      "loss": 0.7658,
+      "step": 43
+    },
+    {
+      "epoch": 0.02816,
+      "grad_norm": 0.3117836117744446,
+      "learning_rate": 5.7333333333333336e-05,
+      "loss": 0.7325,
+      "step": 44
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.3346293568611145,
+      "learning_rate": 5.866666666666667e-05,
+      "loss": 0.7179,
+      "step": 45
+    },
+    {
+      "epoch": 0.02944,
+      "grad_norm": 0.2924030125141144,
+      "learning_rate": 6e-05,
+      "loss": 0.7569,
+      "step": 46
+    },
+    {
+      "epoch": 0.03008,
+      "grad_norm": 0.3573443591594696,
+      "learning_rate": 6.133333333333334e-05,
+      "loss": 0.7773,
+      "step": 47
+    },
+    {
+      "epoch": 0.03072,
+      "grad_norm": 0.3303980529308319,
+      "learning_rate": 6.266666666666667e-05,
+      "loss": 0.7001,
+      "step": 48
+    },
+    {
+      "epoch": 0.03136,
+      "grad_norm": 0.3368162214756012,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 0.803,
+      "step": 49
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.34593161940574646,
+      "learning_rate": 6.533333333333334e-05,
+      "loss": 0.8198,
+      "step": 50
+    },
+    {
+      "epoch": 0.03264,
+      "grad_norm": 0.3314804136753082,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8692,
+      "step": 51
+    },
+    {
+      "epoch": 0.03328,
+      "grad_norm": 0.33531248569488525,
+      "learning_rate": 6.800000000000001e-05,
+      "loss": 0.7863,
+      "step": 52
+    },
+    {
+      "epoch": 0.03392,
+      "grad_norm": 0.3078380823135376,
+      "learning_rate": 6.933333333333334e-05,
+      "loss": 0.7695,
+      "step": 53
+    },
+    {
+      "epoch": 0.03456,
+      "grad_norm": 0.3284406065940857,
+      "learning_rate": 7.066666666666667e-05,
+      "loss": 0.7374,
+      "step": 54
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.3009240925312042,
+      "learning_rate": 7.2e-05,
+      "loss": 0.741,
+      "step": 55
+    },
+    {
+      "epoch": 0.03584,
+      "grad_norm": 0.3034893572330475,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 0.8591,
+      "step": 56
+    },
+    {
+      "epoch": 0.03648,
+      "grad_norm": 0.32195839285850525,
+      "learning_rate": 7.466666666666667e-05,
+      "loss": 0.8739,
+      "step": 57
+    },
+    {
+      "epoch": 0.03712,
+      "grad_norm": 0.3256431818008423,
+      "learning_rate": 7.6e-05,
+      "loss": 0.7597,
+      "step": 58
+    },
+    {
+      "epoch": 0.03776,
+      "grad_norm": 0.329429566860199,
+      "learning_rate": 7.733333333333333e-05,
+      "loss": 0.8074,
+      "step": 59
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.33767491579055786,
+      "learning_rate": 7.866666666666666e-05,
+      "loss": 0.7833,
+      "step": 60
+    },
+    {
+      "epoch": 0.03904,
+      "grad_norm": 0.31706422567367554,
+      "learning_rate": 8e-05,
+      "loss": 0.7313,
+      "step": 61
+    },
+    {
+      "epoch": 0.03968,
+      "grad_norm": 0.3143269717693329,
+      "learning_rate": 8.133333333333334e-05,
+      "loss": 0.669,
+      "step": 62
+    },
+    {
+      "epoch": 0.04032,
+      "grad_norm": 0.32198163866996765,
+      "learning_rate": 8.266666666666667e-05,
+      "loss": 0.6839,
+      "step": 63
+    },
+    {
+      "epoch": 0.04096,
+      "grad_norm": 0.3127489387989044,
+      "learning_rate": 8.4e-05,
+      "loss": 0.7504,
+      "step": 64
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.31161946058273315,
+      "learning_rate": 8.533333333333334e-05,
+      "loss": 0.6898,
+      "step": 65
+    },
+    {
+      "epoch": 0.04224,
+      "grad_norm": 0.3216173052787781,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 0.8007,
+      "step": 66
+    },
+    {
+      "epoch": 0.04288,
+      "grad_norm": 0.34100618958473206,
+      "learning_rate": 8.800000000000001e-05,
+      "loss": 0.8052,
+      "step": 67
+    },
+    {
+      "epoch": 0.04352,
+      "grad_norm": 0.3120584487915039,
+      "learning_rate": 8.933333333333334e-05,
+      "loss": 0.7052,
+      "step": 68
+    },
+    {
+      "epoch": 0.04416,
+      "grad_norm": 0.32298824191093445,
+      "learning_rate": 9.066666666666667e-05,
+      "loss": 0.7722,
+      "step": 69
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3192760944366455,
+      "learning_rate": 9.200000000000001e-05,
+      "loss": 0.8325,
+      "step": 70
+    },
+    {
+      "epoch": 0.04544,
+      "grad_norm": 0.3320888578891754,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.8078,
+      "step": 71
+    },
+    {
+      "epoch": 0.04608,
+      "grad_norm": 0.30659231543540955,
+      "learning_rate": 9.466666666666667e-05,
+      "loss": 0.7702,
+      "step": 72
+    },
+    {
+      "epoch": 0.04672,
+      "grad_norm": 0.321393221616745,
+      "learning_rate": 9.6e-05,
+      "loss": 0.7173,
+      "step": 73
+    },
+    {
+      "epoch": 0.04736,
+      "grad_norm": 0.32504022121429443,
+      "learning_rate": 9.733333333333335e-05,
+      "loss": 0.6959,
+      "step": 74
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31898629665374756,
+      "learning_rate": 9.866666666666668e-05,
+      "loss": 0.8182,
+      "step": 75
+    },
+    {
+      "epoch": 0.04864,
+      "grad_norm": 0.3069212734699249,
+      "learning_rate": 0.0001,
+      "loss": 0.7323,
+      "step": 76
+    },
+    {
+      "epoch": 0.04928,
+      "grad_norm": 0.30757465958595276,
+      "learning_rate": 0.00010133333333333335,
+      "loss": 0.7489,
+      "step": 77
+    },
+    {
+      "epoch": 0.04992,
+      "grad_norm": 0.3295109272003174,
+      "learning_rate": 0.00010266666666666666,
+      "loss": 0.7856,
+      "step": 78
+    },
+    {
+      "epoch": 0.05056,
+      "grad_norm": 0.3125697076320648,
+      "learning_rate": 0.00010400000000000001,
+      "loss": 0.7128,
+      "step": 79
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3253207206726074,
+      "learning_rate": 0.00010533333333333332,
+      "loss": 0.679,
+      "step": 80
+    },
+    {
+      "epoch": 0.05184,
+      "grad_norm": 0.3182617425918579,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 0.6698,
+      "step": 81
+    },
+    {
+      "epoch": 0.05248,
+      "grad_norm": 0.3359428942203522,
+      "learning_rate": 0.00010800000000000001,
+      "loss": 0.7042,
+      "step": 82
+    },
+    {
+      "epoch": 0.05312,
+      "grad_norm": 0.2904784679412842,
+      "learning_rate": 0.00010933333333333333,
+      "loss": 0.6712,
+      "step": 83
+    },
+    {
+      "epoch": 0.05376,
+      "grad_norm": 0.36973974108695984,
+      "learning_rate": 0.00011066666666666667,
+      "loss": 0.817,
+      "step": 84
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3179187774658203,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 0.852,
+      "step": 85
+    },
+    {
+      "epoch": 0.05504,
+      "grad_norm": 0.3543005883693695,
+      "learning_rate": 0.00011333333333333334,
+      "loss": 0.7617,
+      "step": 86
+    },
+    {
+      "epoch": 0.05568,
+      "grad_norm": 0.31280890107154846,
+      "learning_rate": 0.00011466666666666667,
+      "loss": 0.838,
+      "step": 87
+    },
+    {
+      "epoch": 0.05632,
+      "grad_norm": 0.29157790541648865,
+      "learning_rate": 0.000116,
+      "loss": 0.7285,
+      "step": 88
+    },
+    {
+      "epoch": 0.05696,
+      "grad_norm": 0.3266328275203705,
+      "learning_rate": 0.00011733333333333334,
+      "loss": 0.7723,
+      "step": 89
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.31193479895591736,
+      "learning_rate": 0.00011866666666666669,
+      "loss": 0.699,
+      "step": 90
+    },
+    {
+      "epoch": 0.05824,
+      "grad_norm": 0.3112991452217102,
+      "learning_rate": 0.00012,
+      "loss": 0.8292,
+      "step": 91
+    },
+    {
+      "epoch": 0.05888,
+      "grad_norm": 0.34648987650871277,
+      "learning_rate": 0.00012133333333333335,
+      "loss": 0.8311,
+      "step": 92
+    },
+    {
+      "epoch": 0.05952,
+      "grad_norm": 0.3485969603061676,
+      "learning_rate": 0.00012266666666666668,
+      "loss": 0.7708,
+      "step": 93
+    },
+    {
+      "epoch": 0.06016,
+      "grad_norm": 0.28423577547073364,
+      "learning_rate": 0.000124,
+      "loss": 0.7446,
+      "step": 94
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3227793574333191,
+      "learning_rate": 0.00012533333333333334,
+      "loss": 0.6656,
+      "step": 95
+    },
+    {
+      "epoch": 0.06144,
+      "grad_norm": 0.3523266911506653,
+      "learning_rate": 0.00012666666666666666,
+      "loss": 0.7765,
+      "step": 96
+    },
+    {
+      "epoch": 0.06208,
+      "grad_norm": 0.31250032782554626,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 0.7881,
+      "step": 97
+    },
+    {
+      "epoch": 0.06272,
+      "grad_norm": 0.3039192855358124,
+      "learning_rate": 0.00012933333333333332,
+      "loss": 0.7969,
+      "step": 98
+    },
+    {
+      "epoch": 0.06336,
+      "grad_norm": 0.32894226908683777,
+      "learning_rate": 0.00013066666666666668,
+      "loss": 0.6723,
+      "step": 99
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3103935122489929,
+      "learning_rate": 0.000132,
+      "loss": 0.8185,
+      "step": 100
+    },
+    {
+      "epoch": 0.06464,
+      "grad_norm": 0.3284664750099182,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.7811,
+      "step": 101
+    },
+    {
+      "epoch": 0.06528,
+      "grad_norm": 0.31105977296829224,
+      "learning_rate": 0.00013466666666666667,
+      "loss": 0.6877,
+      "step": 102
+    },
+    {
+      "epoch": 0.06592,
+      "grad_norm": 0.342144638299942,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 0.807,
+      "step": 103
+    },
+    {
+      "epoch": 0.06656,
+      "grad_norm": 0.3340494930744171,
+      "learning_rate": 0.00013733333333333333,
+      "loss": 0.7321,
+      "step": 104
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.30696117877960205,
+      "learning_rate": 0.00013866666666666669,
+      "loss": 0.6576,
+      "step": 105
+    },
+    {
+      "epoch": 0.06784,
+      "grad_norm": 0.30079448223114014,
+      "learning_rate": 0.00014,
+      "loss": 0.716,
+      "step": 106
+    },
+    {
+      "epoch": 0.06848,
+      "grad_norm": 0.32421207427978516,
+      "learning_rate": 0.00014133333333333334,
+      "loss": 0.7067,
+      "step": 107
+    },
+    {
+      "epoch": 0.06912,
+      "grad_norm": 0.34156566858291626,
+      "learning_rate": 0.00014266666666666667,
+      "loss": 0.7903,
+      "step": 108
+    },
+    {
+      "epoch": 0.06976,
+      "grad_norm": 0.31713947653770447,
+      "learning_rate": 0.000144,
+      "loss": 0.6748,
+      "step": 109
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.2779797315597534,
+      "learning_rate": 0.00014533333333333333,
+      "loss": 0.7532,
+      "step": 110
+    },
+    {
+      "epoch": 0.07104,
+      "grad_norm": 0.2949628531932831,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.6854,
+      "step": 111
+    },
+    {
+      "epoch": 0.07168,
+      "grad_norm": 0.30041080713272095,
+      "learning_rate": 0.000148,
+      "loss": 0.6923,
+      "step": 112
+    },
+    {
+      "epoch": 0.07232,
+      "grad_norm": 0.3237162232398987,
+      "learning_rate": 0.00014933333333333335,
+      "loss": 0.6486,
+      "step": 113
+    },
+    {
+      "epoch": 0.07296,
+      "grad_norm": 0.3145727217197418,
+      "learning_rate": 0.00015066666666666668,
+      "loss": 0.7314,
+      "step": 114
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3207891583442688,
+      "learning_rate": 0.000152,
+      "loss": 0.7386,
+      "step": 115
+    },
+    {
+      "epoch": 0.07424,
+      "grad_norm": 0.3066804111003876,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.8044,
+      "step": 116
+    },
+    {
+      "epoch": 0.07488,
+      "grad_norm": 0.31171250343322754,
+      "learning_rate": 0.00015466666666666667,
+      "loss": 0.7764,
+      "step": 117
+    },
+    {
+      "epoch": 0.07552,
+      "grad_norm": 0.2974667549133301,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 0.7191,
+      "step": 118
+    },
+    {
+      "epoch": 0.07616,
+      "grad_norm": 0.30347663164138794,
+      "learning_rate": 0.00015733333333333333,
+      "loss": 0.7289,
+      "step": 119
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.28048965334892273,
+      "learning_rate": 0.00015866666666666668,
+      "loss": 0.7528,
+      "step": 120
+    },
+    {
+      "epoch": 0.07744,
+      "grad_norm": 0.30916306376457214,
+      "learning_rate": 0.00016,
+      "loss": 0.7144,
+      "step": 121
+    },
+    {
+      "epoch": 0.07808,
+      "grad_norm": 0.3212970793247223,
+      "learning_rate": 0.00016133333333333334,
+      "loss": 0.697,
+      "step": 122
+    },
+    {
+      "epoch": 0.07872,
+      "grad_norm": 0.3115072250366211,
+      "learning_rate": 0.00016266666666666667,
+      "loss": 0.8522,
+      "step": 123
+    },
+    {
+      "epoch": 0.07936,
+      "grad_norm": 0.2989172339439392,
+      "learning_rate": 0.000164,
+      "loss": 0.7709,
+      "step": 124
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.301048219203949,
+      "learning_rate": 0.00016533333333333333,
+      "loss": 0.69,
+      "step": 125
+    },
+    {
+      "epoch": 0.08064,
+      "grad_norm": 0.32641011476516724,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.8069,
+      "step": 126
+    },
+    {
+      "epoch": 0.08128,
+      "grad_norm": 0.30696675181388855,
+      "learning_rate": 0.000168,
+      "loss": 0.813,
+      "step": 127
+    },
+    {
+      "epoch": 0.08192,
+      "grad_norm": 0.28849950432777405,
+      "learning_rate": 0.00016933333333333335,
+      "loss": 0.6871,
+      "step": 128
+    },
+    {
+      "epoch": 0.08256,
+      "grad_norm": 0.2904180586338043,
+      "learning_rate": 0.00017066666666666668,
+      "loss": 0.7063,
+      "step": 129
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3106667399406433,
+      "learning_rate": 0.000172,
+      "loss": 0.8531,
+      "step": 130
+    },
+    {
+      "epoch": 0.08384,
+      "grad_norm": 0.2927021384239197,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.6937,
+      "step": 131
+    },
+    {
+      "epoch": 0.08448,
+      "grad_norm": 0.3108760714530945,
+      "learning_rate": 0.00017466666666666667,
+      "loss": 0.7822,
+      "step": 132
+    },
+    {
+      "epoch": 0.08512,
+      "grad_norm": 0.3195500671863556,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.8762,
+      "step": 133
+    },
+    {
+      "epoch": 0.08576,
+      "grad_norm": 0.2844371199607849,
+      "learning_rate": 0.00017733333333333335,
+      "loss": 0.6747,
+      "step": 134
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.29041963815689087,
+      "learning_rate": 0.00017866666666666668,
+      "loss": 0.7687,
+      "step": 135
+    },
+    {
+      "epoch": 0.08704,
+      "grad_norm": 0.3079332411289215,
+      "learning_rate": 0.00018,
+      "loss": 0.7302,
+      "step": 136
+    },
+    {
+      "epoch": 0.08768,
+      "grad_norm": 0.3247929811477661,
+      "learning_rate": 0.00018133333333333334,
+      "loss": 0.7977,
+      "step": 137
+    },
+    {
+      "epoch": 0.08832,
+      "grad_norm": 0.2808547019958496,
+      "learning_rate": 0.00018266666666666667,
+      "loss": 0.7727,
+      "step": 138
+    },
+    {
+      "epoch": 0.08896,
+      "grad_norm": 0.30695173144340515,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.7746,
+      "step": 139
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2910364866256714,
+      "learning_rate": 0.00018533333333333333,
+      "loss": 0.7245,
+      "step": 140
+    },
+    {
+      "epoch": 0.09024,
+      "grad_norm": 0.3014585077762604,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.7615,
+      "step": 141
+    },
+    {
+      "epoch": 0.09088,
+      "grad_norm": 0.28013312816619873,
+      "learning_rate": 0.000188,
+      "loss": 0.7208,
+      "step": 142
+    },
+    {
+      "epoch": 0.09152,
+      "grad_norm": 0.29026126861572266,
+      "learning_rate": 0.00018933333333333335,
+      "loss": 0.708,
+      "step": 143
+    },
+    {
+      "epoch": 0.09216,
+      "grad_norm": 0.29961952567100525,
+      "learning_rate": 0.00019066666666666668,
+      "loss": 0.7403,
+      "step": 144
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3099021315574646,
+      "learning_rate": 0.000192,
+      "loss": 0.7026,
+      "step": 145
+    },
+    {
+      "epoch": 0.09344,
+      "grad_norm": 0.2849430739879608,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.7296,
+      "step": 146
+    },
+    {
+      "epoch": 0.09408,
+      "grad_norm": 0.28370675444602966,
+      "learning_rate": 0.0001946666666666667,
+      "loss": 0.7572,
+      "step": 147
+    },
+    {
+      "epoch": 0.09472,
+      "grad_norm": 0.29357901215553284,
+      "learning_rate": 0.000196,
+      "loss": 0.6787,
+      "step": 148
+    },
+    {
+      "epoch": 0.09536,
+      "grad_norm": 0.31645891070365906,
+      "learning_rate": 0.00019733333333333335,
+      "loss": 0.9229,
+      "step": 149
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.32968923449516296,
+      "learning_rate": 0.00019866666666666668,
+      "loss": 0.8676,
+      "step": 150
+    },
+    {
+      "epoch": 0.09664,
+      "grad_norm": 0.2814370095729828,
+      "learning_rate": 0.0002,
+      "loss": 0.7605,
+      "step": 151
+    },
+    {
+      "epoch": 0.09728,
+      "grad_norm": 0.33951953053474426,
+      "learning_rate": 0.00019995876288659794,
+      "loss": 0.6641,
+      "step": 152
+    },
+    {
+      "epoch": 0.09792,
+      "grad_norm": 0.290942907333374,
+      "learning_rate": 0.0001999175257731959,
+      "loss": 0.6975,
+      "step": 153
+    },
+    {
+      "epoch": 0.09856,
+      "grad_norm": 0.3173905611038208,
+      "learning_rate": 0.00019987628865979383,
+      "loss": 0.7418,
+      "step": 154
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.2619405686855316,
+      "learning_rate": 0.00019983505154639176,
+      "loss": 0.6878,
+      "step": 155
+    },
+    {
+      "epoch": 0.09984,
+      "grad_norm": 0.2939823865890503,
+      "learning_rate": 0.0001997938144329897,
+      "loss": 0.7463,
+      "step": 156
+    },
+    {
+      "epoch": 0.10048,
+      "grad_norm": 0.2828215956687927,
+      "learning_rate": 0.00019975257731958762,
+      "loss": 0.7422,
+      "step": 157
+    },
+    {
+      "epoch": 0.10112,
+      "grad_norm": 0.29763561487197876,
+      "learning_rate": 0.00019971134020618558,
+      "loss": 0.7002,
+      "step": 158
+    },
+    {
+      "epoch": 0.10176,
+      "grad_norm": 0.271653950214386,
+      "learning_rate": 0.00019967010309278351,
+      "loss": 0.6432,
+      "step": 159
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3049352765083313,
+      "learning_rate": 0.00019962886597938147,
+      "loss": 0.7761,
+      "step": 160
+    },
+    {
+      "epoch": 0.10304,
+      "grad_norm": 0.2919256389141083,
+      "learning_rate": 0.0001995876288659794,
+      "loss": 0.7133,
+      "step": 161
+    },
+    {
+      "epoch": 0.10368,
+      "grad_norm": 0.2686925530433655,
+      "learning_rate": 0.00019954639175257733,
+      "loss": 0.6926,
+      "step": 162
+    },
+    {
+      "epoch": 0.10432,
+      "grad_norm": 0.35965174436569214,
+      "learning_rate": 0.00019950515463917527,
+      "loss": 0.8662,
+      "step": 163
+    },
+    {
+      "epoch": 0.10496,
+      "grad_norm": 0.3050249218940735,
+      "learning_rate": 0.0001994639175257732,
+      "loss": 0.6815,
+      "step": 164
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3129306435585022,
+      "learning_rate": 0.00019942268041237116,
+      "loss": 0.752,
+      "step": 165
+    },
+    {
+      "epoch": 0.10624,
+      "grad_norm": 0.33631986379623413,
+      "learning_rate": 0.0001993814432989691,
+      "loss": 0.7659,
+      "step": 166
+    },
+    {
+      "epoch": 0.10688,
+      "grad_norm": 0.3103901743888855,
+      "learning_rate": 0.00019934020618556702,
+      "loss": 0.7861,
+      "step": 167
+    },
+    {
+      "epoch": 0.10752,
+      "grad_norm": 0.3209918439388275,
+      "learning_rate": 0.00019929896907216498,
+      "loss": 0.6942,
+      "step": 168
+    },
+    {
+      "epoch": 0.10816,
+      "grad_norm": 0.3065873682498932,
+      "learning_rate": 0.00019925773195876288,
+      "loss": 0.7872,
+      "step": 169
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3127877414226532,
+      "learning_rate": 0.0001992164948453608,
+      "loss": 0.7304,
+      "step": 170
+    },
+    {
+      "epoch": 0.10944,
+      "grad_norm": 0.28500568866729736,
+      "learning_rate": 0.00019917525773195877,
+      "loss": 0.6198,
+      "step": 171
+    },
+    {
+      "epoch": 0.11008,
+      "grad_norm": 0.28309932351112366,
+      "learning_rate": 0.0001991340206185567,
+      "loss": 0.6264,
+      "step": 172
+    },
+    {
+      "epoch": 0.11072,
+      "grad_norm": 0.25835469365119934,
+      "learning_rate": 0.00019909278350515466,
+      "loss": 0.6785,
+      "step": 173
+    },
+    {
+      "epoch": 0.11136,
+      "grad_norm": 0.302633672952652,
+      "learning_rate": 0.0001990515463917526,
+      "loss": 0.7509,
+      "step": 174
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.3255435526371002,
+      "learning_rate": 0.00019901030927835052,
+      "loss": 0.7079,
+      "step": 175
+    },
+    {
+      "epoch": 0.11264,
+      "grad_norm": 0.29552069306373596,
+      "learning_rate": 0.00019896907216494845,
+      "loss": 0.708,
+      "step": 176
+    },
+    {
+      "epoch": 0.11328,
+      "grad_norm": 0.30411866307258606,
+      "learning_rate": 0.00019892783505154639,
+      "loss": 0.7061,
+      "step": 177
+    },
+    {
+      "epoch": 0.11392,
+      "grad_norm": 0.31047070026397705,
+      "learning_rate": 0.00019888659793814434,
+      "loss": 0.72,
+      "step": 178
+    },
+    {
+      "epoch": 0.11456,
+      "grad_norm": 0.3015742301940918,
+      "learning_rate": 0.00019884536082474227,
+      "loss": 0.6723,
+      "step": 179
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3008481562137604,
+      "learning_rate": 0.0001988041237113402,
+      "loss": 0.7674,
+      "step": 180
+    },
+    {
+      "epoch": 0.11584,
+      "grad_norm": 0.2997855544090271,
+      "learning_rate": 0.00019876288659793816,
+      "loss": 0.7232,
+      "step": 181
+    },
+    {
+      "epoch": 0.11648,
+      "grad_norm": 0.29149672389030457,
+      "learning_rate": 0.0001987216494845361,
+      "loss": 0.6402,
+      "step": 182
+    },
+    {
+      "epoch": 0.11712,
+      "grad_norm": 0.3142986297607422,
+      "learning_rate": 0.00019868041237113403,
+      "loss": 0.6477,
+      "step": 183
+    },
+    {
+      "epoch": 0.11776,
+      "grad_norm": 0.31407997012138367,
+      "learning_rate": 0.00019863917525773196,
+      "loss": 0.7246,
+      "step": 184
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.2753082811832428,
+      "learning_rate": 0.0001985979381443299,
+      "loss": 0.7443,
+      "step": 185
+    },
+    {
+      "epoch": 0.11904,
+      "grad_norm": 0.3194136917591095,
+      "learning_rate": 0.00019855670103092785,
+      "loss": 0.7081,
+      "step": 186
+    },
+    {
+      "epoch": 0.11968,
+      "grad_norm": 0.31416574120521545,
+      "learning_rate": 0.00019851546391752578,
+      "loss": 0.6431,
+      "step": 187
+    },
+    {
+      "epoch": 0.12032,
+      "grad_norm": 0.3210378885269165,
+      "learning_rate": 0.00019847422680412374,
+      "loss": 0.6816,
+      "step": 188
+    },
+    {
+      "epoch": 0.12096,
+      "grad_norm": 0.30720171332359314,
+      "learning_rate": 0.00019843298969072167,
+      "loss": 0.78,
+      "step": 189
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.34126073122024536,
+      "learning_rate": 0.0001983917525773196,
+      "loss": 0.7791,
+      "step": 190
+    },
+    {
+      "epoch": 0.12224,
+      "grad_norm": 0.29785630106925964,
+      "learning_rate": 0.00019835051546391753,
+      "loss": 0.6336,
+      "step": 191
+    },
+    {
+      "epoch": 0.12288,
+      "grad_norm": 0.291119784116745,
+      "learning_rate": 0.00019830927835051546,
+      "loss": 0.7284,
+      "step": 192
+    },
+    {
+      "epoch": 0.12352,
+      "grad_norm": 0.317786306142807,
+      "learning_rate": 0.00019826804123711342,
+      "loss": 0.8124,
+      "step": 193
+    },
+    {
+      "epoch": 0.12416,
+      "grad_norm": 0.29936137795448303,
+      "learning_rate": 0.00019822680412371135,
+      "loss": 0.7288,
+      "step": 194
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.34165239334106445,
+      "learning_rate": 0.00019818556701030928,
+      "loss": 0.7556,
+      "step": 195
+    },
+    {
+      "epoch": 0.12544,
+      "grad_norm": 0.31708401441574097,
+      "learning_rate": 0.00019814432989690724,
+      "loss": 0.7874,
+      "step": 196
+    },
+    {
+      "epoch": 0.12608,
+      "grad_norm": 0.30362123250961304,
+      "learning_rate": 0.00019810309278350517,
+      "loss": 0.7761,
+      "step": 197
+    },
+    {
+      "epoch": 0.12672,
+      "grad_norm": 0.29262575507164,
+      "learning_rate": 0.0001980618556701031,
+      "loss": 0.6847,
+      "step": 198
+    },
+    {
+      "epoch": 0.12736,
+      "grad_norm": 0.29288071393966675,
+      "learning_rate": 0.00019802061855670104,
+      "loss": 0.7932,
+      "step": 199
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.30932584404945374,
+      "learning_rate": 0.00019797938144329897,
+      "loss": 0.7789,
+      "step": 200
+    },
+    {
+      "epoch": 0.12864,
+      "grad_norm": 0.2790425419807434,
+      "learning_rate": 0.00019793814432989693,
+      "loss": 0.755,
+      "step": 201
+    },
+    {
+      "epoch": 0.12928,
+      "grad_norm": 0.2887434959411621,
+      "learning_rate": 0.00019789690721649486,
+      "loss": 0.654,
+      "step": 202
+    },
+    {
+      "epoch": 0.12992,
+      "grad_norm": 0.2877611815929413,
+      "learning_rate": 0.0001978556701030928,
+      "loss": 0.7671,
+      "step": 203
+    },
+    {
+      "epoch": 0.13056,
+      "grad_norm": 0.3143225312232971,
+      "learning_rate": 0.00019781443298969075,
+      "loss": 0.8305,
+      "step": 204
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.32392239570617676,
+      "learning_rate": 0.00019777319587628865,
+      "loss": 0.7256,
+      "step": 205
+    },
+    {
+      "epoch": 0.13184,
+      "grad_norm": 0.34306666254997253,
+      "learning_rate": 0.0001977319587628866,
+      "loss": 0.7557,
+      "step": 206
+    },
+    {
+      "epoch": 0.13248,
+      "grad_norm": 0.3159867227077484,
+      "learning_rate": 0.00019769072164948454,
+      "loss": 0.8242,
+      "step": 207
+    },
+    {
+      "epoch": 0.13312,
+      "grad_norm": 0.30515992641448975,
+      "learning_rate": 0.00019764948453608247,
+      "loss": 0.7825,
+      "step": 208
+    },
+    {
+      "epoch": 0.13376,
+      "grad_norm": 0.3434230089187622,
+      "learning_rate": 0.00019760824742268043,
+      "loss": 0.647,
+      "step": 209
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.3035261929035187,
+      "learning_rate": 0.00019756701030927836,
+      "loss": 0.8179,
+      "step": 210
+    },
+    {
+      "epoch": 0.13504,
+      "grad_norm": 0.3019905686378479,
+      "learning_rate": 0.00019752577319587632,
+      "loss": 0.6927,
+      "step": 211
+    },
+    {
+      "epoch": 0.13568,
+      "grad_norm": 0.2871446907520294,
+      "learning_rate": 0.00019748453608247422,
+      "loss": 0.8086,
+      "step": 212
+    },
+    {
+      "epoch": 0.13632,
+      "grad_norm": 0.295788049697876,
+      "learning_rate": 0.00019744329896907216,
+      "loss": 0.6928,
+      "step": 213
+    },
+    {
+      "epoch": 0.13696,
+      "grad_norm": 0.30843695998191833,
+      "learning_rate": 0.0001974020618556701,
+      "loss": 0.6596,
+      "step": 214
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.34640681743621826,
+      "learning_rate": 0.00019736082474226804,
+      "loss": 0.7069,
+      "step": 215
+    },
+    {
+      "epoch": 0.13824,
+      "grad_norm": 0.3011893332004547,
+      "learning_rate": 0.000197319587628866,
+      "loss": 0.7283,
+      "step": 216
+    },
+    {
+      "epoch": 0.13888,
+      "grad_norm": 0.3524264395236969,
+      "learning_rate": 0.00019727835051546393,
+      "loss": 0.7265,
+      "step": 217
+    },
+    {
+      "epoch": 0.13952,
+      "grad_norm": 0.2686672508716583,
+      "learning_rate": 0.00019723711340206187,
+      "loss": 0.6644,
+      "step": 218
+    },
+    {
+      "epoch": 0.14016,
+      "grad_norm": 0.3050358295440674,
+      "learning_rate": 0.0001971958762886598,
+      "loss": 0.691,
+      "step": 219
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.28776687383651733,
+      "learning_rate": 0.00019715463917525773,
+      "loss": 0.7653,
+      "step": 220
+    },
+    {
+      "epoch": 0.14144,
+      "grad_norm": 0.2940015196800232,
+      "learning_rate": 0.0001971134020618557,
+      "loss": 0.7806,
+      "step": 221
+    },
+    {
+      "epoch": 0.14208,
+      "grad_norm": 0.3044091463088989,
+      "learning_rate": 0.00019707216494845362,
+      "loss": 0.712,
+      "step": 222
+    },
+    {
+      "epoch": 0.14272,
+      "grad_norm": 0.310905396938324,
+      "learning_rate": 0.00019703092783505155,
+      "loss": 0.7279,
+      "step": 223
+    },
+    {
+      "epoch": 0.14336,
+      "grad_norm": 0.28161653876304626,
+      "learning_rate": 0.0001969896907216495,
+      "loss": 0.7095,
+      "step": 224
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.29839521646499634,
+      "learning_rate": 0.00019694845360824744,
+      "loss": 0.7408,
+      "step": 225
+    },
+    {
+      "epoch": 0.14464,
+      "grad_norm": 0.32727834582328796,
+      "learning_rate": 0.00019690721649484537,
+      "loss": 0.776,
+      "step": 226
+    },
+    {
+      "epoch": 0.14528,
+      "grad_norm": 0.3301754295825958,
+      "learning_rate": 0.0001968659793814433,
+      "loss": 0.7859,
+      "step": 227
+    },
+    {
+      "epoch": 0.14592,
+      "grad_norm": 0.29604339599609375,
+      "learning_rate": 0.00019682474226804123,
+      "loss": 0.7962,
+      "step": 228
+    },
+    {
+      "epoch": 0.14656,
+      "grad_norm": 0.36882567405700684,
+      "learning_rate": 0.0001967835051546392,
+      "loss": 0.783,
+      "step": 229
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.32955777645111084,
+      "learning_rate": 0.00019674226804123712,
+      "loss": 0.7806,
+      "step": 230
+    },
+    {
+      "epoch": 0.14784,
+      "grad_norm": 0.30023613572120667,
+      "learning_rate": 0.00019670103092783505,
+      "loss": 0.6966,
+      "step": 231
+    },
+    {
+      "epoch": 0.14848,
+      "grad_norm": 0.33033934235572815,
+      "learning_rate": 0.000196659793814433,
+      "loss": 0.7132,
+      "step": 232
+    },
+    {
+      "epoch": 0.14912,
+      "grad_norm": 0.2974123954772949,
+      "learning_rate": 0.00019661855670103094,
+      "loss": 0.7452,
+      "step": 233
+    },
+    {
+      "epoch": 0.14976,
+      "grad_norm": 0.28271958231925964,
+      "learning_rate": 0.00019657731958762887,
+      "loss": 0.7568,
+      "step": 234
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2656266391277313,
+      "learning_rate": 0.0001965360824742268,
+      "loss": 0.7311,
+      "step": 235
+    },
+    {
+      "epoch": 0.15104,
+      "grad_norm": 0.285236656665802,
+      "learning_rate": 0.00019649484536082474,
+      "loss": 0.7986,
+      "step": 236
+    },
+    {
+      "epoch": 0.15168,
+      "grad_norm": 0.309198260307312,
+      "learning_rate": 0.0001964536082474227,
+      "loss": 0.7057,
+      "step": 237
+    },
+    {
+      "epoch": 0.15232,
+      "grad_norm": 0.28539809584617615,
+      "learning_rate": 0.00019641237113402063,
+      "loss": 0.7327,
+      "step": 238
+    },
+    {
+      "epoch": 0.15296,
+      "grad_norm": 0.29469773173332214,
+      "learning_rate": 0.00019637113402061859,
+      "loss": 0.7357,
+      "step": 239
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.29075542092323303,
+      "learning_rate": 0.00019632989690721652,
+      "loss": 0.7049,
+      "step": 240
+    },
+    {
+      "epoch": 0.15424,
+      "grad_norm": 0.33665356040000916,
+      "learning_rate": 0.00019628865979381442,
+      "loss": 0.8172,
+      "step": 241
+    },
+    {
+      "epoch": 0.15488,
+      "grad_norm": 0.3171047568321228,
+      "learning_rate": 0.00019624742268041238,
+      "loss": 0.8551,
+      "step": 242
+    },
+    {
+      "epoch": 0.15552,
+      "grad_norm": 0.3064482808113098,
+      "learning_rate": 0.0001962061855670103,
+      "loss": 0.7181,
+      "step": 243
+    },
+    {
+      "epoch": 0.15616,
+      "grad_norm": 0.3284786343574524,
+      "learning_rate": 0.00019616494845360827,
+      "loss": 0.765,
+      "step": 244
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.2786236107349396,
+      "learning_rate": 0.0001961237113402062,
+      "loss": 0.6615,
+      "step": 245
+    },
+    {
+      "epoch": 0.15744,
+      "grad_norm": 0.3064919710159302,
+      "learning_rate": 0.00019608247422680413,
+      "loss": 0.7738,
+      "step": 246
+    },
+    {
+      "epoch": 0.15808,
+      "grad_norm": 0.29397594928741455,
+      "learning_rate": 0.0001960412371134021,
+      "loss": 0.7277,
+      "step": 247
+    },
+    {
+      "epoch": 0.15872,
+      "grad_norm": 0.29518526792526245,
+      "learning_rate": 0.000196,
+      "loss": 0.6338,
+      "step": 248
+    },
+    {
+      "epoch": 0.15936,
+      "grad_norm": 0.3199787139892578,
+      "learning_rate": 0.00019595876288659795,
+      "loss": 0.7938,
+      "step": 249
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3214777112007141,
+      "learning_rate": 0.00019591752577319588,
+      "loss": 0.6859,
+      "step": 250
+    },
+    {
+      "epoch": 0.16064,
+      "grad_norm": 0.2887002229690552,
+      "learning_rate": 0.00019587628865979381,
+      "loss": 0.7135,
+      "step": 251
+    },
+    {
+      "epoch": 0.16128,
+      "grad_norm": 0.2781178951263428,
+      "learning_rate": 0.00019583505154639177,
+      "loss": 0.657,
+      "step": 252
+    },
+    {
+      "epoch": 0.16192,
+      "grad_norm": 0.296069860458374,
+      "learning_rate": 0.0001957938144329897,
+      "loss": 0.748,
+      "step": 253
+    },
+    {
+      "epoch": 0.16256,
+      "grad_norm": 0.3103037476539612,
+      "learning_rate": 0.00019575257731958764,
+      "loss": 0.7194,
+      "step": 254
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3148699998855591,
+      "learning_rate": 0.00019571134020618557,
+      "loss": 0.7694,
+      "step": 255
+    },
+    {
+      "epoch": 0.16384,
+      "grad_norm": 0.3139770030975342,
+      "learning_rate": 0.0001956701030927835,
+      "loss": 0.6795,
+      "step": 256
+    },
+    {
+      "epoch": 0.16448,
+      "grad_norm": 0.3233995735645294,
+      "learning_rate": 0.00019562886597938146,
+      "loss": 0.6597,
+      "step": 257
+    },
+    {
+      "epoch": 0.16512,
+      "grad_norm": 0.28936532139778137,
+      "learning_rate": 0.0001955876288659794,
+      "loss": 0.6939,
+      "step": 258
+    },
+    {
+      "epoch": 0.16576,
+      "grad_norm": 0.3064196705818176,
+      "learning_rate": 0.00019554639175257732,
+      "loss": 0.8145,
+      "step": 259
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.29498618841171265,
+      "learning_rate": 0.00019550515463917528,
+      "loss": 0.85,
+      "step": 260
+    },
+    {
+      "epoch": 0.16704,
+      "grad_norm": 0.3150680959224701,
+      "learning_rate": 0.0001954639175257732,
+      "loss": 0.7125,
+      "step": 261
+    },
+    {
+      "epoch": 0.16768,
+      "grad_norm": 0.2819904685020447,
+      "learning_rate": 0.00019542268041237114,
+      "loss": 0.7908,
+      "step": 262
+    },
+    {
+      "epoch": 0.16832,
+      "grad_norm": 0.3014717698097229,
+      "learning_rate": 0.00019538144329896907,
+      "loss": 0.6545,
+      "step": 263
+    },
+    {
+      "epoch": 0.16896,
+      "grad_norm": 0.2842758595943451,
+      "learning_rate": 0.000195340206185567,
+      "loss": 0.7416,
+      "step": 264
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3116391897201538,
+      "learning_rate": 0.00019529896907216496,
+      "loss": 0.8082,
+      "step": 265
+    },
+    {
+      "epoch": 0.17024,
+      "grad_norm": 0.3074429929256439,
+      "learning_rate": 0.0001952577319587629,
+      "loss": 0.6937,
+      "step": 266
+    },
+    {
+      "epoch": 0.17088,
+      "grad_norm": 0.30792734026908875,
+      "learning_rate": 0.00019521649484536085,
+      "loss": 0.7781,
+      "step": 267
+    },
+    {
+      "epoch": 0.17152,
+      "grad_norm": 0.31808385252952576,
+      "learning_rate": 0.00019517525773195878,
+      "loss": 0.7362,
+      "step": 268
+    },
+    {
+      "epoch": 0.17216,
+      "grad_norm": 0.31601226329803467,
+      "learning_rate": 0.0001951340206185567,
+      "loss": 0.8366,
+      "step": 269
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.2997550964355469,
+      "learning_rate": 0.00019509278350515464,
+      "loss": 0.6883,
+      "step": 270
+    },
+    {
+      "epoch": 0.17344,
+      "grad_norm": 0.32534125447273254,
+      "learning_rate": 0.00019505154639175258,
+      "loss": 0.7451,
+      "step": 271
+    },
+    {
+      "epoch": 0.17408,
+      "grad_norm": 0.3087508976459503,
+      "learning_rate": 0.00019501030927835053,
+      "loss": 0.7355,
+      "step": 272
+    },
+    {
+      "epoch": 0.17472,
+      "grad_norm": 0.3082291781902313,
+      "learning_rate": 0.00019496907216494847,
+      "loss": 0.6667,
+      "step": 273
+    },
+    {
+      "epoch": 0.17536,
+      "grad_norm": 0.2936236262321472,
+      "learning_rate": 0.0001949278350515464,
+      "loss": 0.7012,
+      "step": 274
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.31565698981285095,
+      "learning_rate": 0.00019488659793814435,
+      "loss": 0.7562,
+      "step": 275
+    },
+    {
+      "epoch": 0.17664,
+      "grad_norm": 0.2980412244796753,
+      "learning_rate": 0.00019484536082474229,
+      "loss": 0.8132,
+      "step": 276
+    },
+    {
+      "epoch": 0.17728,
+      "grad_norm": 0.28446707129478455,
+      "learning_rate": 0.00019480412371134022,
+      "loss": 0.7087,
+      "step": 277
+    },
+    {
+      "epoch": 0.17792,
+      "grad_norm": 0.3075585663318634,
+      "learning_rate": 0.00019476288659793815,
+      "loss": 0.7186,
+      "step": 278
+    },
+    {
+      "epoch": 0.17856,
+      "grad_norm": 0.34341374039649963,
+      "learning_rate": 0.00019472164948453608,
+      "loss": 0.8741,
+      "step": 279
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.2934441566467285,
+      "learning_rate": 0.00019468041237113404,
+      "loss": 0.7579,
+      "step": 280
+    },
+    {
+      "epoch": 0.17984,
+      "grad_norm": 0.3103979527950287,
+      "learning_rate": 0.00019463917525773197,
+      "loss": 0.9067,
+      "step": 281
+    },
+    {
+      "epoch": 0.18048,
+      "grad_norm": 0.2880348563194275,
+      "learning_rate": 0.00019459793814432993,
+      "loss": 0.8074,
+      "step": 282
+    },
+    {
+      "epoch": 0.18112,
+      "grad_norm": 0.31642019748687744,
+      "learning_rate": 0.00019455670103092786,
+      "loss": 0.7598,
+      "step": 283
+    },
+    {
+      "epoch": 0.18176,
+      "grad_norm": 0.298076331615448,
+      "learning_rate": 0.00019451546391752576,
+      "loss": 0.7023,
+      "step": 284
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.29522860050201416,
+      "learning_rate": 0.00019447422680412372,
+      "loss": 0.83,
+      "step": 285
+    },
+    {
+      "epoch": 0.18304,
+      "grad_norm": 0.30702751874923706,
+      "learning_rate": 0.00019443298969072165,
+      "loss": 0.6947,
+      "step": 286
+    },
+    {
+      "epoch": 0.18368,
+      "grad_norm": 0.31087347865104675,
+      "learning_rate": 0.0001943917525773196,
+      "loss": 0.7639,
+      "step": 287
+    },
+    {
+      "epoch": 0.18432,
+      "grad_norm": 0.32420194149017334,
+      "learning_rate": 0.00019435051546391754,
+      "loss": 0.7141,
+      "step": 288
+    },
+    {
+      "epoch": 0.18496,
+      "grad_norm": 0.3080320656299591,
+      "learning_rate": 0.00019430927835051547,
+      "loss": 0.6919,
+      "step": 289
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.33933165669441223,
+      "learning_rate": 0.0001942680412371134,
+      "loss": 0.798,
+      "step": 290
+    },
+    {
+      "epoch": 0.18624,
+      "grad_norm": 0.31771421432495117,
+      "learning_rate": 0.00019422680412371134,
+      "loss": 0.7,
+      "step": 291
+    },
+    {
+      "epoch": 0.18688,
+      "grad_norm": 0.307242214679718,
+      "learning_rate": 0.00019418556701030927,
+      "loss": 0.7267,
+      "step": 292
+    },
+    {
+      "epoch": 0.18752,
+      "grad_norm": 0.3211127519607544,
+      "learning_rate": 0.00019414432989690723,
+      "loss": 0.7053,
+      "step": 293
+    },
+    {
+      "epoch": 0.18816,
+      "grad_norm": 0.27931931614875793,
+      "learning_rate": 0.00019410309278350516,
+      "loss": 0.6285,
+      "step": 294
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.33384618163108826,
+      "learning_rate": 0.00019406185567010312,
+      "loss": 0.8749,
+      "step": 295
+    },
+    {
+      "epoch": 0.18944,
+      "grad_norm": 0.29143446683883667,
+      "learning_rate": 0.00019402061855670105,
+      "loss": 0.7627,
+      "step": 296
+    },
+    {
+      "epoch": 0.19008,
+      "grad_norm": 0.29536759853363037,
+      "learning_rate": 0.00019397938144329898,
+      "loss": 0.6539,
+      "step": 297
+    },
+    {
+      "epoch": 0.19072,
+      "grad_norm": 0.3121250867843628,
+      "learning_rate": 0.0001939381443298969,
+      "loss": 0.7931,
+      "step": 298
+    },
+    {
+      "epoch": 0.19136,
+      "grad_norm": 0.30223262310028076,
+      "learning_rate": 0.00019389690721649484,
+      "loss": 0.7557,
+      "step": 299
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3201999068260193,
+      "learning_rate": 0.0001938556701030928,
+      "loss": 0.7221,
+      "step": 300
+    },
+    {
+      "epoch": 0.19264,
+      "grad_norm": 0.29998666048049927,
+      "learning_rate": 0.00019381443298969073,
+      "loss": 0.7137,
+      "step": 301
+    },
+    {
+      "epoch": 0.19328,
+      "grad_norm": 0.2995801568031311,
+      "learning_rate": 0.00019377319587628866,
+      "loss": 0.7882,
+      "step": 302
+    },
+    {
+      "epoch": 0.19392,
+      "grad_norm": 0.31779414415359497,
+      "learning_rate": 0.00019373195876288662,
+      "loss": 0.7182,
+      "step": 303
+    },
+    {
+      "epoch": 0.19456,
+      "grad_norm": 0.30731555819511414,
+      "learning_rate": 0.00019369072164948455,
+      "loss": 0.7134,
+      "step": 304
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.30074137449264526,
+      "learning_rate": 0.00019364948453608248,
+      "loss": 0.6672,
+      "step": 305
+    },
+    {
+      "epoch": 0.19584,
+      "grad_norm": 0.291225403547287,
+      "learning_rate": 0.00019360824742268041,
+      "loss": 0.7608,
+      "step": 306
+    },
+    {
+      "epoch": 0.19648,
+      "grad_norm": 0.3082839548587799,
+      "learning_rate": 0.00019356701030927835,
+      "loss": 0.7222,
+      "step": 307
+    },
+    {
+      "epoch": 0.19712,
+      "grad_norm": 0.29131725430488586,
+      "learning_rate": 0.0001935257731958763,
+      "loss": 0.7472,
+      "step": 308
+    },
+    {
+      "epoch": 0.19776,
+      "grad_norm": 0.3589355945587158,
+      "learning_rate": 0.00019348453608247424,
+      "loss": 0.8046,
+      "step": 309
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3239380121231079,
+      "learning_rate": 0.0001934432989690722,
+      "loss": 0.7473,
+      "step": 310
+    },
+    {
+      "epoch": 0.19904,
+      "grad_norm": 0.30525436997413635,
+      "learning_rate": 0.00019340206185567012,
+      "loss": 0.7071,
+      "step": 311
+    },
+    {
+      "epoch": 0.19968,
+      "grad_norm": 0.280730277299881,
+      "learning_rate": 0.00019336082474226806,
+      "loss": 0.7066,
+      "step": 312
+    },
+    {
+      "epoch": 0.20032,
+      "grad_norm": 0.2713989019393921,
+      "learning_rate": 0.000193319587628866,
+      "loss": 0.6281,
+      "step": 313
+    },
+    {
+      "epoch": 0.20096,
+      "grad_norm": 0.29571083188056946,
+      "learning_rate": 0.00019327835051546392,
+      "loss": 0.6638,
+      "step": 314
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.31913262605667114,
+      "learning_rate": 0.00019323711340206188,
+      "loss": 0.6552,
+      "step": 315
+    },
+    {
+      "epoch": 0.20224,
+      "grad_norm": 0.28491929173469543,
+      "learning_rate": 0.0001931958762886598,
+      "loss": 0.7011,
+      "step": 316
+    },
+    {
+      "epoch": 0.20288,
+      "grad_norm": 0.3195081353187561,
+      "learning_rate": 0.00019315463917525774,
+      "loss": 0.7387,
+      "step": 317
+    },
+    {
+      "epoch": 0.20352,
+      "grad_norm": 0.33661699295043945,
+      "learning_rate": 0.0001931134020618557,
+      "loss": 0.6687,
+      "step": 318
+    },
+    {
+      "epoch": 0.20416,
+      "grad_norm": 0.2955434322357178,
+      "learning_rate": 0.0001930721649484536,
+      "loss": 0.7684,
+      "step": 319
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.28289079666137695,
+      "learning_rate": 0.00019303092783505153,
+      "loss": 0.6572,
+      "step": 320
+    },
+    {
+      "epoch": 0.20544,
+      "grad_norm": 0.3094116747379303,
+      "learning_rate": 0.0001929896907216495,
+      "loss": 0.6768,
+      "step": 321
+    },
+    {
+      "epoch": 0.20608,
+      "grad_norm": 0.325794517993927,
+      "learning_rate": 0.00019294845360824742,
+      "loss": 0.7834,
+      "step": 322
+    },
+    {
+      "epoch": 0.20672,
+      "grad_norm": 0.2948189377784729,
+      "learning_rate": 0.00019290721649484538,
+      "loss": 0.6914,
+      "step": 323
+    },
+    {
+      "epoch": 0.20736,
+      "grad_norm": 0.30359920859336853,
+      "learning_rate": 0.0001928659793814433,
+      "loss": 0.7238,
+      "step": 324
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.31644025444984436,
+      "learning_rate": 0.00019282474226804124,
+      "loss": 0.7689,
+      "step": 325
+    },
+    {
+      "epoch": 0.20864,
+      "grad_norm": 0.3070341646671295,
+      "learning_rate": 0.00019278350515463918,
+      "loss": 0.7513,
+      "step": 326
+    },
+    {
+      "epoch": 0.20928,
+      "grad_norm": 0.30977943539619446,
+      "learning_rate": 0.0001927422680412371,
+      "loss": 0.7387,
+      "step": 327
+    },
+    {
+      "epoch": 0.20992,
+      "grad_norm": 0.2939586341381073,
+      "learning_rate": 0.00019270103092783506,
+      "loss": 0.702,
+      "step": 328
+    },
+    {
+      "epoch": 0.21056,
+      "grad_norm": 0.31276288628578186,
+      "learning_rate": 0.000192659793814433,
+      "loss": 0.8218,
+      "step": 329
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.2854152321815491,
+      "learning_rate": 0.00019261855670103093,
+      "loss": 0.7118,
+      "step": 330
+    },
+    {
+      "epoch": 0.21184,
+      "grad_norm": 0.29665955901145935,
+      "learning_rate": 0.00019257731958762889,
+      "loss": 0.6829,
+      "step": 331
+    },
+    {
+      "epoch": 0.21248,
+      "grad_norm": 0.323794424533844,
+      "learning_rate": 0.00019253608247422682,
+      "loss": 0.6829,
+      "step": 332
+    },
+    {
+      "epoch": 0.21312,
+      "grad_norm": 0.29837536811828613,
+      "learning_rate": 0.00019249484536082475,
+      "loss": 0.6792,
+      "step": 333
+    },
+    {
+      "epoch": 0.21376,
+      "grad_norm": 0.2850310206413269,
+      "learning_rate": 0.00019245360824742268,
+      "loss": 0.7633,
+      "step": 334
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.33181217312812805,
+      "learning_rate": 0.0001924123711340206,
+      "loss": 0.7211,
+      "step": 335
+    },
+    {
+      "epoch": 0.21504,
+      "grad_norm": 0.2929527759552002,
+      "learning_rate": 0.00019237113402061857,
+      "loss": 0.7471,
+      "step": 336
+    },
+    {
+      "epoch": 0.21568,
+      "grad_norm": 0.3233756721019745,
+      "learning_rate": 0.0001923298969072165,
+      "loss": 0.6448,
+      "step": 337
+    },
+    {
+      "epoch": 0.21632,
+      "grad_norm": 0.288126677274704,
+      "learning_rate": 0.00019228865979381446,
+      "loss": 0.7007,
+      "step": 338
+    },
+    {
+      "epoch": 0.21696,
+      "grad_norm": 0.3070041239261627,
+      "learning_rate": 0.0001922474226804124,
+      "loss": 0.6973,
+      "step": 339
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3397638201713562,
+      "learning_rate": 0.00019220618556701032,
+      "loss": 0.8021,
+      "step": 340
+    },
+    {
+      "epoch": 0.21824,
+      "grad_norm": 0.3471389412879944,
+      "learning_rate": 0.00019216494845360825,
+      "loss": 0.703,
+      "step": 341
+    },
+    {
+      "epoch": 0.21888,
+      "grad_norm": 0.30681872367858887,
+      "learning_rate": 0.00019212371134020618,
+      "loss": 0.7937,
+      "step": 342
+    },
+    {
+      "epoch": 0.21952,
+      "grad_norm": 0.3199484646320343,
+      "learning_rate": 0.00019208247422680414,
+      "loss": 0.7815,
+      "step": 343
+    },
+    {
+      "epoch": 0.22016,
+      "grad_norm": 0.30274996161460876,
+      "learning_rate": 0.00019204123711340207,
+      "loss": 0.6108,
+      "step": 344
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.30199941992759705,
+      "learning_rate": 0.000192,
+      "loss": 0.6959,
+      "step": 345
+    },
+    {
+      "epoch": 0.22144,
+      "grad_norm": 0.2959499955177307,
+      "learning_rate": 0.00019195876288659796,
+      "loss": 0.8194,
+      "step": 346
+    },
+    {
+      "epoch": 0.22208,
+      "grad_norm": 0.32915207743644714,
+      "learning_rate": 0.0001919175257731959,
+      "loss": 0.7702,
+      "step": 347
+    },
+    {
+      "epoch": 0.22272,
+      "grad_norm": 0.3221168518066406,
+      "learning_rate": 0.00019187628865979383,
+      "loss": 0.6817,
+      "step": 348
+    },
+    {
+      "epoch": 0.22336,
+      "grad_norm": 0.3403017222881317,
+      "learning_rate": 0.00019183505154639176,
+      "loss": 0.7964,
+      "step": 349
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.32378193736076355,
+      "learning_rate": 0.0001917938144329897,
+      "loss": 0.6296,
+      "step": 350
+    },
+    {
+      "epoch": 0.22464,
+      "grad_norm": 0.3086368143558502,
+      "learning_rate": 0.00019175257731958765,
+      "loss": 0.802,
+      "step": 351
+    },
+    {
+      "epoch": 0.22528,
+      "grad_norm": 0.3113609850406647,
+      "learning_rate": 0.00019171134020618558,
+      "loss": 0.7497,
+      "step": 352
+    },
+    {
+      "epoch": 0.22592,
+      "grad_norm": 0.31738126277923584,
+      "learning_rate": 0.0001916701030927835,
+      "loss": 0.7499,
+      "step": 353
+    },
+    {
+      "epoch": 0.22656,
+      "grad_norm": 0.3091988265514374,
+      "learning_rate": 0.00019162886597938147,
+      "loss": 0.7301,
+      "step": 354
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3437817096710205,
+      "learning_rate": 0.00019158762886597937,
+      "loss": 0.7669,
+      "step": 355
+    },
+    {
+      "epoch": 0.22784,
+      "grad_norm": 0.2972509264945984,
+      "learning_rate": 0.00019154639175257733,
+      "loss": 0.7769,
+      "step": 356
+    },
+    {
+      "epoch": 0.22848,
+      "grad_norm": 0.30129578709602356,
+      "learning_rate": 0.00019150515463917526,
+      "loss": 0.7598,
+      "step": 357
+    },
+    {
+      "epoch": 0.22912,
+      "grad_norm": 0.3107849359512329,
+      "learning_rate": 0.0001914639175257732,
+      "loss": 0.6709,
+      "step": 358
+    },
+    {
+      "epoch": 0.22976,
+      "grad_norm": 0.35392850637435913,
+      "learning_rate": 0.00019142268041237115,
+      "loss": 0.8555,
+      "step": 359
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.30610328912734985,
+      "learning_rate": 0.00019138144329896908,
+      "loss": 0.7808,
+      "step": 360
+    },
+    {
+      "epoch": 0.23104,
+      "grad_norm": 0.35875633358955383,
+      "learning_rate": 0.00019134020618556704,
+      "loss": 0.8592,
+      "step": 361
+    },
+    {
+      "epoch": 0.23168,
+      "grad_norm": 0.3110954761505127,
+      "learning_rate": 0.00019129896907216494,
+      "loss": 0.7078,
+      "step": 362
+    },
+    {
+      "epoch": 0.23232,
+      "grad_norm": 0.27289018034935,
+      "learning_rate": 0.00019125773195876288,
+      "loss": 0.6692,
+      "step": 363
+    },
+    {
+      "epoch": 0.23296,
+      "grad_norm": 0.3352426290512085,
+      "learning_rate": 0.00019121649484536083,
+      "loss": 0.7528,
+      "step": 364
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.29104283452033997,
+      "learning_rate": 0.00019117525773195877,
+      "loss": 0.7467,
+      "step": 365
+    },
+    {
+      "epoch": 0.23424,
+      "grad_norm": 0.32967209815979004,
+      "learning_rate": 0.00019113402061855672,
+      "loss": 0.6876,
+      "step": 366
+    },
+    {
+      "epoch": 0.23488,
+      "grad_norm": 0.3251032531261444,
+      "learning_rate": 0.00019109278350515466,
+      "loss": 0.7207,
+      "step": 367
+    },
+    {
+      "epoch": 0.23552,
+      "grad_norm": 0.3755771219730377,
+      "learning_rate": 0.0001910515463917526,
+      "loss": 0.8832,
+      "step": 368
+    },
+    {
+      "epoch": 0.23616,
+      "grad_norm": 0.30565115809440613,
+      "learning_rate": 0.00019101030927835052,
+      "loss": 0.7321,
+      "step": 369
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.29345694184303284,
+      "learning_rate": 0.00019096907216494845,
+      "loss": 0.7635,
+      "step": 370
+    },
+    {
+      "epoch": 0.23744,
+      "grad_norm": 0.31952527165412903,
+      "learning_rate": 0.0001909278350515464,
+      "loss": 0.7105,
+      "step": 371
+    },
+    {
+      "epoch": 0.23808,
+      "grad_norm": 0.2801203727722168,
+      "learning_rate": 0.00019088659793814434,
+      "loss": 0.6931,
+      "step": 372
+    },
+    {
+      "epoch": 0.23872,
+      "grad_norm": 0.3395910859107971,
+      "learning_rate": 0.00019084536082474227,
+      "loss": 0.8312,
+      "step": 373
+    },
+    {
+      "epoch": 0.23936,
+      "grad_norm": 0.29749882221221924,
+      "learning_rate": 0.00019080412371134023,
+      "loss": 0.8044,
+      "step": 374
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3202598989009857,
+      "learning_rate": 0.00019076288659793816,
+      "loss": 0.7597,
+      "step": 375
+    },
+    {
+      "epoch": 0.24064,
+      "grad_norm": 0.31127414107322693,
+      "learning_rate": 0.0001907216494845361,
+      "loss": 0.6789,
+      "step": 376
+    },
+    {
+      "epoch": 0.24128,
+      "grad_norm": 0.312526136636734,
+      "learning_rate": 0.00019068041237113402,
+      "loss": 0.6663,
+      "step": 377
+    },
+    {
+      "epoch": 0.24192,
+      "grad_norm": 0.3485929071903229,
+      "learning_rate": 0.00019063917525773195,
+      "loss": 0.7969,
+      "step": 378
+    },
+    {
+      "epoch": 0.24256,
+      "grad_norm": 0.31651830673217773,
+      "learning_rate": 0.0001905979381443299,
+      "loss": 0.7138,
+      "step": 379
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.32089951634407043,
+      "learning_rate": 0.00019055670103092784,
+      "loss": 0.7305,
+      "step": 380
+    },
+    {
+      "epoch": 0.24384,
+      "grad_norm": 0.30567267537117004,
+      "learning_rate": 0.00019051546391752577,
+      "loss": 0.764,
+      "step": 381
+    },
+    {
+      "epoch": 0.24448,
+      "grad_norm": 0.32761886715888977,
+      "learning_rate": 0.00019047422680412373,
+      "loss": 0.7246,
+      "step": 382
+    },
+    {
+      "epoch": 0.24512,
+      "grad_norm": 0.3102027475833893,
+      "learning_rate": 0.00019043298969072166,
+      "loss": 0.7343,
+      "step": 383
+    },
+    {
+      "epoch": 0.24576,
+      "grad_norm": 0.30226072669029236,
+      "learning_rate": 0.0001903917525773196,
+      "loss": 0.6858,
+      "step": 384
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.2709049880504608,
+      "learning_rate": 0.00019035051546391753,
+      "loss": 0.6157,
+      "step": 385
+    },
+    {
+      "epoch": 0.24704,
+      "grad_norm": 0.31257662177085876,
+      "learning_rate": 0.00019030927835051546,
+      "loss": 0.6197,
+      "step": 386
+    },
+    {
+      "epoch": 0.24768,
+      "grad_norm": 0.3185356855392456,
+      "learning_rate": 0.00019026804123711342,
+      "loss": 0.6526,
+      "step": 387
+    },
+    {
+      "epoch": 0.24832,
+      "grad_norm": 0.3150332272052765,
+      "learning_rate": 0.00019022680412371135,
+      "loss": 0.7083,
+      "step": 388
+    },
+    {
+      "epoch": 0.24896,
+      "grad_norm": 0.3336682617664337,
+      "learning_rate": 0.0001901855670103093,
+      "loss": 0.7431,
+      "step": 389
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3282051682472229,
+      "learning_rate": 0.00019014432989690724,
+      "loss": 0.7113,
+      "step": 390
+    },
+    {
+      "epoch": 0.25024,
+      "grad_norm": 0.3453441858291626,
+      "learning_rate": 0.00019010309278350514,
+      "loss": 0.855,
+      "step": 391
+    },
+    {
+      "epoch": 0.25088,
+      "grad_norm": 0.3266576826572418,
+      "learning_rate": 0.0001900618556701031,
+      "loss": 0.6524,
+      "step": 392
+    },
+    {
+      "epoch": 0.25152,
+      "grad_norm": 0.300717830657959,
+      "learning_rate": 0.00019002061855670103,
+      "loss": 0.6772,
+      "step": 393
+    },
+    {
+      "epoch": 0.25216,
+      "grad_norm": 0.2966857850551605,
+      "learning_rate": 0.000189979381443299,
+      "loss": 0.6347,
+      "step": 394
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.26930931210517883,
+      "learning_rate": 0.00018993814432989692,
+      "loss": 0.6857,
+      "step": 395
+    },
+    {
+      "epoch": 0.25344,
+      "grad_norm": 0.3187718093395233,
+      "learning_rate": 0.00018989690721649485,
+      "loss": 0.7721,
+      "step": 396
+    },
+    {
+      "epoch": 0.25408,
+      "grad_norm": 0.3119317293167114,
+      "learning_rate": 0.0001898556701030928,
+      "loss": 0.8068,
+      "step": 397
+    },
+    {
+      "epoch": 0.25472,
+      "grad_norm": 0.3182973563671112,
+      "learning_rate": 0.00018981443298969071,
+      "loss": 0.695,
+      "step": 398
+    },
+    {
+      "epoch": 0.25536,
+      "grad_norm": 0.31561198830604553,
+      "learning_rate": 0.00018977319587628867,
+      "loss": 0.736,
+      "step": 399
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3218092620372772,
+      "learning_rate": 0.0001897319587628866,
+      "loss": 0.761,
+      "step": 400
+    },
+    {
+      "epoch": 0.25664,
+      "grad_norm": 0.33970728516578674,
+      "learning_rate": 0.00018969072164948454,
+      "loss": 0.7104,
+      "step": 401
+    },
+    {
+      "epoch": 0.25728,
+      "grad_norm": 0.27917760610580444,
+      "learning_rate": 0.0001896494845360825,
+      "loss": 0.7128,
+      "step": 402
+    },
+    {
+      "epoch": 0.25792,
+      "grad_norm": 0.3380156457424164,
+      "learning_rate": 0.00018960824742268043,
+      "loss": 0.6953,
+      "step": 403
+    },
+    {
+      "epoch": 0.25856,
+      "grad_norm": 0.3621920049190521,
+      "learning_rate": 0.00018956701030927836,
+      "loss": 0.7469,
+      "step": 404
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.2967955470085144,
+      "learning_rate": 0.0001895257731958763,
+      "loss": 0.8306,
+      "step": 405
+    },
+    {
+      "epoch": 0.25984,
+      "grad_norm": 0.3198679983615875,
+      "learning_rate": 0.00018948453608247422,
+      "loss": 0.6471,
+      "step": 406
+    },
+    {
+      "epoch": 0.26048,
+      "grad_norm": 0.30034980177879333,
+      "learning_rate": 0.00018944329896907218,
+      "loss": 0.718,
+      "step": 407
+    },
+    {
+      "epoch": 0.26112,
+      "grad_norm": 0.3326522707939148,
+      "learning_rate": 0.0001894020618556701,
+      "loss": 0.643,
+      "step": 408
+    },
+    {
+      "epoch": 0.26176,
+      "grad_norm": 0.3270142674446106,
+      "learning_rate": 0.00018936082474226804,
+      "loss": 0.7433,
+      "step": 409
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.31910303235054016,
+      "learning_rate": 0.000189319587628866,
+      "loss": 0.7663,
+      "step": 410
+    },
+    {
+      "epoch": 0.26304,
+      "grad_norm": 0.2916382849216461,
+      "learning_rate": 0.00018927835051546393,
+      "loss": 0.7819,
+      "step": 411
+    },
+    {
+      "epoch": 0.26368,
+      "grad_norm": 0.3039431571960449,
+      "learning_rate": 0.00018923711340206186,
+      "loss": 0.6154,
+      "step": 412
+    },
+    {
+      "epoch": 0.26432,
+      "grad_norm": 0.33125901222229004,
+      "learning_rate": 0.0001891958762886598,
+      "loss": 0.6401,
+      "step": 413
+    },
+    {
+      "epoch": 0.26496,
+      "grad_norm": 0.33211550116539,
+      "learning_rate": 0.00018915463917525772,
+      "loss": 0.6546,
+      "step": 414
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.2855866551399231,
+      "learning_rate": 0.00018911340206185568,
+      "loss": 0.7732,
+      "step": 415
+    },
+    {
+      "epoch": 0.26624,
+      "grad_norm": 0.28563952445983887,
+      "learning_rate": 0.0001890721649484536,
+      "loss": 0.7375,
+      "step": 416
+    },
+    {
+      "epoch": 0.26688,
+      "grad_norm": 0.31947028636932373,
+      "learning_rate": 0.00018903092783505157,
+      "loss": 0.7006,
+      "step": 417
+    },
+    {
+      "epoch": 0.26752,
+      "grad_norm": 0.30819830298423767,
+      "learning_rate": 0.0001889896907216495,
+      "loss": 0.803,
+      "step": 418
+    },
+    {
+      "epoch": 0.26816,
+      "grad_norm": 0.31668663024902344,
+      "learning_rate": 0.00018894845360824743,
+      "loss": 0.775,
+      "step": 419
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.2968495190143585,
+      "learning_rate": 0.00018890721649484537,
+      "loss": 0.7604,
+      "step": 420
+    },
+    {
+      "epoch": 0.26944,
+      "grad_norm": 0.3007802963256836,
+      "learning_rate": 0.0001888659793814433,
+      "loss": 0.6976,
+      "step": 421
+    },
+    {
+      "epoch": 0.27008,
+      "grad_norm": 0.3191244602203369,
+      "learning_rate": 0.00018882474226804126,
+      "loss": 0.773,
+      "step": 422
+    },
+    {
+      "epoch": 0.27072,
+      "grad_norm": 0.30016082525253296,
+      "learning_rate": 0.00018878350515463919,
+      "loss": 0.7836,
+      "step": 423
+    },
+    {
+      "epoch": 0.27136,
+      "grad_norm": 0.3263542354106903,
+      "learning_rate": 0.00018874226804123712,
+      "loss": 0.6471,
+      "step": 424
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.31896039843559265,
+      "learning_rate": 0.00018870103092783508,
+      "loss": 0.618,
+      "step": 425
+    },
+    {
+      "epoch": 0.27264,
+      "grad_norm": 0.3496481478214264,
+      "learning_rate": 0.000188659793814433,
+      "loss": 0.7501,
+      "step": 426
+    },
+    {
+      "epoch": 0.27328,
+      "grad_norm": 0.32309311628341675,
+      "learning_rate": 0.00018861855670103094,
+      "loss": 0.742,
+      "step": 427
+    },
+    {
+      "epoch": 0.27392,
+      "grad_norm": 0.33307623863220215,
+      "learning_rate": 0.00018857731958762887,
+      "loss": 0.7473,
+      "step": 428
+    },
+    {
+      "epoch": 0.27456,
+      "grad_norm": 0.3056267499923706,
+      "learning_rate": 0.0001885360824742268,
+      "loss": 0.677,
+      "step": 429
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3015357553958893,
+      "learning_rate": 0.00018849484536082476,
+      "loss": 0.8583,
+      "step": 430
+    },
+    {
+      "epoch": 0.27584,
+      "grad_norm": 0.34647443890571594,
+      "learning_rate": 0.0001884536082474227,
+      "loss": 0.7137,
+      "step": 431
+    },
+    {
+      "epoch": 0.27648,
+      "grad_norm": 0.29870176315307617,
+      "learning_rate": 0.00018841237113402065,
+      "loss": 0.7199,
+      "step": 432
+    },
+    {
+      "epoch": 0.27712,
+      "grad_norm": 0.2656466066837311,
+      "learning_rate": 0.00018837113402061858,
+      "loss": 0.6994,
+      "step": 433
+    },
+    {
+      "epoch": 0.27776,
+      "grad_norm": 0.3220881223678589,
+      "learning_rate": 0.00018832989690721648,
+      "loss": 0.7069,
+      "step": 434
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3169515132904053,
+      "learning_rate": 0.00018828865979381444,
+      "loss": 0.7575,
+      "step": 435
+    },
+    {
+      "epoch": 0.27904,
+      "grad_norm": 0.33047881722450256,
+      "learning_rate": 0.00018824742268041237,
+      "loss": 0.6844,
+      "step": 436
+    },
+    {
+      "epoch": 0.27968,
+      "grad_norm": 0.2863800525665283,
+      "learning_rate": 0.00018820618556701033,
+      "loss": 0.6259,
+      "step": 437
+    },
+    {
+      "epoch": 0.28032,
+      "grad_norm": 0.3175205588340759,
+      "learning_rate": 0.00018816494845360826,
+      "loss": 0.7278,
+      "step": 438
+    },
+    {
+      "epoch": 0.28096,
+      "grad_norm": 0.29102823138237,
+      "learning_rate": 0.0001881237113402062,
+      "loss": 0.7613,
+      "step": 439
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3170917332172394,
+      "learning_rate": 0.00018808247422680413,
+      "loss": 0.8114,
+      "step": 440
+    },
+    {
+      "epoch": 0.28224,
+      "grad_norm": 0.32672619819641113,
+      "learning_rate": 0.00018804123711340206,
+      "loss": 0.6385,
+      "step": 441
+    },
+    {
+      "epoch": 0.28288,
+      "grad_norm": 0.3139190971851349,
+      "learning_rate": 0.000188,
+      "loss": 0.864,
+      "step": 442
+    },
+    {
+      "epoch": 0.28352,
+      "grad_norm": 0.3168294131755829,
+      "learning_rate": 0.00018795876288659795,
+      "loss": 0.6955,
+      "step": 443
+    },
+    {
+      "epoch": 0.28416,
+      "grad_norm": 0.28454044461250305,
+      "learning_rate": 0.00018791752577319588,
+      "loss": 0.6282,
+      "step": 444
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.2962913513183594,
+      "learning_rate": 0.00018787628865979384,
+      "loss": 0.7146,
+      "step": 445
+    },
+    {
+      "epoch": 0.28544,
+      "grad_norm": 0.3182694613933563,
+      "learning_rate": 0.00018783505154639177,
+      "loss": 0.731,
+      "step": 446
+    },
+    {
+      "epoch": 0.28608,
+      "grad_norm": 0.30787360668182373,
+      "learning_rate": 0.0001877938144329897,
+      "loss": 0.7222,
+      "step": 447
+    },
+    {
+      "epoch": 0.28672,
+      "grad_norm": 0.3067628741264343,
+      "learning_rate": 0.00018775257731958763,
+      "loss": 0.7853,
+      "step": 448
+    },
+    {
+      "epoch": 0.28736,
+      "grad_norm": 0.31983888149261475,
+      "learning_rate": 0.00018771134020618556,
+      "loss": 0.8183,
+      "step": 449
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.28941866755485535,
+      "learning_rate": 0.00018767010309278352,
+      "loss": 0.7665,
+      "step": 450
+    },
+    {
+      "epoch": 0.28864,
+      "grad_norm": 0.33365219831466675,
+      "learning_rate": 0.00018762886597938145,
+      "loss": 0.8302,
+      "step": 451
+    },
+    {
+      "epoch": 0.28928,
+      "grad_norm": 0.32798248529434204,
+      "learning_rate": 0.00018758762886597938,
+      "loss": 0.6545,
+      "step": 452
+    },
+    {
+      "epoch": 0.28992,
+      "grad_norm": 0.3089250326156616,
+      "learning_rate": 0.00018754639175257734,
+      "loss": 0.6906,
+      "step": 453
+    },
+    {
+      "epoch": 0.29056,
+      "grad_norm": 0.2974041700363159,
+      "learning_rate": 0.00018750515463917527,
+      "loss": 0.8122,
+      "step": 454
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3062783181667328,
+      "learning_rate": 0.0001874639175257732,
+      "loss": 0.6368,
+      "step": 455
+    },
+    {
+      "epoch": 0.29184,
+      "grad_norm": 0.32750093936920166,
+      "learning_rate": 0.00018742268041237114,
+      "loss": 0.72,
+      "step": 456
+    },
+    {
+      "epoch": 0.29248,
+      "grad_norm": 0.28121083974838257,
+      "learning_rate": 0.00018738144329896907,
+      "loss": 0.7208,
+      "step": 457
+    },
+    {
+      "epoch": 0.29312,
+      "grad_norm": 0.3138170838356018,
+      "learning_rate": 0.00018734020618556702,
+      "loss": 0.7919,
+      "step": 458
+    },
+    {
+      "epoch": 0.29376,
+      "grad_norm": 0.34761953353881836,
+      "learning_rate": 0.00018729896907216496,
+      "loss": 0.7225,
+      "step": 459
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.36817216873168945,
+      "learning_rate": 0.00018725773195876291,
+      "loss": 0.6676,
+      "step": 460
+    },
+    {
+      "epoch": 0.29504,
+      "grad_norm": 0.30768880248069763,
+      "learning_rate": 0.00018721649484536085,
+      "loss": 0.6524,
+      "step": 461
+    },
+    {
+      "epoch": 0.29568,
+      "grad_norm": 0.3130376935005188,
+      "learning_rate": 0.00018717525773195878,
+      "loss": 0.7513,
+      "step": 462
+    },
+    {
+      "epoch": 0.29632,
+      "grad_norm": 0.30241551995277405,
+      "learning_rate": 0.0001871340206185567,
+      "loss": 0.659,
+      "step": 463
+    },
+    {
+      "epoch": 0.29696,
+      "grad_norm": 0.3223501443862915,
+      "learning_rate": 0.00018709278350515464,
+      "loss": 0.7486,
+      "step": 464
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3197935223579407,
+      "learning_rate": 0.0001870515463917526,
+      "loss": 0.7157,
+      "step": 465
+    },
+    {
+      "epoch": 0.29824,
+      "grad_norm": 0.3418366014957428,
+      "learning_rate": 0.00018701030927835053,
+      "loss": 0.7414,
+      "step": 466
+    },
+    {
+      "epoch": 0.29888,
+      "grad_norm": 0.30754223465919495,
+      "learning_rate": 0.00018696907216494846,
+      "loss": 0.6865,
+      "step": 467
+    },
+    {
+      "epoch": 0.29952,
+      "grad_norm": 0.3156290352344513,
+      "learning_rate": 0.00018692783505154642,
+      "loss": 0.7565,
+      "step": 468
+    },
+    {
+      "epoch": 0.30016,
+      "grad_norm": 0.32256850600242615,
+      "learning_rate": 0.00018688659793814432,
+      "loss": 0.7249,
+      "step": 469
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.31836530566215515,
+      "learning_rate": 0.00018684536082474225,
+      "loss": 0.6787,
+      "step": 470
+    },
+    {
+      "epoch": 0.30144,
+      "grad_norm": 0.3175853192806244,
+      "learning_rate": 0.0001868041237113402,
+      "loss": 0.6755,
+      "step": 471
+    },
+    {
+      "epoch": 0.30208,
+      "grad_norm": 0.3368013799190521,
+      "learning_rate": 0.00018676288659793814,
+      "loss": 0.733,
+      "step": 472
+    },
+    {
+      "epoch": 0.30272,
+      "grad_norm": 0.3084401488304138,
+      "learning_rate": 0.0001867216494845361,
+      "loss": 0.6819,
+      "step": 473
+    },
+    {
+      "epoch": 0.30336,
+      "grad_norm": 0.3278505802154541,
+      "learning_rate": 0.00018668041237113403,
+      "loss": 0.7327,
+      "step": 474
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.33178839087486267,
+      "learning_rate": 0.00018663917525773196,
+      "loss": 0.7066,
+      "step": 475
+    },
+    {
+      "epoch": 0.30464,
+      "grad_norm": 0.34669503569602966,
+      "learning_rate": 0.0001865979381443299,
+      "loss": 0.7668,
+      "step": 476
+    },
+    {
+      "epoch": 0.30528,
+      "grad_norm": 0.33522486686706543,
+      "learning_rate": 0.00018655670103092783,
+      "loss": 0.7528,
+      "step": 477
+    },
+    {
+      "epoch": 0.30592,
+      "grad_norm": 0.3110879063606262,
+      "learning_rate": 0.00018651546391752579,
+      "loss": 0.6319,
+      "step": 478
+    },
+    {
+      "epoch": 0.30656,
+      "grad_norm": 0.30556023120880127,
+      "learning_rate": 0.00018647422680412372,
+      "loss": 0.704,
+      "step": 479
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.28580984473228455,
+      "learning_rate": 0.00018643298969072165,
+      "loss": 0.6515,
+      "step": 480
+    },
+    {
+      "epoch": 0.30784,
+      "grad_norm": 0.32033205032348633,
+      "learning_rate": 0.0001863917525773196,
+      "loss": 0.8232,
+      "step": 481
+    },
+    {
+      "epoch": 0.30848,
+      "grad_norm": 0.3057061731815338,
+      "learning_rate": 0.00018635051546391754,
+      "loss": 0.7137,
+      "step": 482
+    },
+    {
+      "epoch": 0.30912,
+      "grad_norm": 0.295612096786499,
+      "learning_rate": 0.00018630927835051547,
+      "loss": 0.6721,
+      "step": 483
+    },
+    {
+      "epoch": 0.30976,
+      "grad_norm": 0.3382381796836853,
+      "learning_rate": 0.0001862680412371134,
+      "loss": 0.7338,
+      "step": 484
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.30730482935905457,
+      "learning_rate": 0.00018622680412371133,
+      "loss": 0.677,
+      "step": 485
+    },
+    {
+      "epoch": 0.31104,
+      "grad_norm": 0.3336603045463562,
+      "learning_rate": 0.0001861855670103093,
+      "loss": 0.6866,
+      "step": 486
+    },
+    {
+      "epoch": 0.31168,
+      "grad_norm": 0.32780736684799194,
+      "learning_rate": 0.00018614432989690722,
+      "loss": 0.7177,
+      "step": 487
+    },
+    {
+      "epoch": 0.31232,
+      "grad_norm": 0.3147113621234894,
+      "learning_rate": 0.00018610309278350518,
+      "loss": 0.7784,
+      "step": 488
+    },
+    {
+      "epoch": 0.31296,
+      "grad_norm": 0.301918089389801,
+      "learning_rate": 0.0001860618556701031,
+      "loss": 0.7823,
+      "step": 489
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.30411750078201294,
+      "learning_rate": 0.00018602061855670104,
+      "loss": 0.5905,
+      "step": 490
+    },
+    {
+      "epoch": 0.31424,
+      "grad_norm": 0.31566423177719116,
+      "learning_rate": 0.00018597938144329897,
+      "loss": 0.6916,
+      "step": 491
+    },
+    {
+      "epoch": 0.31488,
+      "grad_norm": 0.32280048727989197,
+      "learning_rate": 0.0001859381443298969,
+      "loss": 0.8135,
+      "step": 492
+    },
+    {
+      "epoch": 0.31552,
+      "grad_norm": 0.3336408734321594,
+      "learning_rate": 0.00018589690721649486,
+      "loss": 0.8329,
+      "step": 493
+    },
+    {
+      "epoch": 0.31616,
+      "grad_norm": 0.3190591037273407,
+      "learning_rate": 0.0001858556701030928,
+      "loss": 0.7271,
+      "step": 494
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.317510724067688,
+      "learning_rate": 0.00018581443298969073,
+      "loss": 0.7531,
+      "step": 495
+    },
+    {
+      "epoch": 0.31744,
+      "grad_norm": 0.3220292031764984,
+      "learning_rate": 0.00018577319587628868,
+      "loss": 0.7468,
+      "step": 496
+    },
+    {
+      "epoch": 0.31808,
+      "grad_norm": 0.33332934975624084,
+      "learning_rate": 0.00018573195876288662,
+      "loss": 0.7782,
+      "step": 497
+    },
+    {
+      "epoch": 0.31872,
+      "grad_norm": 0.31983014941215515,
+      "learning_rate": 0.00018569072164948455,
+      "loss": 0.6785,
+      "step": 498
+    },
+    {
+      "epoch": 0.31936,
+      "grad_norm": 0.30875346064567566,
+      "learning_rate": 0.00018564948453608248,
+      "loss": 0.7293,
+      "step": 499
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3104054033756256,
+      "learning_rate": 0.0001856082474226804,
+      "loss": 0.8647,
+      "step": 500
+    },
+    {
+      "epoch": 0.32064,
+      "grad_norm": 0.3232969343662262,
+      "learning_rate": 0.00018556701030927837,
+      "loss": 0.7075,
+      "step": 501
+    },
+    {
+      "epoch": 0.32128,
+      "grad_norm": 0.30094465613365173,
+      "learning_rate": 0.0001855257731958763,
+      "loss": 0.7349,
+      "step": 502
+    },
+    {
+      "epoch": 0.32192,
+      "grad_norm": 0.31936967372894287,
+      "learning_rate": 0.00018548453608247423,
+      "loss": 0.6942,
+      "step": 503
+    },
+    {
+      "epoch": 0.32256,
+      "grad_norm": 0.30298441648483276,
+      "learning_rate": 0.0001854432989690722,
+      "loss": 0.672,
+      "step": 504
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.32368478178977966,
+      "learning_rate": 0.0001854020618556701,
+      "loss": 0.8569,
+      "step": 505
+    },
+    {
+      "epoch": 0.32384,
+      "grad_norm": 0.315775066614151,
+      "learning_rate": 0.00018536082474226805,
+      "loss": 0.7074,
+      "step": 506
+    },
+    {
+      "epoch": 0.32448,
+      "grad_norm": 0.3086112141609192,
+      "learning_rate": 0.00018531958762886598,
+      "loss": 0.6999,
+      "step": 507
+    },
+    {
+      "epoch": 0.32512,
+      "grad_norm": 0.3149326741695404,
+      "learning_rate": 0.00018527835051546391,
+      "loss": 0.7788,
+      "step": 508
+    },
+    {
+      "epoch": 0.32576,
+      "grad_norm": 0.33051931858062744,
+      "learning_rate": 0.00018523711340206187,
+      "loss": 0.7185,
+      "step": 509
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.33453604578971863,
+      "learning_rate": 0.0001851958762886598,
+      "loss": 0.8052,
+      "step": 510
+    },
+    {
+      "epoch": 0.32704,
+      "grad_norm": 0.3444625735282898,
+      "learning_rate": 0.00018515463917525776,
+      "loss": 0.7368,
+      "step": 511
+    },
+    {
+      "epoch": 0.32768,
+      "grad_norm": 0.3116897642612457,
+      "learning_rate": 0.00018511340206185567,
+      "loss": 0.8394,
+      "step": 512
+    },
+    {
+      "epoch": 0.32832,
+      "grad_norm": 0.29211899638175964,
+      "learning_rate": 0.0001850721649484536,
+      "loss": 0.7944,
+      "step": 513
+    },
+    {
+      "epoch": 0.32896,
+      "grad_norm": 0.2860858142375946,
+      "learning_rate": 0.00018503092783505156,
+      "loss": 0.6336,
+      "step": 514
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3269069790840149,
+      "learning_rate": 0.0001849896907216495,
+      "loss": 0.8108,
+      "step": 515
+    },
+    {
+      "epoch": 0.33024,
+      "grad_norm": 0.2810918092727661,
+      "learning_rate": 0.00018494845360824745,
+      "loss": 0.6293,
+      "step": 516
+    },
+    {
+      "epoch": 0.33088,
+      "grad_norm": 0.3181713819503784,
+      "learning_rate": 0.00018490721649484538,
+      "loss": 0.8255,
+      "step": 517
+    },
+    {
+      "epoch": 0.33152,
+      "grad_norm": 0.2919624149799347,
+      "learning_rate": 0.0001848659793814433,
+      "loss": 0.6341,
+      "step": 518
+    },
+    {
+      "epoch": 0.33216,
+      "grad_norm": 0.31228554248809814,
+      "learning_rate": 0.00018482474226804124,
+      "loss": 0.7362,
+      "step": 519
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3720919191837311,
+      "learning_rate": 0.00018478350515463917,
+      "loss": 0.847,
+      "step": 520
+    },
+    {
+      "epoch": 0.33344,
+      "grad_norm": 0.30149605870246887,
+      "learning_rate": 0.00018474226804123713,
+      "loss": 0.7663,
+      "step": 521
+    },
+    {
+      "epoch": 0.33408,
+      "grad_norm": 0.3007080554962158,
+      "learning_rate": 0.00018470103092783506,
+      "loss": 0.7402,
+      "step": 522
+    },
+    {
+      "epoch": 0.33472,
+      "grad_norm": 0.3232499659061432,
+      "learning_rate": 0.000184659793814433,
+      "loss": 0.6512,
+      "step": 523
+    },
+    {
+      "epoch": 0.33536,
+      "grad_norm": 0.33574604988098145,
+      "learning_rate": 0.00018461855670103095,
+      "loss": 0.7921,
+      "step": 524
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3470611274242401,
+      "learning_rate": 0.00018457731958762888,
+      "loss": 0.7603,
+      "step": 525
+    },
+    {
+      "epoch": 0.33664,
+      "grad_norm": 0.3139747977256775,
+      "learning_rate": 0.0001845360824742268,
+      "loss": 0.7882,
+      "step": 526
+    },
+    {
+      "epoch": 0.33728,
+      "grad_norm": 0.32434818148612976,
+      "learning_rate": 0.00018449484536082474,
+      "loss": 0.7353,
+      "step": 527
+    },
+    {
+      "epoch": 0.33792,
+      "grad_norm": 0.3150811791419983,
+      "learning_rate": 0.00018445360824742267,
+      "loss": 0.8075,
+      "step": 528
+    },
+    {
+      "epoch": 0.33856,
+      "grad_norm": 0.31737667322158813,
+      "learning_rate": 0.00018441237113402063,
+      "loss": 0.6079,
+      "step": 529
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.33673223853111267,
+      "learning_rate": 0.00018437113402061856,
+      "loss": 0.741,
+      "step": 530
+    },
+    {
+      "epoch": 0.33984,
+      "grad_norm": 0.32814377546310425,
+      "learning_rate": 0.0001843298969072165,
+      "loss": 0.7121,
+      "step": 531
+    },
+    {
+      "epoch": 0.34048,
+      "grad_norm": 0.31837186217308044,
+      "learning_rate": 0.00018428865979381445,
+      "loss": 0.7648,
+      "step": 532
+    },
+    {
+      "epoch": 0.34112,
+      "grad_norm": 0.30446767807006836,
+      "learning_rate": 0.00018424742268041239,
+      "loss": 0.5615,
+      "step": 533
+    },
+    {
+      "epoch": 0.34176,
+      "grad_norm": 0.31597232818603516,
+      "learning_rate": 0.00018420618556701032,
+      "loss": 0.7937,
+      "step": 534
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3117212653160095,
+      "learning_rate": 0.00018416494845360825,
+      "loss": 0.7124,
+      "step": 535
+    },
+    {
+      "epoch": 0.34304,
+      "grad_norm": 0.3184095621109009,
+      "learning_rate": 0.00018412371134020618,
+      "loss": 0.7149,
+      "step": 536
+    },
+    {
+      "epoch": 0.34368,
+      "grad_norm": 0.3065534234046936,
+      "learning_rate": 0.00018408247422680414,
+      "loss": 0.682,
+      "step": 537
+    },
+    {
+      "epoch": 0.34432,
+      "grad_norm": 0.30083295702934265,
+      "learning_rate": 0.00018404123711340207,
+      "loss": 0.6837,
+      "step": 538
+    },
+    {
+      "epoch": 0.34496,
+      "grad_norm": 0.3477546274662018,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.768,
+      "step": 539
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.30386462807655334,
+      "learning_rate": 0.00018395876288659796,
+      "loss": 0.7838,
+      "step": 540
+    },
+    {
+      "epoch": 0.34624,
+      "grad_norm": 0.30766555666923523,
+      "learning_rate": 0.00018391752577319586,
+      "loss": 0.7411,
+      "step": 541
+    },
+    {
+      "epoch": 0.34688,
+      "grad_norm": 0.28504031896591187,
+      "learning_rate": 0.00018387628865979382,
+      "loss": 0.6744,
+      "step": 542
+    },
+    {
+      "epoch": 0.34752,
+      "grad_norm": 0.2924272119998932,
+      "learning_rate": 0.00018383505154639175,
+      "loss": 0.688,
+      "step": 543
+    },
+    {
+      "epoch": 0.34816,
+      "grad_norm": 0.3069016635417938,
+      "learning_rate": 0.0001837938144329897,
+      "loss": 0.7277,
+      "step": 544
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3368631601333618,
+      "learning_rate": 0.00018375257731958764,
+      "loss": 0.7493,
+      "step": 545
+    },
+    {
+      "epoch": 0.34944,
+      "grad_norm": 0.3183855414390564,
+      "learning_rate": 0.00018371134020618557,
+      "loss": 0.7747,
+      "step": 546
+    },
+    {
+      "epoch": 0.35008,
+      "grad_norm": 0.32446008920669556,
+      "learning_rate": 0.00018367010309278353,
+      "loss": 0.6704,
+      "step": 547
+    },
+    {
+      "epoch": 0.35072,
+      "grad_norm": 0.34938210248947144,
+      "learning_rate": 0.00018362886597938144,
+      "loss": 0.7892,
+      "step": 548
+    },
+    {
+      "epoch": 0.35136,
+      "grad_norm": 0.2923019826412201,
+      "learning_rate": 0.0001835876288659794,
+      "loss": 0.7432,
+      "step": 549
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3351273238658905,
+      "learning_rate": 0.00018354639175257733,
+      "loss": 0.7012,
+      "step": 550
+    },
+    {
+      "epoch": 0.35264,
+      "grad_norm": 0.32217180728912354,
+      "learning_rate": 0.00018350515463917526,
+      "loss": 0.7507,
+      "step": 551
+    },
+    {
+      "epoch": 0.35328,
+      "grad_norm": 0.3369709849357605,
+      "learning_rate": 0.00018346391752577322,
+      "loss": 0.7765,
+      "step": 552
+    },
+    {
+      "epoch": 0.35392,
+      "grad_norm": 0.3284771144390106,
+      "learning_rate": 0.00018342268041237115,
+      "loss": 0.6704,
+      "step": 553
+    },
+    {
+      "epoch": 0.35456,
+      "grad_norm": 0.3167947232723236,
+      "learning_rate": 0.00018338144329896908,
+      "loss": 0.6291,
+      "step": 554
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.2957805097103119,
+      "learning_rate": 0.000183340206185567,
+      "loss": 0.6652,
+      "step": 555
+    },
+    {
+      "epoch": 0.35584,
+      "grad_norm": 0.3164500594139099,
+      "learning_rate": 0.00018329896907216494,
+      "loss": 0.8223,
+      "step": 556
+    },
+    {
+      "epoch": 0.35648,
+      "grad_norm": 0.3126467168331146,
+      "learning_rate": 0.0001832577319587629,
+      "loss": 0.6573,
+      "step": 557
+    },
+    {
+      "epoch": 0.35712,
+      "grad_norm": 0.31474146246910095,
+      "learning_rate": 0.00018321649484536083,
+      "loss": 0.6528,
+      "step": 558
+    },
+    {
+      "epoch": 0.35776,
+      "grad_norm": 0.35340675711631775,
+      "learning_rate": 0.0001831752577319588,
+      "loss": 0.8127,
+      "step": 559
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.33534303307533264,
+      "learning_rate": 0.00018313402061855672,
+      "loss": 0.6999,
+      "step": 560
+    },
+    {
+      "epoch": 0.35904,
+      "grad_norm": 0.3314031958580017,
+      "learning_rate": 0.00018309278350515465,
+      "loss": 0.6535,
+      "step": 561
+    },
+    {
+      "epoch": 0.35968,
+      "grad_norm": 0.3222474753856659,
+      "learning_rate": 0.00018305154639175258,
+      "loss": 0.7352,
+      "step": 562
+    },
+    {
+      "epoch": 0.36032,
+      "grad_norm": 0.2976485788822174,
+      "learning_rate": 0.0001830103092783505,
+      "loss": 0.6229,
+      "step": 563
+    },
+    {
+      "epoch": 0.36096,
+      "grad_norm": 0.3436873257160187,
+      "learning_rate": 0.00018296907216494844,
+      "loss": 0.6886,
+      "step": 564
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3189018964767456,
+      "learning_rate": 0.0001829278350515464,
+      "loss": 0.7893,
+      "step": 565
+    },
+    {
+      "epoch": 0.36224,
+      "grad_norm": 0.3039281368255615,
+      "learning_rate": 0.00018288659793814433,
+      "loss": 0.6964,
+      "step": 566
+    },
+    {
+      "epoch": 0.36288,
+      "grad_norm": 0.32948389649391174,
+      "learning_rate": 0.0001828453608247423,
+      "loss": 0.6845,
+      "step": 567
+    },
+    {
+      "epoch": 0.36352,
+      "grad_norm": 0.30820393562316895,
+      "learning_rate": 0.00018280412371134022,
+      "loss": 0.8031,
+      "step": 568
+    },
+    {
+      "epoch": 0.36416,
+      "grad_norm": 0.33417025208473206,
+      "learning_rate": 0.00018276288659793816,
+      "loss": 0.8031,
+      "step": 569
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.31425607204437256,
+      "learning_rate": 0.0001827216494845361,
+      "loss": 0.7717,
+      "step": 570
+    },
+    {
+      "epoch": 0.36544,
+      "grad_norm": 0.29514193534851074,
+      "learning_rate": 0.00018268041237113402,
+      "loss": 0.5876,
+      "step": 571
+    },
+    {
+      "epoch": 0.36608,
+      "grad_norm": 0.3510682284832001,
+      "learning_rate": 0.00018263917525773198,
+      "loss": 0.6231,
+      "step": 572
+    },
+    {
+      "epoch": 0.36672,
+      "grad_norm": 0.2989828884601593,
+      "learning_rate": 0.0001825979381443299,
+      "loss": 0.7944,
+      "step": 573
+    },
+    {
+      "epoch": 0.36736,
+      "grad_norm": 0.3122637867927551,
+      "learning_rate": 0.00018255670103092784,
+      "loss": 0.6865,
+      "step": 574
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3377734422683716,
+      "learning_rate": 0.0001825154639175258,
+      "loss": 0.7755,
+      "step": 575
+    },
+    {
+      "epoch": 0.36864,
+      "grad_norm": 0.33263498544692993,
+      "learning_rate": 0.00018247422680412373,
+      "loss": 0.6879,
+      "step": 576
+    },
+    {
+      "epoch": 0.36928,
+      "grad_norm": 0.29371869564056396,
+      "learning_rate": 0.00018243298969072166,
+      "loss": 0.6858,
+      "step": 577
+    },
+    {
+      "epoch": 0.36992,
+      "grad_norm": 0.31379029154777527,
+      "learning_rate": 0.0001823917525773196,
+      "loss": 0.6524,
+      "step": 578
+    },
+    {
+      "epoch": 0.37056,
+      "grad_norm": 0.31624743342399597,
+      "learning_rate": 0.00018235051546391752,
+      "loss": 0.6046,
+      "step": 579
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.31369104981422424,
+      "learning_rate": 0.00018230927835051548,
+      "loss": 0.6339,
+      "step": 580
+    },
+    {
+      "epoch": 0.37184,
+      "grad_norm": 0.3421556055545807,
+      "learning_rate": 0.0001822680412371134,
+      "loss": 0.7818,
+      "step": 581
+    },
+    {
+      "epoch": 0.37248,
+      "grad_norm": 0.3361581265926361,
+      "learning_rate": 0.00018222680412371137,
+      "loss": 0.6696,
+      "step": 582
+    },
+    {
+      "epoch": 0.37312,
+      "grad_norm": 0.29861336946487427,
+      "learning_rate": 0.0001821855670103093,
+      "loss": 0.7688,
+      "step": 583
+    },
+    {
+      "epoch": 0.37376,
+      "grad_norm": 0.3147892951965332,
+      "learning_rate": 0.0001821443298969072,
+      "loss": 0.7119,
+      "step": 584
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3357600271701813,
+      "learning_rate": 0.00018210309278350516,
+      "loss": 0.8376,
+      "step": 585
+    },
+    {
+      "epoch": 0.37504,
+      "grad_norm": 0.356073796749115,
+      "learning_rate": 0.0001820618556701031,
+      "loss": 0.767,
+      "step": 586
+    },
+    {
+      "epoch": 0.37568,
+      "grad_norm": 0.32769110798835754,
+      "learning_rate": 0.00018202061855670105,
+      "loss": 0.6884,
+      "step": 587
+    },
+    {
+      "epoch": 0.37632,
+      "grad_norm": 0.3291815221309662,
+      "learning_rate": 0.00018197938144329898,
+      "loss": 0.6887,
+      "step": 588
+    },
+    {
+      "epoch": 0.37696,
+      "grad_norm": 0.32520556449890137,
+      "learning_rate": 0.00018193814432989692,
+      "loss": 0.6619,
+      "step": 589
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.31937602162361145,
+      "learning_rate": 0.00018189690721649485,
+      "loss": 0.7888,
+      "step": 590
+    },
+    {
+      "epoch": 0.37824,
+      "grad_norm": 0.28734534978866577,
+      "learning_rate": 0.00018185567010309278,
+      "loss": 0.6354,
+      "step": 591
+    },
+    {
+      "epoch": 0.37888,
+      "grad_norm": 0.359750360250473,
+      "learning_rate": 0.0001818144329896907,
+      "loss": 0.7663,
+      "step": 592
+    },
+    {
+      "epoch": 0.37952,
+      "grad_norm": 0.3230169713497162,
+      "learning_rate": 0.00018177319587628867,
+      "loss": 0.7597,
+      "step": 593
+    },
+    {
+      "epoch": 0.38016,
+      "grad_norm": 0.30331283807754517,
+      "learning_rate": 0.0001817319587628866,
+      "loss": 0.7407,
+      "step": 594
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3382543623447418,
+      "learning_rate": 0.00018169072164948456,
+      "loss": 0.5983,
+      "step": 595
+    },
+    {
+      "epoch": 0.38144,
+      "grad_norm": 0.31564861536026,
+      "learning_rate": 0.0001816494845360825,
+      "loss": 0.6453,
+      "step": 596
+    },
+    {
+      "epoch": 0.38208,
+      "grad_norm": 0.29695287346839905,
+      "learning_rate": 0.00018160824742268042,
+      "loss": 0.7001,
+      "step": 597
+    },
+    {
+      "epoch": 0.38272,
+      "grad_norm": 0.3027021288871765,
+      "learning_rate": 0.00018156701030927835,
+      "loss": 0.7316,
+      "step": 598
+    },
+    {
+      "epoch": 0.38336,
+      "grad_norm": 0.3261658549308777,
+      "learning_rate": 0.00018152577319587628,
+      "loss": 0.7279,
+      "step": 599
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.31820055842399597,
+      "learning_rate": 0.00018148453608247424,
+      "loss": 0.6472,
+      "step": 600
+    },
+    {
+      "epoch": 0.38464,
+      "grad_norm": 0.29130715131759644,
+      "learning_rate": 0.00018144329896907217,
+      "loss": 0.6452,
+      "step": 601
+    },
+    {
+      "epoch": 0.38528,
+      "grad_norm": 0.3414769172668457,
+      "learning_rate": 0.0001814020618556701,
+      "loss": 0.8091,
+      "step": 602
+    },
+    {
+      "epoch": 0.38592,
+      "grad_norm": 0.323068231344223,
+      "learning_rate": 0.00018136082474226806,
+      "loss": 0.8272,
+      "step": 603
+    },
+    {
+      "epoch": 0.38656,
+      "grad_norm": 0.30228736996650696,
+      "learning_rate": 0.000181319587628866,
+      "loss": 0.6859,
+      "step": 604
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.2849486768245697,
+      "learning_rate": 0.00018127835051546393,
+      "loss": 0.6738,
+      "step": 605
+    },
+    {
+      "epoch": 0.38784,
+      "grad_norm": 0.3121798038482666,
+      "learning_rate": 0.00018123711340206186,
+      "loss": 0.7491,
+      "step": 606
+    },
+    {
+      "epoch": 0.38848,
+      "grad_norm": 0.29611915349960327,
+      "learning_rate": 0.0001811958762886598,
+      "loss": 0.6918,
+      "step": 607
+    },
+    {
+      "epoch": 0.38912,
+      "grad_norm": 0.32199347019195557,
+      "learning_rate": 0.00018115463917525775,
+      "loss": 0.7524,
+      "step": 608
+    },
+    {
+      "epoch": 0.38976,
+      "grad_norm": 0.2937721610069275,
+      "learning_rate": 0.00018111340206185568,
+      "loss": 0.6408,
+      "step": 609
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.29907405376434326,
+      "learning_rate": 0.00018107216494845364,
+      "loss": 0.6683,
+      "step": 610
+    },
+    {
+      "epoch": 0.39104,
+      "grad_norm": 0.33071473240852356,
+      "learning_rate": 0.00018103092783505157,
+      "loss": 0.6956,
+      "step": 611
+    },
+    {
+      "epoch": 0.39168,
+      "grad_norm": 0.3242502808570862,
+      "learning_rate": 0.0001809896907216495,
+      "loss": 0.7464,
+      "step": 612
+    },
+    {
+      "epoch": 0.39232,
+      "grad_norm": 0.3559030592441559,
+      "learning_rate": 0.00018094845360824743,
+      "loss": 0.6681,
+      "step": 613
+    },
+    {
+      "epoch": 0.39296,
+      "grad_norm": 0.2962280809879303,
+      "learning_rate": 0.00018090721649484536,
+      "loss": 0.7762,
+      "step": 614
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.28452780842781067,
+      "learning_rate": 0.00018086597938144332,
+      "loss": 0.7021,
+      "step": 615
+    },
+    {
+      "epoch": 0.39424,
+      "grad_norm": 0.35916876792907715,
+      "learning_rate": 0.00018082474226804125,
+      "loss": 0.7758,
+      "step": 616
+    },
+    {
+      "epoch": 0.39488,
+      "grad_norm": 0.3153558373451233,
+      "learning_rate": 0.00018078350515463918,
+      "loss": 0.695,
+      "step": 617
+    },
+    {
+      "epoch": 0.39552,
+      "grad_norm": 0.3376162648200989,
+      "learning_rate": 0.00018074226804123714,
+      "loss": 0.6825,
+      "step": 618
+    },
+    {
+      "epoch": 0.39616,
+      "grad_norm": 0.3012005388736725,
+      "learning_rate": 0.00018070103092783507,
+      "loss": 0.6841,
+      "step": 619
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.29879581928253174,
+      "learning_rate": 0.000180659793814433,
+      "loss": 0.6009,
+      "step": 620
+    },
+    {
+      "epoch": 0.39744,
+      "grad_norm": 0.34166449308395386,
+      "learning_rate": 0.00018061855670103093,
+      "loss": 0.7356,
+      "step": 621
+    },
+    {
+      "epoch": 0.39808,
+      "grad_norm": 0.307958722114563,
+      "learning_rate": 0.00018057731958762887,
+      "loss": 0.7801,
+      "step": 622
+    },
+    {
+      "epoch": 0.39872,
+      "grad_norm": 0.3334157168865204,
+      "learning_rate": 0.00018053608247422682,
+      "loss": 0.7001,
+      "step": 623
+    },
+    {
+      "epoch": 0.39936,
+      "grad_norm": 0.32468053698539734,
+      "learning_rate": 0.00018049484536082475,
+      "loss": 0.8652,
+      "step": 624
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3132997155189514,
+      "learning_rate": 0.00018045360824742269,
+      "loss": 0.6874,
+      "step": 625
+    },
+    {
+      "epoch": 0.40064,
+      "grad_norm": 0.31620553135871887,
+      "learning_rate": 0.00018041237113402062,
+      "loss": 0.6982,
+      "step": 626
+    },
+    {
+      "epoch": 0.40128,
+      "grad_norm": 0.3290863037109375,
+      "learning_rate": 0.00018037113402061855,
+      "loss": 0.7402,
+      "step": 627
+    },
+    {
+      "epoch": 0.40192,
+      "grad_norm": 0.3129526674747467,
+      "learning_rate": 0.0001803298969072165,
+      "loss": 0.7625,
+      "step": 628
+    },
+    {
+      "epoch": 0.40256,
+      "grad_norm": 0.33314669132232666,
+      "learning_rate": 0.00018028865979381444,
+      "loss": 0.721,
+      "step": 629
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34176406264305115,
+      "learning_rate": 0.00018024742268041237,
+      "loss": 0.7526,
+      "step": 630
+    },
+    {
+      "epoch": 0.40384,
+      "grad_norm": 0.3387579321861267,
+      "learning_rate": 0.00018020618556701033,
+      "loss": 0.7795,
+      "step": 631
+    },
+    {
+      "epoch": 0.40448,
+      "grad_norm": 0.36744049191474915,
+      "learning_rate": 0.00018016494845360826,
+      "loss": 0.7363,
+      "step": 632
+    },
+    {
+      "epoch": 0.40512,
+      "grad_norm": 0.34345245361328125,
+      "learning_rate": 0.0001801237113402062,
+      "loss": 0.8184,
+      "step": 633
+    },
+    {
+      "epoch": 0.40576,
+      "grad_norm": 0.2873213291168213,
+      "learning_rate": 0.00018008247422680412,
+      "loss": 0.7022,
+      "step": 634
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3166600465774536,
+      "learning_rate": 0.00018004123711340205,
+      "loss": 0.6679,
+      "step": 635
+    },
+    {
+      "epoch": 0.40704,
+      "grad_norm": 0.31453585624694824,
+      "learning_rate": 0.00018,
+      "loss": 0.6398,
+      "step": 636
+    },
+    {
+      "epoch": 0.40768,
+      "grad_norm": 0.332944393157959,
+      "learning_rate": 0.00017995876288659794,
+      "loss": 0.7118,
+      "step": 637
+    },
+    {
+      "epoch": 0.40832,
+      "grad_norm": 0.3085038959980011,
+      "learning_rate": 0.0001799175257731959,
+      "loss": 0.6242,
+      "step": 638
+    },
+    {
+      "epoch": 0.40896,
+      "grad_norm": 0.32425934076309204,
+      "learning_rate": 0.00017987628865979383,
+      "loss": 0.6546,
+      "step": 639
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.30966541171073914,
+      "learning_rate": 0.00017983505154639176,
+      "loss": 0.8029,
+      "step": 640
+    },
+    {
+      "epoch": 0.41024,
+      "grad_norm": 0.2859591543674469,
+      "learning_rate": 0.0001797938144329897,
+      "loss": 0.7304,
+      "step": 641
+    },
+    {
+      "epoch": 0.41088,
+      "grad_norm": 0.3151989281177521,
+      "learning_rate": 0.00017975257731958763,
+      "loss": 0.7851,
+      "step": 642
+    },
+    {
+      "epoch": 0.41152,
+      "grad_norm": 0.30177900195121765,
+      "learning_rate": 0.00017971134020618558,
+      "loss": 0.7174,
+      "step": 643
+    },
+    {
+      "epoch": 0.41216,
+      "grad_norm": 0.3529605567455292,
+      "learning_rate": 0.00017967010309278352,
+      "loss": 0.771,
+      "step": 644
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3392142653465271,
+      "learning_rate": 0.00017962886597938145,
+      "loss": 0.7245,
+      "step": 645
+    },
+    {
+      "epoch": 0.41344,
+      "grad_norm": 0.31237298250198364,
+      "learning_rate": 0.0001795876288659794,
+      "loss": 0.7565,
+      "step": 646
+    },
+    {
+      "epoch": 0.41408,
+      "grad_norm": 0.3477345407009125,
+      "learning_rate": 0.00017954639175257734,
+      "loss": 0.7599,
+      "step": 647
+    },
+    {
+      "epoch": 0.41472,
+      "grad_norm": 0.3220370411872864,
+      "learning_rate": 0.00017950515463917527,
+      "loss": 0.6809,
+      "step": 648
+    },
+    {
+      "epoch": 0.41536,
+      "grad_norm": 0.3059755265712738,
+      "learning_rate": 0.0001794639175257732,
+      "loss": 0.7662,
+      "step": 649
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3359750807285309,
+      "learning_rate": 0.00017942268041237113,
+      "loss": 0.7056,
+      "step": 650
+    },
+    {
+      "epoch": 0.41664,
+      "grad_norm": 0.3313213586807251,
+      "learning_rate": 0.0001793814432989691,
+      "loss": 0.7062,
+      "step": 651
+    },
+    {
+      "epoch": 0.41728,
+      "grad_norm": 0.3429223597049713,
+      "learning_rate": 0.00017934020618556702,
+      "loss": 0.7527,
+      "step": 652
+    },
+    {
+      "epoch": 0.41792,
+      "grad_norm": 0.31370845437049866,
+      "learning_rate": 0.00017929896907216495,
+      "loss": 0.7454,
+      "step": 653
+    },
+    {
+      "epoch": 0.41856,
+      "grad_norm": 0.3164546489715576,
+      "learning_rate": 0.0001792577319587629,
+      "loss": 0.7491,
+      "step": 654
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.29535427689552307,
+      "learning_rate": 0.00017921649484536081,
+      "loss": 0.7737,
+      "step": 655
+    },
+    {
+      "epoch": 0.41984,
+      "grad_norm": 0.3005925118923187,
+      "learning_rate": 0.00017917525773195877,
+      "loss": 0.6782,
+      "step": 656
+    },
+    {
+      "epoch": 0.42048,
+      "grad_norm": 0.32721176743507385,
+      "learning_rate": 0.0001791340206185567,
+      "loss": 0.7827,
+      "step": 657
+    },
+    {
+      "epoch": 0.42112,
+      "grad_norm": 0.32902729511260986,
+      "learning_rate": 0.00017909278350515463,
+      "loss": 0.6163,
+      "step": 658
+    },
+    {
+      "epoch": 0.42176,
+      "grad_norm": 0.2902410328388214,
+      "learning_rate": 0.0001790515463917526,
+      "loss": 0.6975,
+      "step": 659
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3021790087223053,
+      "learning_rate": 0.00017901030927835052,
+      "loss": 0.7393,
+      "step": 660
+    },
+    {
+      "epoch": 0.42304,
+      "grad_norm": 0.34339800477027893,
+      "learning_rate": 0.00017896907216494848,
+      "loss": 0.7284,
+      "step": 661
+    },
+    {
+      "epoch": 0.42368,
+      "grad_norm": 0.3379479944705963,
+      "learning_rate": 0.0001789278350515464,
+      "loss": 0.7154,
+      "step": 662
+    },
+    {
+      "epoch": 0.42432,
+      "grad_norm": 0.2908755838871002,
+      "learning_rate": 0.00017888659793814432,
+      "loss": 0.6994,
+      "step": 663
+    },
+    {
+      "epoch": 0.42496,
+      "grad_norm": 0.3394453525543213,
+      "learning_rate": 0.00017884536082474228,
+      "loss": 0.6988,
+      "step": 664
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.32524189352989197,
+      "learning_rate": 0.0001788041237113402,
+      "loss": 0.6895,
+      "step": 665
+    },
+    {
+      "epoch": 0.42624,
+      "grad_norm": 0.3299076557159424,
+      "learning_rate": 0.00017876288659793817,
+      "loss": 0.7442,
+      "step": 666
+    },
+    {
+      "epoch": 0.42688,
+      "grad_norm": 0.3226897418498993,
+      "learning_rate": 0.0001787216494845361,
+      "loss": 0.7352,
+      "step": 667
+    },
+    {
+      "epoch": 0.42752,
+      "grad_norm": 0.36559486389160156,
+      "learning_rate": 0.00017868041237113403,
+      "loss": 0.7647,
+      "step": 668
+    },
+    {
+      "epoch": 0.42816,
+      "grad_norm": 0.2977141737937927,
+      "learning_rate": 0.00017863917525773196,
+      "loss": 0.6734,
+      "step": 669
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3319728672504425,
+      "learning_rate": 0.0001785979381443299,
+      "loss": 0.7525,
+      "step": 670
+    },
+    {
+      "epoch": 0.42944,
+      "grad_norm": 0.3145429790019989,
+      "learning_rate": 0.00017855670103092785,
+      "loss": 0.7966,
+      "step": 671
+    },
+    {
+      "epoch": 0.43008,
+      "grad_norm": 0.35581204295158386,
+      "learning_rate": 0.00017851546391752578,
+      "loss": 0.7058,
+      "step": 672
+    },
+    {
+      "epoch": 0.43072,
+      "grad_norm": 0.3501969277858734,
+      "learning_rate": 0.0001784742268041237,
+      "loss": 0.7558,
+      "step": 673
+    },
+    {
+      "epoch": 0.43136,
+      "grad_norm": 0.3305762708187103,
+      "learning_rate": 0.00017843298969072167,
+      "loss": 0.6496,
+      "step": 674
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3152034282684326,
+      "learning_rate": 0.0001783917525773196,
+      "loss": 0.7064,
+      "step": 675
+    },
+    {
+      "epoch": 0.43264,
+      "grad_norm": 0.3159230947494507,
+      "learning_rate": 0.00017835051546391753,
+      "loss": 0.6622,
+      "step": 676
+    },
+    {
+      "epoch": 0.43328,
+      "grad_norm": 0.3200649917125702,
+      "learning_rate": 0.00017830927835051546,
+      "loss": 0.6585,
+      "step": 677
+    },
+    {
+      "epoch": 0.43392,
+      "grad_norm": 0.31407856941223145,
+      "learning_rate": 0.0001782680412371134,
+      "loss": 0.6623,
+      "step": 678
+    },
+    {
+      "epoch": 0.43456,
+      "grad_norm": 0.3082878887653351,
+      "learning_rate": 0.00017822680412371135,
+      "loss": 0.674,
+      "step": 679
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.2928681969642639,
+      "learning_rate": 0.00017818556701030929,
+      "loss": 0.7088,
+      "step": 680
+    },
+    {
+      "epoch": 0.43584,
+      "grad_norm": 0.32372942566871643,
+      "learning_rate": 0.00017814432989690724,
+      "loss": 0.7245,
+      "step": 681
+    },
+    {
+      "epoch": 0.43648,
+      "grad_norm": 0.3413868248462677,
+      "learning_rate": 0.00017810309278350518,
+      "loss": 0.714,
+      "step": 682
+    },
+    {
+      "epoch": 0.43712,
+      "grad_norm": 0.35775700211524963,
+      "learning_rate": 0.0001780618556701031,
+      "loss": 0.7298,
+      "step": 683
+    },
+    {
+      "epoch": 0.43776,
+      "grad_norm": 0.3211750090122223,
+      "learning_rate": 0.00017802061855670104,
+      "loss": 0.6826,
+      "step": 684
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3271782100200653,
+      "learning_rate": 0.00017797938144329897,
+      "loss": 0.7528,
+      "step": 685
+    },
+    {
+      "epoch": 0.43904,
+      "grad_norm": 0.29311588406562805,
+      "learning_rate": 0.0001779381443298969,
+      "loss": 0.7534,
+      "step": 686
+    },
+    {
+      "epoch": 0.43968,
+      "grad_norm": 0.31222617626190186,
+      "learning_rate": 0.00017789690721649486,
+      "loss": 0.6043,
+      "step": 687
+    },
+    {
+      "epoch": 0.44032,
+      "grad_norm": 0.3035969138145447,
+      "learning_rate": 0.0001778556701030928,
+      "loss": 0.6856,
+      "step": 688
+    },
+    {
+      "epoch": 0.44096,
+      "grad_norm": 0.30713456869125366,
+      "learning_rate": 0.00017781443298969075,
+      "loss": 0.7055,
+      "step": 689
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3052692711353302,
+      "learning_rate": 0.00017777319587628868,
+      "loss": 0.7253,
+      "step": 690
+    },
+    {
+      "epoch": 0.44224,
+      "grad_norm": 0.32757455110549927,
+      "learning_rate": 0.00017773195876288658,
+      "loss": 0.6783,
+      "step": 691
+    },
+    {
+      "epoch": 0.44288,
+      "grad_norm": 0.30219218134880066,
+      "learning_rate": 0.00017769072164948454,
+      "loss": 0.6895,
+      "step": 692
+    },
+    {
+      "epoch": 0.44352,
+      "grad_norm": 0.31325563788414,
+      "learning_rate": 0.00017764948453608247,
+      "loss": 0.815,
+      "step": 693
+    },
+    {
+      "epoch": 0.44416,
+      "grad_norm": 0.355072557926178,
+      "learning_rate": 0.00017760824742268043,
+      "loss": 0.7643,
+      "step": 694
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.298994243144989,
+      "learning_rate": 0.00017756701030927836,
+      "loss": 0.6461,
+      "step": 695
+    },
+    {
+      "epoch": 0.44544,
+      "grad_norm": 0.3194555640220642,
+      "learning_rate": 0.0001775257731958763,
+      "loss": 0.6773,
+      "step": 696
+    },
+    {
+      "epoch": 0.44608,
+      "grad_norm": 0.3051382899284363,
+      "learning_rate": 0.00017748453608247425,
+      "loss": 0.5972,
+      "step": 697
+    },
+    {
+      "epoch": 0.44672,
+      "grad_norm": 0.3300171494483948,
+      "learning_rate": 0.00017744329896907216,
+      "loss": 0.7206,
+      "step": 698
+    },
+    {
+      "epoch": 0.44736,
+      "grad_norm": 0.3081996738910675,
+      "learning_rate": 0.00017740206185567012,
+      "loss": 0.6901,
+      "step": 699
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.327961802482605,
+      "learning_rate": 0.00017736082474226805,
+      "loss": 0.6731,
+      "step": 700
+    },
+    {
+      "epoch": 0.44864,
+      "grad_norm": 0.29139766097068787,
+      "learning_rate": 0.00017731958762886598,
+      "loss": 0.633,
+      "step": 701
+    },
+    {
+      "epoch": 0.44928,
+      "grad_norm": 0.3187619149684906,
+      "learning_rate": 0.00017727835051546394,
+      "loss": 0.6589,
+      "step": 702
+    },
+    {
+      "epoch": 0.44992,
+      "grad_norm": 0.36440667510032654,
+      "learning_rate": 0.00017723711340206187,
+      "loss": 0.7919,
+      "step": 703
+    },
+    {
+      "epoch": 0.45056,
+      "grad_norm": 0.29476261138916016,
+      "learning_rate": 0.0001771958762886598,
+      "loss": 0.5712,
+      "step": 704
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3510898947715759,
+      "learning_rate": 0.00017715463917525773,
+      "loss": 0.8076,
+      "step": 705
+    },
+    {
+      "epoch": 0.45184,
+      "grad_norm": 0.312646746635437,
+      "learning_rate": 0.00017711340206185566,
+      "loss": 0.7149,
+      "step": 706
+    },
+    {
+      "epoch": 0.45248,
+      "grad_norm": 0.3762478530406952,
+      "learning_rate": 0.00017707216494845362,
+      "loss": 0.7185,
+      "step": 707
+    },
+    {
+      "epoch": 0.45312,
+      "grad_norm": 0.33013173937797546,
+      "learning_rate": 0.00017703092783505155,
+      "loss": 0.6954,
+      "step": 708
+    },
+    {
+      "epoch": 0.45376,
+      "grad_norm": 0.3213675618171692,
+      "learning_rate": 0.0001769896907216495,
+      "loss": 0.6854,
+      "step": 709
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.2897025942802429,
+      "learning_rate": 0.00017694845360824744,
+      "loss": 0.6802,
+      "step": 710
+    },
+    {
+      "epoch": 0.45504,
+      "grad_norm": 0.33537644147872925,
+      "learning_rate": 0.00017690721649484537,
+      "loss": 0.6631,
+      "step": 711
+    },
+    {
+      "epoch": 0.45568,
+      "grad_norm": 0.34509536623954773,
+      "learning_rate": 0.0001768659793814433,
+      "loss": 0.6515,
+      "step": 712
+    },
+    {
+      "epoch": 0.45632,
+      "grad_norm": 0.3233640193939209,
+      "learning_rate": 0.00017682474226804123,
+      "loss": 0.7443,
+      "step": 713
+    },
+    {
+      "epoch": 0.45696,
+      "grad_norm": 0.312272310256958,
+      "learning_rate": 0.00017678350515463917,
+      "loss": 0.676,
+      "step": 714
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3364972472190857,
+      "learning_rate": 0.00017674226804123712,
+      "loss": 0.6519,
+      "step": 715
+    },
+    {
+      "epoch": 0.45824,
+      "grad_norm": 0.31538116931915283,
+      "learning_rate": 0.00017670103092783506,
+      "loss": 0.7844,
+      "step": 716
+    },
+    {
+      "epoch": 0.45888,
+      "grad_norm": 0.32454052567481995,
+      "learning_rate": 0.00017665979381443301,
+      "loss": 0.7009,
+      "step": 717
+    },
+    {
+      "epoch": 0.45952,
+      "grad_norm": 0.3240455687046051,
+      "learning_rate": 0.00017661855670103095,
+      "loss": 0.7845,
+      "step": 718
+    },
+    {
+      "epoch": 0.46016,
+      "grad_norm": 0.32182687520980835,
+      "learning_rate": 0.00017657731958762888,
+      "loss": 0.8407,
+      "step": 719
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.33177661895751953,
+      "learning_rate": 0.0001765360824742268,
+      "loss": 0.702,
+      "step": 720
+    },
+    {
+      "epoch": 0.46144,
+      "grad_norm": 0.32734885811805725,
+      "learning_rate": 0.00017649484536082474,
+      "loss": 0.801,
+      "step": 721
+    },
+    {
+      "epoch": 0.46208,
+      "grad_norm": 0.33752530813217163,
+      "learning_rate": 0.0001764536082474227,
+      "loss": 0.6907,
+      "step": 722
+    },
+    {
+      "epoch": 0.46272,
+      "grad_norm": 0.32092300057411194,
+      "learning_rate": 0.00017641237113402063,
+      "loss": 0.6323,
+      "step": 723
+    },
+    {
+      "epoch": 0.46336,
+      "grad_norm": 0.3301701247692108,
+      "learning_rate": 0.00017637113402061856,
+      "loss": 0.7884,
+      "step": 724
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.33426719903945923,
+      "learning_rate": 0.00017632989690721652,
+      "loss": 0.6985,
+      "step": 725
+    },
+    {
+      "epoch": 0.46464,
+      "grad_norm": 0.30391931533813477,
+      "learning_rate": 0.00017628865979381445,
+      "loss": 0.624,
+      "step": 726
+    },
+    {
+      "epoch": 0.46528,
+      "grad_norm": 0.3372322916984558,
+      "learning_rate": 0.00017624742268041238,
+      "loss": 0.7026,
+      "step": 727
+    },
+    {
+      "epoch": 0.46592,
+      "grad_norm": 0.3417535722255707,
+      "learning_rate": 0.0001762061855670103,
+      "loss": 0.6905,
+      "step": 728
+    },
+    {
+      "epoch": 0.46656,
+      "grad_norm": 0.3263862133026123,
+      "learning_rate": 0.00017616494845360824,
+      "loss": 0.7487,
+      "step": 729
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.31210842728614807,
+      "learning_rate": 0.0001761237113402062,
+      "loss": 0.7205,
+      "step": 730
+    },
+    {
+      "epoch": 0.46784,
+      "grad_norm": 0.31495293974876404,
+      "learning_rate": 0.00017608247422680413,
+      "loss": 0.6847,
+      "step": 731
+    },
+    {
+      "epoch": 0.46848,
+      "grad_norm": 0.31665417551994324,
+      "learning_rate": 0.0001760412371134021,
+      "loss": 0.7541,
+      "step": 732
+    },
+    {
+      "epoch": 0.46912,
+      "grad_norm": 0.34036847949028015,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.6935,
+      "step": 733
+    },
+    {
+      "epoch": 0.46976,
+      "grad_norm": 0.3252568244934082,
+      "learning_rate": 0.00017595876288659793,
+      "loss": 0.761,
+      "step": 734
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.31876492500305176,
+      "learning_rate": 0.00017591752577319589,
+      "loss": 0.7228,
+      "step": 735
+    },
+    {
+      "epoch": 0.47104,
+      "grad_norm": 0.31521862745285034,
+      "learning_rate": 0.00017587628865979382,
+      "loss": 0.7177,
+      "step": 736
+    },
+    {
+      "epoch": 0.47168,
+      "grad_norm": 0.3257991373538971,
+      "learning_rate": 0.00017583505154639177,
+      "loss": 0.6461,
+      "step": 737
+    },
+    {
+      "epoch": 0.47232,
+      "grad_norm": 0.3280256986618042,
+      "learning_rate": 0.0001757938144329897,
+      "loss": 0.7308,
+      "step": 738
+    },
+    {
+      "epoch": 0.47296,
+      "grad_norm": 0.32144924998283386,
+      "learning_rate": 0.00017575257731958764,
+      "loss": 0.736,
+      "step": 739
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.32443079352378845,
+      "learning_rate": 0.00017571134020618557,
+      "loss": 0.789,
+      "step": 740
+    },
+    {
+      "epoch": 0.47424,
+      "grad_norm": 0.32073211669921875,
+      "learning_rate": 0.0001756701030927835,
+      "loss": 0.8098,
+      "step": 741
+    },
+    {
+      "epoch": 0.47488,
+      "grad_norm": 0.3057394325733185,
+      "learning_rate": 0.00017562886597938146,
+      "loss": 0.6704,
+      "step": 742
+    },
+    {
+      "epoch": 0.47552,
+      "grad_norm": 0.3523183763027191,
+      "learning_rate": 0.0001755876288659794,
+      "loss": 0.8224,
+      "step": 743
+    },
+    {
+      "epoch": 0.47616,
+      "grad_norm": 0.29038292169570923,
+      "learning_rate": 0.00017554639175257732,
+      "loss": 0.6499,
+      "step": 744
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.303993284702301,
+      "learning_rate": 0.00017550515463917528,
+      "loss": 0.6755,
+      "step": 745
+    },
+    {
+      "epoch": 0.47744,
+      "grad_norm": 0.30337685346603394,
+      "learning_rate": 0.0001754639175257732,
+      "loss": 0.6107,
+      "step": 746
+    },
+    {
+      "epoch": 0.47808,
+      "grad_norm": 0.3224083185195923,
+      "learning_rate": 0.00017542268041237114,
+      "loss": 0.7048,
+      "step": 747
+    },
+    {
+      "epoch": 0.47872,
+      "grad_norm": 0.31377819180488586,
+      "learning_rate": 0.00017538144329896907,
+      "loss": 0.64,
+      "step": 748
+    },
+    {
+      "epoch": 0.47936,
+      "grad_norm": 0.32864290475845337,
+      "learning_rate": 0.000175340206185567,
+      "loss": 0.6489,
+      "step": 749
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.311472624540329,
+      "learning_rate": 0.00017529896907216496,
+      "loss": 0.7515,
+      "step": 750
+    },
+    {
+      "epoch": 0.48064,
+      "grad_norm": 0.3309240937232971,
+      "learning_rate": 0.0001752577319587629,
+      "loss": 0.6424,
+      "step": 751
+    },
+    {
+      "epoch": 0.48128,
+      "grad_norm": 0.36461833119392395,
+      "learning_rate": 0.00017521649484536083,
+      "loss": 0.7726,
+      "step": 752
+    },
+    {
+      "epoch": 0.48192,
+      "grad_norm": 0.3393152058124542,
+      "learning_rate": 0.00017517525773195878,
+      "loss": 0.7695,
+      "step": 753
+    },
+    {
+      "epoch": 0.48256,
+      "grad_norm": 0.3230816721916199,
+      "learning_rate": 0.00017513402061855671,
+      "loss": 0.7402,
+      "step": 754
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.35010290145874023,
+      "learning_rate": 0.00017509278350515465,
+      "loss": 0.6834,
+      "step": 755
+    },
+    {
+      "epoch": 0.48384,
+      "grad_norm": 0.3625297248363495,
+      "learning_rate": 0.00017505154639175258,
+      "loss": 0.7338,
+      "step": 756
+    },
+    {
+      "epoch": 0.48448,
+      "grad_norm": 0.32331860065460205,
+      "learning_rate": 0.0001750103092783505,
+      "loss": 0.7245,
+      "step": 757
+    },
+    {
+      "epoch": 0.48512,
+      "grad_norm": 0.3649183213710785,
+      "learning_rate": 0.00017496907216494847,
+      "loss": 0.7,
+      "step": 758
+    },
+    {
+      "epoch": 0.48576,
+      "grad_norm": 0.3333425223827362,
+      "learning_rate": 0.0001749278350515464,
+      "loss": 0.6109,
+      "step": 759
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3211543560028076,
+      "learning_rate": 0.00017488659793814436,
+      "loss": 0.6099,
+      "step": 760
+    },
+    {
+      "epoch": 0.48704,
+      "grad_norm": 0.33552712202072144,
+      "learning_rate": 0.0001748453608247423,
+      "loss": 0.7296,
+      "step": 761
+    },
+    {
+      "epoch": 0.48768,
+      "grad_norm": 0.3309902548789978,
+      "learning_rate": 0.00017480412371134022,
+      "loss": 0.8381,
+      "step": 762
+    },
+    {
+      "epoch": 0.48832,
+      "grad_norm": 0.3188836872577667,
+      "learning_rate": 0.00017476288659793815,
+      "loss": 0.7366,
+      "step": 763
+    },
+    {
+      "epoch": 0.48896,
+      "grad_norm": 0.3274898827075958,
+      "learning_rate": 0.00017472164948453608,
+      "loss": 0.6387,
+      "step": 764
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3125731348991394,
+      "learning_rate": 0.00017468041237113404,
+      "loss": 0.6921,
+      "step": 765
+    },
+    {
+      "epoch": 0.49024,
+      "grad_norm": 0.3419846296310425,
+      "learning_rate": 0.00017463917525773197,
+      "loss": 0.7288,
+      "step": 766
+    },
+    {
+      "epoch": 0.49088,
+      "grad_norm": 0.3256465494632721,
+      "learning_rate": 0.0001745979381443299,
+      "loss": 0.7227,
+      "step": 767
+    },
+    {
+      "epoch": 0.49152,
+      "grad_norm": 0.33060264587402344,
+      "learning_rate": 0.00017455670103092786,
+      "loss": 0.7706,
+      "step": 768
+    },
+    {
+      "epoch": 0.49216,
+      "grad_norm": 0.31276652216911316,
+      "learning_rate": 0.0001745154639175258,
+      "loss": 0.781,
+      "step": 769
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.33390018343925476,
+      "learning_rate": 0.00017447422680412372,
+      "loss": 0.6634,
+      "step": 770
+    },
+    {
+      "epoch": 0.49344,
+      "grad_norm": 0.31565195322036743,
+      "learning_rate": 0.00017443298969072165,
+      "loss": 0.6423,
+      "step": 771
+    },
+    {
+      "epoch": 0.49408,
+      "grad_norm": 0.3246792256832123,
+      "learning_rate": 0.00017439175257731959,
+      "loss": 0.712,
+      "step": 772
+    },
+    {
+      "epoch": 0.49472,
+      "grad_norm": 0.325123131275177,
+      "learning_rate": 0.00017435051546391754,
+      "loss": 0.7351,
+      "step": 773
+    },
+    {
+      "epoch": 0.49536,
+      "grad_norm": 0.34222570061683655,
+      "learning_rate": 0.00017430927835051548,
+      "loss": 0.7501,
+      "step": 774
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3257981240749359,
+      "learning_rate": 0.0001742680412371134,
+      "loss": 0.6632,
+      "step": 775
+    },
+    {
+      "epoch": 0.49664,
+      "grad_norm": 0.3276258409023285,
+      "learning_rate": 0.00017422680412371134,
+      "loss": 0.6802,
+      "step": 776
+    },
+    {
+      "epoch": 0.49728,
+      "grad_norm": 0.31268545985221863,
+      "learning_rate": 0.00017418556701030927,
+      "loss": 0.6272,
+      "step": 777
+    },
+    {
+      "epoch": 0.49792,
+      "grad_norm": 0.3343692719936371,
+      "learning_rate": 0.00017414432989690723,
+      "loss": 0.7872,
+      "step": 778
+    },
+    {
+      "epoch": 0.49856,
+      "grad_norm": 0.31009477376937866,
+      "learning_rate": 0.00017410309278350516,
+      "loss": 0.7548,
+      "step": 779
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.33793899416923523,
+      "learning_rate": 0.0001740618556701031,
+      "loss": 0.7811,
+      "step": 780
+    },
+    {
+      "epoch": 0.49984,
+      "grad_norm": 0.3331272304058075,
+      "learning_rate": 0.00017402061855670105,
+      "loss": 0.6547,
+      "step": 781
+    },
+    {
+      "epoch": 0.50048,
+      "grad_norm": 0.3375319838523865,
+      "learning_rate": 0.00017397938144329898,
+      "loss": 0.6276,
+      "step": 782
+    },
+    {
+      "epoch": 0.50112,
+      "grad_norm": 0.33825623989105225,
+      "learning_rate": 0.0001739381443298969,
+      "loss": 0.6304,
+      "step": 783
+    },
+    {
+      "epoch": 0.50176,
+      "grad_norm": 0.35491976141929626,
+      "learning_rate": 0.00017389690721649484,
+      "loss": 0.8598,
+      "step": 784
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.31369227170944214,
+      "learning_rate": 0.00017385567010309277,
+      "loss": 0.6692,
+      "step": 785
+    },
+    {
+      "epoch": 0.50304,
+      "grad_norm": 0.3175995349884033,
+      "learning_rate": 0.00017381443298969073,
+      "loss": 0.6631,
+      "step": 786
+    },
+    {
+      "epoch": 0.50368,
+      "grad_norm": 0.33434727787971497,
+      "learning_rate": 0.00017377319587628866,
+      "loss": 0.786,
+      "step": 787
+    },
+    {
+      "epoch": 0.50432,
+      "grad_norm": 0.34534233808517456,
+      "learning_rate": 0.00017373195876288662,
+      "loss": 0.7397,
+      "step": 788
+    },
+    {
+      "epoch": 0.50496,
+      "grad_norm": 0.33616194128990173,
+      "learning_rate": 0.00017369072164948455,
+      "loss": 0.7276,
+      "step": 789
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.33229860663414,
+      "learning_rate": 0.00017364948453608248,
+      "loss": 0.7246,
+      "step": 790
+    },
+    {
+      "epoch": 0.50624,
+      "grad_norm": 0.31816366314888,
+      "learning_rate": 0.00017360824742268042,
+      "loss": 0.7098,
+      "step": 791
+    },
+    {
+      "epoch": 0.50688,
+      "grad_norm": 0.3122235834598541,
+      "learning_rate": 0.00017356701030927835,
+      "loss": 0.6253,
+      "step": 792
+    },
+    {
+      "epoch": 0.50752,
+      "grad_norm": 0.3092046082019806,
+      "learning_rate": 0.0001735257731958763,
+      "loss": 0.7733,
+      "step": 793
+    },
+    {
+      "epoch": 0.50816,
+      "grad_norm": 0.33151775598526,
+      "learning_rate": 0.00017348453608247424,
+      "loss": 0.7207,
+      "step": 794
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.32141223549842834,
+      "learning_rate": 0.00017344329896907217,
+      "loss": 0.6723,
+      "step": 795
+    },
+    {
+      "epoch": 0.50944,
+      "grad_norm": 0.3060906231403351,
+      "learning_rate": 0.00017340206185567013,
+      "loss": 0.6978,
+      "step": 796
+    },
+    {
+      "epoch": 0.51008,
+      "grad_norm": 0.2968650460243225,
+      "learning_rate": 0.00017336082474226806,
+      "loss": 0.6816,
+      "step": 797
+    },
+    {
+      "epoch": 0.51072,
+      "grad_norm": 0.33153387904167175,
+      "learning_rate": 0.000173319587628866,
+      "loss": 0.7527,
+      "step": 798
+    },
+    {
+      "epoch": 0.51136,
+      "grad_norm": 0.3402288258075714,
+      "learning_rate": 0.00017327835051546392,
+      "loss": 0.7537,
+      "step": 799
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.33059123158454895,
+      "learning_rate": 0.00017323711340206185,
+      "loss": 0.665,
+      "step": 800
+    },
+    {
+      "epoch": 0.51264,
+      "grad_norm": 0.3006879389286041,
+      "learning_rate": 0.0001731958762886598,
+      "loss": 0.6893,
+      "step": 801
+    },
+    {
+      "epoch": 0.51328,
+      "grad_norm": 0.3455508053302765,
+      "learning_rate": 0.00017315463917525774,
+      "loss": 0.6659,
+      "step": 802
+    },
+    {
+      "epoch": 0.51392,
+      "grad_norm": 0.34279516339302063,
+      "learning_rate": 0.00017311340206185567,
+      "loss": 0.6435,
+      "step": 803
+    },
+    {
+      "epoch": 0.51456,
+      "grad_norm": 0.33856919407844543,
+      "learning_rate": 0.00017307216494845363,
+      "loss": 0.6379,
+      "step": 804
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.34094688296318054,
+      "learning_rate": 0.00017303092783505154,
+      "loss": 0.8024,
+      "step": 805
+    },
+    {
+      "epoch": 0.51584,
+      "grad_norm": 0.3296661078929901,
+      "learning_rate": 0.0001729896907216495,
+      "loss": 0.6338,
+      "step": 806
+    },
+    {
+      "epoch": 0.51648,
+      "grad_norm": 0.29954472184181213,
+      "learning_rate": 0.00017294845360824742,
+      "loss": 0.8055,
+      "step": 807
+    },
+    {
+      "epoch": 0.51712,
+      "grad_norm": 0.2994474470615387,
+      "learning_rate": 0.00017290721649484536,
+      "loss": 0.7469,
+      "step": 808
+    },
+    {
+      "epoch": 0.51776,
+      "grad_norm": 0.28242233395576477,
+      "learning_rate": 0.00017286597938144331,
+      "loss": 0.6375,
+      "step": 809
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.32285964488983154,
+      "learning_rate": 0.00017282474226804125,
+      "loss": 0.7156,
+      "step": 810
+    },
+    {
+      "epoch": 0.51904,
+      "grad_norm": 0.3198898434638977,
+      "learning_rate": 0.0001727835051546392,
+      "loss": 0.7194,
+      "step": 811
+    },
+    {
+      "epoch": 0.51968,
+      "grad_norm": 0.34587299823760986,
+      "learning_rate": 0.0001727422680412371,
+      "loss": 0.7395,
+      "step": 812
+    },
+    {
+      "epoch": 0.52032,
+      "grad_norm": 0.3137355148792267,
+      "learning_rate": 0.00017270103092783504,
+      "loss": 0.6716,
+      "step": 813
+    },
+    {
+      "epoch": 0.52096,
+      "grad_norm": 0.34323760867118835,
+      "learning_rate": 0.000172659793814433,
+      "loss": 0.7162,
+      "step": 814
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.30069801211357117,
+      "learning_rate": 0.00017261855670103093,
+      "loss": 0.6447,
+      "step": 815
+    },
+    {
+      "epoch": 0.52224,
+      "grad_norm": 0.3398602604866028,
+      "learning_rate": 0.0001725773195876289,
+      "loss": 0.6681,
+      "step": 816
+    },
+    {
+      "epoch": 0.52288,
+      "grad_norm": 0.3466479480266571,
+      "learning_rate": 0.00017253608247422682,
+      "loss": 0.7147,
+      "step": 817
+    },
+    {
+      "epoch": 0.52352,
+      "grad_norm": 0.3227523863315582,
+      "learning_rate": 0.00017249484536082475,
+      "loss": 0.791,
+      "step": 818
+    },
+    {
+      "epoch": 0.52416,
+      "grad_norm": 0.32204312086105347,
+      "learning_rate": 0.00017245360824742268,
+      "loss": 0.6393,
+      "step": 819
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3570602834224701,
+      "learning_rate": 0.0001724123711340206,
+      "loss": 0.7549,
+      "step": 820
+    },
+    {
+      "epoch": 0.52544,
+      "grad_norm": 0.3069867789745331,
+      "learning_rate": 0.00017237113402061857,
+      "loss": 0.7725,
+      "step": 821
+    },
+    {
+      "epoch": 0.52608,
+      "grad_norm": 0.3699318468570709,
+      "learning_rate": 0.0001723298969072165,
+      "loss": 0.7802,
+      "step": 822
+    },
+    {
+      "epoch": 0.52672,
+      "grad_norm": 0.32556188106536865,
+      "learning_rate": 0.00017228865979381443,
+      "loss": 0.7522,
+      "step": 823
+    },
+    {
+      "epoch": 0.52736,
+      "grad_norm": 0.33946576714515686,
+      "learning_rate": 0.0001722474226804124,
+      "loss": 0.728,
+      "step": 824
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3280791640281677,
+      "learning_rate": 0.00017220618556701032,
+      "loss": 0.6937,
+      "step": 825
+    },
+    {
+      "epoch": 0.52864,
+      "grad_norm": 0.33134588599205017,
+      "learning_rate": 0.00017216494845360825,
+      "loss": 0.823,
+      "step": 826
+    },
+    {
+      "epoch": 0.52928,
+      "grad_norm": 0.3262774348258972,
+      "learning_rate": 0.00017212371134020619,
+      "loss": 0.8363,
+      "step": 827
+    },
+    {
+      "epoch": 0.52992,
+      "grad_norm": 0.3134404718875885,
+      "learning_rate": 0.00017208247422680412,
+      "loss": 0.7497,
+      "step": 828
+    },
+    {
+      "epoch": 0.53056,
+      "grad_norm": 0.3240647614002228,
+      "learning_rate": 0.00017204123711340208,
+      "loss": 0.7388,
+      "step": 829
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.35194048285484314,
+      "learning_rate": 0.000172,
+      "loss": 0.6824,
+      "step": 830
+    },
+    {
+      "epoch": 0.53184,
+      "grad_norm": 0.3119235634803772,
+      "learning_rate": 0.00017195876288659796,
+      "loss": 0.6029,
+      "step": 831
+    },
+    {
+      "epoch": 0.53248,
+      "grad_norm": 0.3406645655632019,
+      "learning_rate": 0.0001719175257731959,
+      "loss": 0.6807,
+      "step": 832
+    },
+    {
+      "epoch": 0.53312,
+      "grad_norm": 0.3759150803089142,
+      "learning_rate": 0.00017187628865979383,
+      "loss": 0.7416,
+      "step": 833
+    },
+    {
+      "epoch": 0.53376,
+      "grad_norm": 0.3304033875465393,
+      "learning_rate": 0.00017183505154639176,
+      "loss": 0.7561,
+      "step": 834
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3753022253513336,
+      "learning_rate": 0.0001717938144329897,
+      "loss": 0.7369,
+      "step": 835
+    },
+    {
+      "epoch": 0.53504,
+      "grad_norm": 0.35440653562545776,
+      "learning_rate": 0.00017175257731958762,
+      "loss": 0.7129,
+      "step": 836
+    },
+    {
+      "epoch": 0.53568,
+      "grad_norm": 0.3174979090690613,
+      "learning_rate": 0.00017171134020618558,
+      "loss": 0.6698,
+      "step": 837
+    },
+    {
+      "epoch": 0.53632,
+      "grad_norm": 0.315489262342453,
+      "learning_rate": 0.0001716701030927835,
+      "loss": 0.6015,
+      "step": 838
+    },
+    {
+      "epoch": 0.53696,
+      "grad_norm": 0.3052067756652832,
+      "learning_rate": 0.00017162886597938147,
+      "loss": 0.6733,
+      "step": 839
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3351444900035858,
+      "learning_rate": 0.0001715876288659794,
+      "loss": 0.7242,
+      "step": 840
+    },
+    {
+      "epoch": 0.53824,
+      "grad_norm": 0.3258063495159149,
+      "learning_rate": 0.0001715463917525773,
+      "loss": 0.6307,
+      "step": 841
+    },
+    {
+      "epoch": 0.53888,
+      "grad_norm": 0.3013196885585785,
+      "learning_rate": 0.00017150515463917526,
+      "loss": 0.6586,
+      "step": 842
+    },
+    {
+      "epoch": 0.53952,
+      "grad_norm": 0.3370300233364105,
+      "learning_rate": 0.0001714639175257732,
+      "loss": 0.6601,
+      "step": 843
+    },
+    {
+      "epoch": 0.54016,
+      "grad_norm": 0.3310944139957428,
+      "learning_rate": 0.00017142268041237115,
+      "loss": 0.6789,
+      "step": 844
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.351567804813385,
+      "learning_rate": 0.00017138144329896908,
+      "loss": 0.6548,
+      "step": 845
+    },
+    {
+      "epoch": 0.54144,
+      "grad_norm": 0.3249686658382416,
+      "learning_rate": 0.00017134020618556702,
+      "loss": 0.6895,
+      "step": 846
+    },
+    {
+      "epoch": 0.54208,
+      "grad_norm": 0.3230130970478058,
+      "learning_rate": 0.00017129896907216497,
+      "loss": 0.7679,
+      "step": 847
+    },
+    {
+      "epoch": 0.54272,
+      "grad_norm": 0.304153174161911,
+      "learning_rate": 0.00017125773195876288,
+      "loss": 0.6788,
+      "step": 848
+    },
+    {
+      "epoch": 0.54336,
+      "grad_norm": 0.3333081901073456,
+      "learning_rate": 0.00017121649484536084,
+      "loss": 0.7447,
+      "step": 849
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.37612634897232056,
+      "learning_rate": 0.00017117525773195877,
+      "loss": 0.8132,
+      "step": 850
+    },
+    {
+      "epoch": 0.54464,
+      "grad_norm": 0.32075515389442444,
+      "learning_rate": 0.0001711340206185567,
+      "loss": 0.7459,
+      "step": 851
+    },
+    {
+      "epoch": 0.54528,
+      "grad_norm": 0.30256494879722595,
+      "learning_rate": 0.00017109278350515466,
+      "loss": 0.7734,
+      "step": 852
+    },
+    {
+      "epoch": 0.54592,
+      "grad_norm": 0.30430516600608826,
+      "learning_rate": 0.0001710515463917526,
+      "loss": 0.7279,
+      "step": 853
+    },
+    {
+      "epoch": 0.54656,
+      "grad_norm": 0.3340049088001251,
+      "learning_rate": 0.00017101030927835055,
+      "loss": 0.6239,
+      "step": 854
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.31556272506713867,
+      "learning_rate": 0.00017096907216494845,
+      "loss": 0.713,
+      "step": 855
+    },
+    {
+      "epoch": 0.54784,
+      "grad_norm": 0.3813299834728241,
+      "learning_rate": 0.00017092783505154638,
+      "loss": 0.7289,
+      "step": 856
+    },
+    {
+      "epoch": 0.54848,
+      "grad_norm": 0.3160710036754608,
+      "learning_rate": 0.00017088659793814434,
+      "loss": 0.6654,
+      "step": 857
+    },
+    {
+      "epoch": 0.54912,
+      "grad_norm": 0.3170619010925293,
+      "learning_rate": 0.00017084536082474227,
+      "loss": 0.7211,
+      "step": 858
+    },
+    {
+      "epoch": 0.54976,
+      "grad_norm": 0.3428855836391449,
+      "learning_rate": 0.00017080412371134023,
+      "loss": 0.7043,
+      "step": 859
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.31656891107559204,
+      "learning_rate": 0.00017076288659793816,
+      "loss": 0.8287,
+      "step": 860
+    },
+    {
+      "epoch": 0.55104,
+      "grad_norm": 0.30554285645484924,
+      "learning_rate": 0.0001707216494845361,
+      "loss": 0.7167,
+      "step": 861
+    },
+    {
+      "epoch": 0.55168,
+      "grad_norm": 0.32109346985816956,
+      "learning_rate": 0.00017068041237113402,
+      "loss": 0.7657,
+      "step": 862
+    },
+    {
+      "epoch": 0.55232,
+      "grad_norm": 0.35519644618034363,
+      "learning_rate": 0.00017063917525773196,
+      "loss": 0.6572,
+      "step": 863
+    },
+    {
+      "epoch": 0.55296,
+      "grad_norm": 0.3689737021923065,
+      "learning_rate": 0.0001705979381443299,
+      "loss": 0.6825,
+      "step": 864
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.31525516510009766,
+      "learning_rate": 0.00017055670103092785,
+      "loss": 0.7143,
+      "step": 865
+    },
+    {
+      "epoch": 0.55424,
+      "grad_norm": 0.34069299697875977,
+      "learning_rate": 0.00017051546391752578,
+      "loss": 0.7288,
+      "step": 866
+    },
+    {
+      "epoch": 0.55488,
+      "grad_norm": 0.34350720047950745,
+      "learning_rate": 0.00017047422680412373,
+      "loss": 0.6908,
+      "step": 867
+    },
+    {
+      "epoch": 0.55552,
+      "grad_norm": 0.3231012225151062,
+      "learning_rate": 0.00017043298969072167,
+      "loss": 0.7122,
+      "step": 868
+    },
+    {
+      "epoch": 0.55616,
+      "grad_norm": 0.3389771580696106,
+      "learning_rate": 0.0001703917525773196,
+      "loss": 0.8773,
+      "step": 869
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3275061547756195,
+      "learning_rate": 0.00017035051546391753,
+      "loss": 0.706,
+      "step": 870
+    },
+    {
+      "epoch": 0.55744,
+      "grad_norm": 0.3371143043041229,
+      "learning_rate": 0.00017030927835051546,
+      "loss": 0.7035,
+      "step": 871
+    },
+    {
+      "epoch": 0.55808,
+      "grad_norm": 0.2950092554092407,
+      "learning_rate": 0.00017026804123711342,
+      "loss": 0.6031,
+      "step": 872
+    },
+    {
+      "epoch": 0.55872,
+      "grad_norm": 0.31304749846458435,
+      "learning_rate": 0.00017022680412371135,
+      "loss": 0.599,
+      "step": 873
+    },
+    {
+      "epoch": 0.55936,
+      "grad_norm": 0.33457091450691223,
+      "learning_rate": 0.00017018556701030928,
+      "loss": 0.6985,
+      "step": 874
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3452294170856476,
+      "learning_rate": 0.00017014432989690724,
+      "loss": 0.8691,
+      "step": 875
+    },
+    {
+      "epoch": 0.56064,
+      "grad_norm": 0.3514443635940552,
+      "learning_rate": 0.00017010309278350517,
+      "loss": 0.7062,
+      "step": 876
+    },
+    {
+      "epoch": 0.56128,
+      "grad_norm": 0.3137763738632202,
+      "learning_rate": 0.0001700618556701031,
+      "loss": 0.6624,
+      "step": 877
+    },
+    {
+      "epoch": 0.56192,
+      "grad_norm": 0.29632478952407837,
+      "learning_rate": 0.00017002061855670103,
+      "loss": 0.6423,
+      "step": 878
+    },
+    {
+      "epoch": 0.56256,
+      "grad_norm": 0.34383177757263184,
+      "learning_rate": 0.00016997938144329896,
+      "loss": 0.6719,
+      "step": 879
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3423959016799927,
+      "learning_rate": 0.00016993814432989692,
+      "loss": 0.7162,
+      "step": 880
+    },
+    {
+      "epoch": 0.56384,
+      "grad_norm": 0.3326077461242676,
+      "learning_rate": 0.00016989690721649485,
+      "loss": 0.7064,
+      "step": 881
+    },
+    {
+      "epoch": 0.56448,
+      "grad_norm": 0.3344801068305969,
+      "learning_rate": 0.0001698556701030928,
+      "loss": 0.6514,
+      "step": 882
+    },
+    {
+      "epoch": 0.56512,
+      "grad_norm": 0.31385329365730286,
+      "learning_rate": 0.00016981443298969074,
+      "loss": 0.5551,
+      "step": 883
+    },
+    {
+      "epoch": 0.56576,
+      "grad_norm": 0.3367070257663727,
+      "learning_rate": 0.00016977319587628865,
+      "loss": 0.6973,
+      "step": 884
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.307830274105072,
+      "learning_rate": 0.0001697319587628866,
+      "loss": 0.7259,
+      "step": 885
+    },
+    {
+      "epoch": 0.56704,
+      "grad_norm": 0.34202420711517334,
+      "learning_rate": 0.00016969072164948454,
+      "loss": 0.6056,
+      "step": 886
+    },
+    {
+      "epoch": 0.56768,
+      "grad_norm": 0.3216591477394104,
+      "learning_rate": 0.0001696494845360825,
+      "loss": 0.7366,
+      "step": 887
+    },
+    {
+      "epoch": 0.56832,
+      "grad_norm": 0.3417046070098877,
+      "learning_rate": 0.00016960824742268043,
+      "loss": 0.7227,
+      "step": 888
+    },
+    {
+      "epoch": 0.56896,
+      "grad_norm": 0.33095914125442505,
+      "learning_rate": 0.00016956701030927836,
+      "loss": 0.808,
+      "step": 889
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3283424377441406,
+      "learning_rate": 0.0001695257731958763,
+      "loss": 0.7093,
+      "step": 890
+    },
+    {
+      "epoch": 0.57024,
+      "grad_norm": 0.3133627772331238,
+      "learning_rate": 0.00016948453608247422,
+      "loss": 0.636,
+      "step": 891
+    },
+    {
+      "epoch": 0.57088,
+      "grad_norm": 0.33971133828163147,
+      "learning_rate": 0.00016944329896907218,
+      "loss": 0.6846,
+      "step": 892
+    },
+    {
+      "epoch": 0.57152,
+      "grad_norm": 0.345731258392334,
+      "learning_rate": 0.0001694020618556701,
+      "loss": 0.7428,
+      "step": 893
+    },
+    {
+      "epoch": 0.57216,
+      "grad_norm": 0.3539867699146271,
+      "learning_rate": 0.00016936082474226804,
+      "loss": 0.6212,
+      "step": 894
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3494645655155182,
+      "learning_rate": 0.000169319587628866,
+      "loss": 0.6784,
+      "step": 895
+    },
+    {
+      "epoch": 0.57344,
+      "grad_norm": 0.3267348110675812,
+      "learning_rate": 0.00016927835051546393,
+      "loss": 0.6761,
+      "step": 896
+    },
+    {
+      "epoch": 0.57408,
+      "grad_norm": 0.3245634138584137,
+      "learning_rate": 0.00016923711340206186,
+      "loss": 0.7772,
+      "step": 897
+    },
+    {
+      "epoch": 0.57472,
+      "grad_norm": 0.35841062664985657,
+      "learning_rate": 0.0001691958762886598,
+      "loss": 0.7379,
+      "step": 898
+    },
+    {
+      "epoch": 0.57536,
+      "grad_norm": 0.31480541825294495,
+      "learning_rate": 0.00016915463917525773,
+      "loss": 0.7993,
+      "step": 899
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.31690216064453125,
+      "learning_rate": 0.00016911340206185568,
+      "loss": 0.7679,
+      "step": 900
+    },
+    {
+      "epoch": 0.57664,
+      "grad_norm": 0.3120574951171875,
+      "learning_rate": 0.00016907216494845361,
+      "loss": 0.741,
+      "step": 901
+    },
+    {
+      "epoch": 0.57728,
+      "grad_norm": 0.30411088466644287,
+      "learning_rate": 0.00016903092783505155,
+      "loss": 0.7283,
+      "step": 902
+    },
+    {
+      "epoch": 0.57792,
+      "grad_norm": 0.2972199022769928,
+      "learning_rate": 0.0001689896907216495,
+      "loss": 0.6897,
+      "step": 903
+    },
+    {
+      "epoch": 0.57856,
+      "grad_norm": 0.34232184290885925,
+      "learning_rate": 0.00016894845360824744,
+      "loss": 0.6644,
+      "step": 904
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.30028679966926575,
+      "learning_rate": 0.00016890721649484537,
+      "loss": 0.7296,
+      "step": 905
+    },
+    {
+      "epoch": 0.57984,
+      "grad_norm": 0.3369309902191162,
+      "learning_rate": 0.0001688659793814433,
+      "loss": 0.7256,
+      "step": 906
+    },
+    {
+      "epoch": 0.58048,
+      "grad_norm": 0.3728419840335846,
+      "learning_rate": 0.00016882474226804123,
+      "loss": 0.7035,
+      "step": 907
+    },
+    {
+      "epoch": 0.58112,
+      "grad_norm": 0.3298209607601166,
+      "learning_rate": 0.0001687835051546392,
+      "loss": 0.7744,
+      "step": 908
+    },
+    {
+      "epoch": 0.58176,
+      "grad_norm": 0.331310510635376,
+      "learning_rate": 0.00016874226804123712,
+      "loss": 0.723,
+      "step": 909
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3091805577278137,
+      "learning_rate": 0.00016870103092783508,
+      "loss": 0.7319,
+      "step": 910
+    },
+    {
+      "epoch": 0.58304,
+      "grad_norm": 0.35518544912338257,
+      "learning_rate": 0.000168659793814433,
+      "loss": 0.7018,
+      "step": 911
+    },
+    {
+      "epoch": 0.58368,
+      "grad_norm": 0.3369411528110504,
+      "learning_rate": 0.00016861855670103094,
+      "loss": 0.6626,
+      "step": 912
+    },
+    {
+      "epoch": 0.58432,
+      "grad_norm": 0.3651466965675354,
+      "learning_rate": 0.00016857731958762887,
+      "loss": 0.7184,
+      "step": 913
+    },
+    {
+      "epoch": 0.58496,
+      "grad_norm": 0.3625977337360382,
+      "learning_rate": 0.0001685360824742268,
+      "loss": 0.7235,
+      "step": 914
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.33360275626182556,
+      "learning_rate": 0.00016849484536082476,
+      "loss": 0.669,
+      "step": 915
+    },
+    {
+      "epoch": 0.58624,
+      "grad_norm": 0.3369734585285187,
+      "learning_rate": 0.0001684536082474227,
+      "loss": 0.7566,
+      "step": 916
+    },
+    {
+      "epoch": 0.58688,
+      "grad_norm": 0.36869296431541443,
+      "learning_rate": 0.00016841237113402062,
+      "loss": 0.815,
+      "step": 917
+    },
+    {
+      "epoch": 0.58752,
+      "grad_norm": 0.3215596377849579,
+      "learning_rate": 0.00016837113402061858,
+      "loss": 0.6007,
+      "step": 918
+    },
+    {
+      "epoch": 0.58816,
+      "grad_norm": 0.33065927028656006,
+      "learning_rate": 0.0001683298969072165,
+      "loss": 0.7808,
+      "step": 919
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.28555265069007874,
+      "learning_rate": 0.00016828865979381444,
+      "loss": 0.6182,
+      "step": 920
+    },
+    {
+      "epoch": 0.58944,
+      "grad_norm": 0.30287542939186096,
+      "learning_rate": 0.00016824742268041238,
+      "loss": 0.5704,
+      "step": 921
+    },
+    {
+      "epoch": 0.59008,
+      "grad_norm": 0.33167949318885803,
+      "learning_rate": 0.0001682061855670103,
+      "loss": 0.6968,
+      "step": 922
+    },
+    {
+      "epoch": 0.59072,
+      "grad_norm": 0.34504199028015137,
+      "learning_rate": 0.00016816494845360827,
+      "loss": 0.7429,
+      "step": 923
+    },
+    {
+      "epoch": 0.59136,
+      "grad_norm": 0.2966318428516388,
+      "learning_rate": 0.0001681237113402062,
+      "loss": 0.5487,
+      "step": 924
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3301185667514801,
+      "learning_rate": 0.00016808247422680413,
+      "loss": 0.5468,
+      "step": 925
+    },
+    {
+      "epoch": 0.59264,
+      "grad_norm": 0.3259033262729645,
+      "learning_rate": 0.00016804123711340206,
+      "loss": 0.7658,
+      "step": 926
+    },
+    {
+      "epoch": 0.59328,
+      "grad_norm": 0.3407130241394043,
+      "learning_rate": 0.000168,
+      "loss": 0.7686,
+      "step": 927
+    },
+    {
+      "epoch": 0.59392,
+      "grad_norm": 0.30277690291404724,
+      "learning_rate": 0.00016795876288659795,
+      "loss": 0.6899,
+      "step": 928
+    },
+    {
+      "epoch": 0.59456,
+      "grad_norm": 0.3400406539440155,
+      "learning_rate": 0.00016791752577319588,
+      "loss": 0.6615,
+      "step": 929
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3555680513381958,
+      "learning_rate": 0.0001678762886597938,
+      "loss": 0.721,
+      "step": 930
+    },
+    {
+      "epoch": 0.59584,
+      "grad_norm": 0.3539385199546814,
+      "learning_rate": 0.00016783505154639177,
+      "loss": 0.8091,
+      "step": 931
+    },
+    {
+      "epoch": 0.59648,
+      "grad_norm": 0.3724691569805145,
+      "learning_rate": 0.0001677938144329897,
+      "loss": 0.6678,
+      "step": 932
+    },
+    {
+      "epoch": 0.59712,
+      "grad_norm": 0.32717397809028625,
+      "learning_rate": 0.00016775257731958763,
+      "loss": 0.7497,
+      "step": 933
+    },
+    {
+      "epoch": 0.59776,
+      "grad_norm": 0.303411066532135,
+      "learning_rate": 0.00016771134020618556,
+      "loss": 0.6635,
+      "step": 934
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.35136744379997253,
+      "learning_rate": 0.0001676701030927835,
+      "loss": 0.8002,
+      "step": 935
+    },
+    {
+      "epoch": 0.59904,
+      "grad_norm": 0.32482367753982544,
+      "learning_rate": 0.00016762886597938145,
+      "loss": 0.6675,
+      "step": 936
+    },
+    {
+      "epoch": 0.59968,
+      "grad_norm": 0.33438512682914734,
+      "learning_rate": 0.00016758762886597938,
+      "loss": 0.5938,
+      "step": 937
+    },
+    {
+      "epoch": 0.60032,
+      "grad_norm": 0.3102699816226959,
+      "learning_rate": 0.00016754639175257734,
+      "loss": 0.7251,
+      "step": 938
+    },
+    {
+      "epoch": 0.60096,
+      "grad_norm": 0.30473557114601135,
+      "learning_rate": 0.00016750515463917527,
+      "loss": 0.6338,
+      "step": 939
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.31448429822921753,
+      "learning_rate": 0.0001674639175257732,
+      "loss": 0.6642,
+      "step": 940
+    },
+    {
+      "epoch": 0.60224,
+      "grad_norm": 0.31132256984710693,
+      "learning_rate": 0.00016742268041237114,
+      "loss": 0.639,
+      "step": 941
+    },
+    {
+      "epoch": 0.60288,
+      "grad_norm": 0.346417635679245,
+      "learning_rate": 0.00016738144329896907,
+      "loss": 0.6373,
+      "step": 942
+    },
+    {
+      "epoch": 0.60352,
+      "grad_norm": 0.3371962904930115,
+      "learning_rate": 0.00016734020618556703,
+      "loss": 0.7374,
+      "step": 943
+    },
+    {
+      "epoch": 0.60416,
+      "grad_norm": 0.3331504464149475,
+      "learning_rate": 0.00016729896907216496,
+      "loss": 0.7302,
+      "step": 944
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.33896365761756897,
+      "learning_rate": 0.0001672577319587629,
+      "loss": 0.7194,
+      "step": 945
+    },
+    {
+      "epoch": 0.60544,
+      "grad_norm": 0.3428703546524048,
+      "learning_rate": 0.00016721649484536085,
+      "loss": 0.7848,
+      "step": 946
+    },
+    {
+      "epoch": 0.60608,
+      "grad_norm": 0.3296687602996826,
+      "learning_rate": 0.00016717525773195878,
+      "loss": 0.6241,
+      "step": 947
+    },
+    {
+      "epoch": 0.60672,
+      "grad_norm": 0.3429982364177704,
+      "learning_rate": 0.0001671340206185567,
+      "loss": 0.6811,
+      "step": 948
+    },
+    {
+      "epoch": 0.60736,
+      "grad_norm": 0.3100501298904419,
+      "learning_rate": 0.00016709278350515464,
+      "loss": 0.7782,
+      "step": 949
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3103218972682953,
+      "learning_rate": 0.00016705154639175257,
+      "loss": 0.6722,
+      "step": 950
+    },
+    {
+      "epoch": 0.60864,
+      "grad_norm": 0.2994841933250427,
+      "learning_rate": 0.00016701030927835053,
+      "loss": 0.6558,
+      "step": 951
+    },
+    {
+      "epoch": 0.60928,
+      "grad_norm": 0.31650424003601074,
+      "learning_rate": 0.00016696907216494846,
+      "loss": 0.7694,
+      "step": 952
+    },
+    {
+      "epoch": 0.60992,
+      "grad_norm": 0.3342834711074829,
+      "learning_rate": 0.00016692783505154642,
+      "loss": 0.8535,
+      "step": 953
+    },
+    {
+      "epoch": 0.61056,
+      "grad_norm": 0.34426960349082947,
+      "learning_rate": 0.00016688659793814435,
+      "loss": 0.6073,
+      "step": 954
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.34301304817199707,
+      "learning_rate": 0.00016684536082474226,
+      "loss": 0.6019,
+      "step": 955
+    },
+    {
+      "epoch": 0.61184,
+      "grad_norm": 0.35110563039779663,
+      "learning_rate": 0.00016680412371134021,
+      "loss": 0.7701,
+      "step": 956
+    },
+    {
+      "epoch": 0.61248,
+      "grad_norm": 0.32015711069107056,
+      "learning_rate": 0.00016676288659793815,
+      "loss": 0.5966,
+      "step": 957
+    },
+    {
+      "epoch": 0.61312,
+      "grad_norm": 0.3504869043827057,
+      "learning_rate": 0.00016672164948453608,
+      "loss": 0.7685,
+      "step": 958
+    },
+    {
+      "epoch": 0.61376,
+      "grad_norm": 0.3113017976284027,
+      "learning_rate": 0.00016668041237113404,
+      "loss": 0.64,
+      "step": 959
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3245644271373749,
+      "learning_rate": 0.00016663917525773197,
+      "loss": 0.7616,
+      "step": 960
+    },
+    {
+      "epoch": 0.61504,
+      "grad_norm": 0.31344395875930786,
+      "learning_rate": 0.00016659793814432993,
+      "loss": 0.6728,
+      "step": 961
+    },
+    {
+      "epoch": 0.61568,
+      "grad_norm": 0.3070579469203949,
+      "learning_rate": 0.00016655670103092783,
+      "loss": 0.5944,
+      "step": 962
+    },
+    {
+      "epoch": 0.61632,
+      "grad_norm": 0.35053929686546326,
+      "learning_rate": 0.00016651546391752576,
+      "loss": 0.7577,
+      "step": 963
+    },
+    {
+      "epoch": 0.61696,
+      "grad_norm": 0.30264538526535034,
+      "learning_rate": 0.00016647422680412372,
+      "loss": 0.803,
+      "step": 964
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.363376647233963,
+      "learning_rate": 0.00016643298969072165,
+      "loss": 0.7972,
+      "step": 965
+    },
+    {
+      "epoch": 0.61824,
+      "grad_norm": 0.30571556091308594,
+      "learning_rate": 0.0001663917525773196,
+      "loss": 0.6544,
+      "step": 966
+    },
+    {
+      "epoch": 0.61888,
+      "grad_norm": 0.33258652687072754,
+      "learning_rate": 0.00016635051546391754,
+      "loss": 0.7373,
+      "step": 967
+    },
+    {
+      "epoch": 0.61952,
+      "grad_norm": 0.3081071376800537,
+      "learning_rate": 0.00016630927835051547,
+      "loss": 0.6544,
+      "step": 968
+    },
+    {
+      "epoch": 0.62016,
+      "grad_norm": 0.32135868072509766,
+      "learning_rate": 0.0001662680412371134,
+      "loss": 0.6388,
+      "step": 969
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3405911922454834,
+      "learning_rate": 0.00016622680412371133,
+      "loss": 0.7722,
+      "step": 970
+    },
+    {
+      "epoch": 0.62144,
+      "grad_norm": 0.3372403085231781,
+      "learning_rate": 0.0001661855670103093,
+      "loss": 0.5981,
+      "step": 971
+    },
+    {
+      "epoch": 0.62208,
+      "grad_norm": 0.3280060887336731,
+      "learning_rate": 0.00016614432989690722,
+      "loss": 0.6649,
+      "step": 972
+    },
+    {
+      "epoch": 0.62272,
+      "grad_norm": 0.327841192483902,
+      "learning_rate": 0.00016610309278350515,
+      "loss": 0.6551,
+      "step": 973
+    },
+    {
+      "epoch": 0.62336,
+      "grad_norm": 0.29221057891845703,
+      "learning_rate": 0.0001660618556701031,
+      "loss": 0.6056,
+      "step": 974
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33232420682907104,
+      "learning_rate": 0.00016602061855670104,
+      "loss": 0.7446,
+      "step": 975
+    },
+    {
+      "epoch": 0.62464,
+      "grad_norm": 0.3291051983833313,
+      "learning_rate": 0.00016597938144329898,
+      "loss": 0.6772,
+      "step": 976
+    },
+    {
+      "epoch": 0.62528,
+      "grad_norm": 0.3483774662017822,
+      "learning_rate": 0.0001659381443298969,
+      "loss": 0.6615,
+      "step": 977
+    },
+    {
+      "epoch": 0.62592,
+      "grad_norm": 0.30923715233802795,
+      "learning_rate": 0.00016589690721649484,
+      "loss": 0.7244,
+      "step": 978
+    },
+    {
+      "epoch": 0.62656,
+      "grad_norm": 0.33429014682769775,
+      "learning_rate": 0.0001658556701030928,
+      "loss": 0.6586,
+      "step": 979
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3600437641143799,
+      "learning_rate": 0.00016581443298969073,
+      "loss": 0.7631,
+      "step": 980
+    },
+    {
+      "epoch": 0.62784,
+      "grad_norm": 0.3739021122455597,
+      "learning_rate": 0.00016577319587628869,
+      "loss": 0.6549,
+      "step": 981
+    },
+    {
+      "epoch": 0.62848,
+      "grad_norm": 0.339814692735672,
+      "learning_rate": 0.00016573195876288662,
+      "loss": 0.7362,
+      "step": 982
+    },
+    {
+      "epoch": 0.62912,
+      "grad_norm": 0.3724810481071472,
+      "learning_rate": 0.00016569072164948455,
+      "loss": 0.7713,
+      "step": 983
+    },
+    {
+      "epoch": 0.62976,
+      "grad_norm": 0.34007740020751953,
+      "learning_rate": 0.00016564948453608248,
+      "loss": 0.7068,
+      "step": 984
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.35730883479118347,
+      "learning_rate": 0.0001656082474226804,
+      "loss": 0.879,
+      "step": 985
+    },
+    {
+      "epoch": 0.63104,
+      "grad_norm": 0.3350812494754791,
+      "learning_rate": 0.00016556701030927834,
+      "loss": 0.7646,
+      "step": 986
+    },
+    {
+      "epoch": 0.63168,
+      "grad_norm": 0.3421354293823242,
+      "learning_rate": 0.0001655257731958763,
+      "loss": 0.7043,
+      "step": 987
+    },
+    {
+      "epoch": 0.63232,
+      "grad_norm": 0.3471168875694275,
+      "learning_rate": 0.00016548453608247423,
+      "loss": 0.6216,
+      "step": 988
+    },
+    {
+      "epoch": 0.63296,
+      "grad_norm": 0.31793344020843506,
+      "learning_rate": 0.0001654432989690722,
+      "loss": 0.7128,
+      "step": 989
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.33751174807548523,
+      "learning_rate": 0.00016540206185567012,
+      "loss": 0.6313,
+      "step": 990
+    },
+    {
+      "epoch": 0.63424,
+      "grad_norm": 0.3807850778102875,
+      "learning_rate": 0.00016536082474226803,
+      "loss": 0.7691,
+      "step": 991
+    },
+    {
+      "epoch": 0.63488,
+      "grad_norm": 0.33190733194351196,
+      "learning_rate": 0.00016531958762886598,
+      "loss": 0.6755,
+      "step": 992
+    },
+    {
+      "epoch": 0.63552,
+      "grad_norm": 0.341623455286026,
+      "learning_rate": 0.00016527835051546392,
+      "loss": 0.687,
+      "step": 993
+    },
+    {
+      "epoch": 0.63616,
+      "grad_norm": 0.3153999149799347,
+      "learning_rate": 0.00016523711340206187,
+      "loss": 0.773,
+      "step": 994
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3124261796474457,
+      "learning_rate": 0.0001651958762886598,
+      "loss": 0.6754,
+      "step": 995
+    },
+    {
+      "epoch": 0.63744,
+      "grad_norm": 0.3141627907752991,
+      "learning_rate": 0.00016515463917525774,
+      "loss": 0.7158,
+      "step": 996
+    },
+    {
+      "epoch": 0.63808,
+      "grad_norm": 0.31757795810699463,
+      "learning_rate": 0.0001651134020618557,
+      "loss": 0.6942,
+      "step": 997
+    },
+    {
+      "epoch": 0.63872,
+      "grad_norm": 0.3496686816215515,
+      "learning_rate": 0.0001650721649484536,
+      "loss": 0.7345,
+      "step": 998
+    },
+    {
+      "epoch": 0.63936,
+      "grad_norm": 0.3580033481121063,
+      "learning_rate": 0.00016503092783505156,
+      "loss": 0.6239,
+      "step": 999
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3445329964160919,
+      "learning_rate": 0.0001649896907216495,
+      "loss": 0.6492,
+      "step": 1000
+    },
+    {
+      "epoch": 0.64064,
+      "grad_norm": 0.3224124312400818,
+      "learning_rate": 0.00016494845360824742,
+      "loss": 0.6641,
+      "step": 1001
+    },
+    {
+      "epoch": 0.64128,
+      "grad_norm": 0.3362014889717102,
+      "learning_rate": 0.00016490721649484538,
+      "loss": 0.608,
+      "step": 1002
+    },
+    {
+      "epoch": 0.64192,
+      "grad_norm": 0.32538992166519165,
+      "learning_rate": 0.0001648659793814433,
+      "loss": 0.716,
+      "step": 1003
+    },
+    {
+      "epoch": 0.64256,
+      "grad_norm": 0.323122501373291,
+      "learning_rate": 0.00016482474226804127,
+      "loss": 0.6479,
+      "step": 1004
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.34535911679267883,
+      "learning_rate": 0.00016478350515463917,
+      "loss": 0.7522,
+      "step": 1005
+    },
+    {
+      "epoch": 0.64384,
+      "grad_norm": 0.3218434751033783,
+      "learning_rate": 0.0001647422680412371,
+      "loss": 0.6887,
+      "step": 1006
+    },
+    {
+      "epoch": 0.64448,
+      "grad_norm": 0.2968931198120117,
+      "learning_rate": 0.00016470103092783506,
+      "loss": 0.6368,
+      "step": 1007
+    },
+    {
+      "epoch": 0.64512,
+      "grad_norm": 0.3102390170097351,
+      "learning_rate": 0.000164659793814433,
+      "loss": 0.685,
+      "step": 1008
+    },
+    {
+      "epoch": 0.64576,
+      "grad_norm": 0.31813618540763855,
+      "learning_rate": 0.00016461855670103095,
+      "loss": 0.5939,
+      "step": 1009
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.31699109077453613,
+      "learning_rate": 0.00016457731958762888,
+      "loss": 0.6605,
+      "step": 1010
+    },
+    {
+      "epoch": 0.64704,
+      "grad_norm": 0.30718788504600525,
+      "learning_rate": 0.00016453608247422681,
+      "loss": 0.7266,
+      "step": 1011
+    },
+    {
+      "epoch": 0.64768,
+      "grad_norm": 0.32789191603660583,
+      "learning_rate": 0.00016449484536082475,
+      "loss": 0.6372,
+      "step": 1012
+    },
+    {
+      "epoch": 0.64832,
+      "grad_norm": 0.34586301445961,
+      "learning_rate": 0.00016445360824742268,
+      "loss": 0.6332,
+      "step": 1013
+    },
+    {
+      "epoch": 0.64896,
+      "grad_norm": 0.31883180141448975,
+      "learning_rate": 0.00016441237113402063,
+      "loss": 0.7089,
+      "step": 1014
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.36051660776138306,
+      "learning_rate": 0.00016437113402061857,
+      "loss": 0.6881,
+      "step": 1015
+    },
+    {
+      "epoch": 0.65024,
+      "grad_norm": 0.32305341958999634,
+      "learning_rate": 0.0001643298969072165,
+      "loss": 0.7483,
+      "step": 1016
+    },
+    {
+      "epoch": 0.65088,
+      "grad_norm": 0.3307187557220459,
+      "learning_rate": 0.00016428865979381446,
+      "loss": 0.784,
+      "step": 1017
+    },
+    {
+      "epoch": 0.65152,
+      "grad_norm": 0.33016180992126465,
+      "learning_rate": 0.0001642474226804124,
+      "loss": 0.6995,
+      "step": 1018
+    },
+    {
+      "epoch": 0.65216,
+      "grad_norm": 0.32196250557899475,
+      "learning_rate": 0.00016420618556701032,
+      "loss": 0.6612,
+      "step": 1019
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3720751702785492,
+      "learning_rate": 0.00016416494845360825,
+      "loss": 0.7381,
+      "step": 1020
+    },
+    {
+      "epoch": 0.65344,
+      "grad_norm": 0.32992464303970337,
+      "learning_rate": 0.00016412371134020618,
+      "loss": 0.7506,
+      "step": 1021
+    },
+    {
+      "epoch": 0.65408,
+      "grad_norm": 0.34346675872802734,
+      "learning_rate": 0.00016408247422680414,
+      "loss": 0.6467,
+      "step": 1022
+    },
+    {
+      "epoch": 0.65472,
+      "grad_norm": 0.3066868484020233,
+      "learning_rate": 0.00016404123711340207,
+      "loss": 0.6626,
+      "step": 1023
+    },
+    {
+      "epoch": 0.65536,
+      "grad_norm": 0.33430781960487366,
+      "learning_rate": 0.000164,
+      "loss": 0.7345,
+      "step": 1024
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.32400012016296387,
+      "learning_rate": 0.00016395876288659796,
+      "loss": 0.7087,
+      "step": 1025
+    },
+    {
+      "epoch": 0.65664,
+      "grad_norm": 0.33005645871162415,
+      "learning_rate": 0.0001639175257731959,
+      "loss": 0.6602,
+      "step": 1026
+    },
+    {
+      "epoch": 0.65728,
+      "grad_norm": 0.3338261842727661,
+      "learning_rate": 0.00016387628865979382,
+      "loss": 0.7808,
+      "step": 1027
+    },
+    {
+      "epoch": 0.65792,
+      "grad_norm": 0.31470897793769836,
+      "learning_rate": 0.00016383505154639175,
+      "loss": 0.6633,
+      "step": 1028
+    },
+    {
+      "epoch": 0.65856,
+      "grad_norm": 0.30812013149261475,
+      "learning_rate": 0.00016379381443298969,
+      "loss": 0.6969,
+      "step": 1029
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.32313019037246704,
+      "learning_rate": 0.00016375257731958764,
+      "loss": 0.6779,
+      "step": 1030
+    },
+    {
+      "epoch": 0.65984,
+      "grad_norm": 0.31749245524406433,
+      "learning_rate": 0.00016371134020618558,
+      "loss": 0.6248,
+      "step": 1031
+    },
+    {
+      "epoch": 0.66048,
+      "grad_norm": 0.3323476314544678,
+      "learning_rate": 0.00016367010309278353,
+      "loss": 0.5641,
+      "step": 1032
+    },
+    {
+      "epoch": 0.66112,
+      "grad_norm": 0.38572579622268677,
+      "learning_rate": 0.00016362886597938146,
+      "loss": 0.7785,
+      "step": 1033
+    },
+    {
+      "epoch": 0.66176,
+      "grad_norm": 0.33272165060043335,
+      "learning_rate": 0.00016358762886597937,
+      "loss": 0.7344,
+      "step": 1034
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.33589982986450195,
+      "learning_rate": 0.00016354639175257733,
+      "loss": 0.7063,
+      "step": 1035
+    },
+    {
+      "epoch": 0.66304,
+      "grad_norm": 0.3410196006298065,
+      "learning_rate": 0.00016350515463917526,
+      "loss": 0.7152,
+      "step": 1036
+    },
+    {
+      "epoch": 0.66368,
+      "grad_norm": 0.3133014738559723,
+      "learning_rate": 0.00016346391752577322,
+      "loss": 0.7182,
+      "step": 1037
+    },
+    {
+      "epoch": 0.66432,
+      "grad_norm": 0.33403444290161133,
+      "learning_rate": 0.00016342268041237115,
+      "loss": 0.5634,
+      "step": 1038
+    },
+    {
+      "epoch": 0.66496,
+      "grad_norm": 0.34611040353775024,
+      "learning_rate": 0.00016338144329896908,
+      "loss": 0.7402,
+      "step": 1039
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.34640181064605713,
+      "learning_rate": 0.000163340206185567,
+      "loss": 0.7054,
+      "step": 1040
+    },
+    {
+      "epoch": 0.66624,
+      "grad_norm": 0.31487298011779785,
+      "learning_rate": 0.00016329896907216494,
+      "loss": 0.7865,
+      "step": 1041
+    },
+    {
+      "epoch": 0.66688,
+      "grad_norm": 0.3364202678203583,
+      "learning_rate": 0.0001632577319587629,
+      "loss": 0.7265,
+      "step": 1042
+    },
+    {
+      "epoch": 0.66752,
+      "grad_norm": 0.3132322132587433,
+      "learning_rate": 0.00016321649484536083,
+      "loss": 0.6427,
+      "step": 1043
+    },
+    {
+      "epoch": 0.66816,
+      "grad_norm": 0.3442807197570801,
+      "learning_rate": 0.00016317525773195876,
+      "loss": 0.6644,
+      "step": 1044
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.33366212248802185,
+      "learning_rate": 0.00016313402061855672,
+      "loss": 0.7323,
+      "step": 1045
+    },
+    {
+      "epoch": 0.66944,
+      "grad_norm": 0.3035692572593689,
+      "learning_rate": 0.00016309278350515465,
+      "loss": 0.7639,
+      "step": 1046
+    },
+    {
+      "epoch": 0.67008,
+      "grad_norm": 0.32975611090660095,
+      "learning_rate": 0.00016305154639175258,
+      "loss": 0.6307,
+      "step": 1047
+    },
+    {
+      "epoch": 0.67072,
+      "grad_norm": 0.32276296615600586,
+      "learning_rate": 0.00016301030927835052,
+      "loss": 0.6573,
+      "step": 1048
+    },
+    {
+      "epoch": 0.67136,
+      "grad_norm": 0.33434420824050903,
+      "learning_rate": 0.00016296907216494845,
+      "loss": 0.7149,
+      "step": 1049
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3394414782524109,
+      "learning_rate": 0.0001629278350515464,
+      "loss": 0.6849,
+      "step": 1050
+    },
+    {
+      "epoch": 0.67264,
+      "grad_norm": 0.3162088692188263,
+      "learning_rate": 0.00016288659793814434,
+      "loss": 0.7282,
+      "step": 1051
+    },
+    {
+      "epoch": 0.67328,
+      "grad_norm": 0.3480288088321686,
+      "learning_rate": 0.00016284536082474227,
+      "loss": 0.6422,
+      "step": 1052
+    },
+    {
+      "epoch": 0.67392,
+      "grad_norm": 0.34124842286109924,
+      "learning_rate": 0.00016280412371134023,
+      "loss": 0.5823,
+      "step": 1053
+    },
+    {
+      "epoch": 0.67456,
+      "grad_norm": 0.32485413551330566,
+      "learning_rate": 0.00016276288659793816,
+      "loss": 0.6702,
+      "step": 1054
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.34774544835090637,
+      "learning_rate": 0.0001627216494845361,
+      "loss": 0.6977,
+      "step": 1055
+    },
+    {
+      "epoch": 0.67584,
+      "grad_norm": 0.3193374574184418,
+      "learning_rate": 0.00016268041237113402,
+      "loss": 0.7194,
+      "step": 1056
+    },
+    {
+      "epoch": 0.67648,
+      "grad_norm": 0.31021422147750854,
+      "learning_rate": 0.00016263917525773195,
+      "loss": 0.6316,
+      "step": 1057
+    },
+    {
+      "epoch": 0.67712,
+      "grad_norm": 0.34140804409980774,
+      "learning_rate": 0.0001625979381443299,
+      "loss": 0.6174,
+      "step": 1058
+    },
+    {
+      "epoch": 0.67776,
+      "grad_norm": 0.3179076611995697,
+      "learning_rate": 0.00016255670103092784,
+      "loss": 0.6361,
+      "step": 1059
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.333273321390152,
+      "learning_rate": 0.0001625154639175258,
+      "loss": 0.6562,
+      "step": 1060
+    },
+    {
+      "epoch": 0.67904,
+      "grad_norm": 0.32020020484924316,
+      "learning_rate": 0.00016247422680412373,
+      "loss": 0.6781,
+      "step": 1061
+    },
+    {
+      "epoch": 0.67968,
+      "grad_norm": 0.3052760362625122,
+      "learning_rate": 0.00016243298969072166,
+      "loss": 0.7539,
+      "step": 1062
+    },
+    {
+      "epoch": 0.68032,
+      "grad_norm": 0.34312549233436584,
+      "learning_rate": 0.0001623917525773196,
+      "loss": 0.7032,
+      "step": 1063
+    },
+    {
+      "epoch": 0.68096,
+      "grad_norm": 0.35150110721588135,
+      "learning_rate": 0.00016235051546391752,
+      "loss": 0.7346,
+      "step": 1064
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.35477447509765625,
+      "learning_rate": 0.00016230927835051548,
+      "loss": 0.7253,
+      "step": 1065
+    },
+    {
+      "epoch": 0.68224,
+      "grad_norm": 0.32397547364234924,
+      "learning_rate": 0.00016226804123711341,
+      "loss": 0.6916,
+      "step": 1066
+    },
+    {
+      "epoch": 0.68288,
+      "grad_norm": 0.32659369707107544,
+      "learning_rate": 0.00016222680412371134,
+      "loss": 0.7074,
+      "step": 1067
+    },
+    {
+      "epoch": 0.68352,
+      "grad_norm": 0.33425620198249817,
+      "learning_rate": 0.0001621855670103093,
+      "loss": 0.5837,
+      "step": 1068
+    },
+    {
+      "epoch": 0.68416,
+      "grad_norm": 0.337050199508667,
+      "learning_rate": 0.00016214432989690723,
+      "loss": 0.5633,
+      "step": 1069
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3391391336917877,
+      "learning_rate": 0.00016210309278350517,
+      "loss": 0.794,
+      "step": 1070
+    },
+    {
+      "epoch": 0.68544,
+      "grad_norm": 0.33143994212150574,
+      "learning_rate": 0.0001620618556701031,
+      "loss": 0.7853,
+      "step": 1071
+    },
+    {
+      "epoch": 0.68608,
+      "grad_norm": 0.3296218514442444,
+      "learning_rate": 0.00016202061855670103,
+      "loss": 0.6557,
+      "step": 1072
+    },
+    {
+      "epoch": 0.68672,
+      "grad_norm": 0.3163990378379822,
+      "learning_rate": 0.000161979381443299,
+      "loss": 0.6381,
+      "step": 1073
+    },
+    {
+      "epoch": 0.68736,
+      "grad_norm": 0.33094969391822815,
+      "learning_rate": 0.00016193814432989692,
+      "loss": 0.7631,
+      "step": 1074
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.37997323274612427,
+      "learning_rate": 0.00016189690721649488,
+      "loss": 0.7575,
+      "step": 1075
+    },
+    {
+      "epoch": 0.68864,
+      "grad_norm": 0.34439441561698914,
+      "learning_rate": 0.00016185567010309278,
+      "loss": 0.7235,
+      "step": 1076
+    },
+    {
+      "epoch": 0.68928,
+      "grad_norm": 0.34241726994514465,
+      "learning_rate": 0.0001618144329896907,
+      "loss": 0.6748,
+      "step": 1077
+    },
+    {
+      "epoch": 0.68992,
+      "grad_norm": 0.3499857187271118,
+      "learning_rate": 0.00016177319587628867,
+      "loss": 0.6797,
+      "step": 1078
+    },
+    {
+      "epoch": 0.69056,
+      "grad_norm": 0.34571373462677,
+      "learning_rate": 0.0001617319587628866,
+      "loss": 0.6945,
+      "step": 1079
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.303170770406723,
+      "learning_rate": 0.00016169072164948453,
+      "loss": 0.6487,
+      "step": 1080
+    },
+    {
+      "epoch": 0.69184,
+      "grad_norm": 0.34839507937431335,
+      "learning_rate": 0.0001616494845360825,
+      "loss": 0.6996,
+      "step": 1081
+    },
+    {
+      "epoch": 0.69248,
+      "grad_norm": 0.3313436806201935,
+      "learning_rate": 0.00016160824742268042,
+      "loss": 0.7085,
+      "step": 1082
+    },
+    {
+      "epoch": 0.69312,
+      "grad_norm": 0.3179072439670563,
+      "learning_rate": 0.00016156701030927835,
+      "loss": 0.6593,
+      "step": 1083
+    },
+    {
+      "epoch": 0.69376,
+      "grad_norm": 0.34023839235305786,
+      "learning_rate": 0.00016152577319587628,
+      "loss": 0.7743,
+      "step": 1084
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.31812798976898193,
+      "learning_rate": 0.00016148453608247422,
+      "loss": 0.6789,
+      "step": 1085
+    },
+    {
+      "epoch": 0.69504,
+      "grad_norm": 0.34498313069343567,
+      "learning_rate": 0.00016144329896907217,
+      "loss": 0.7095,
+      "step": 1086
+    },
+    {
+      "epoch": 0.69568,
+      "grad_norm": 0.3462882339954376,
+      "learning_rate": 0.0001614020618556701,
+      "loss": 0.7244,
+      "step": 1087
+    },
+    {
+      "epoch": 0.69632,
+      "grad_norm": 0.3056762218475342,
+      "learning_rate": 0.00016136082474226806,
+      "loss": 0.6776,
+      "step": 1088
+    },
+    {
+      "epoch": 0.69696,
+      "grad_norm": 0.32622817158699036,
+      "learning_rate": 0.000161319587628866,
+      "loss": 0.6913,
+      "step": 1089
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3265339136123657,
+      "learning_rate": 0.00016127835051546393,
+      "loss": 0.6769,
+      "step": 1090
+    },
+    {
+      "epoch": 0.69824,
+      "grad_norm": 0.35580015182495117,
+      "learning_rate": 0.00016123711340206186,
+      "loss": 0.722,
+      "step": 1091
+    },
+    {
+      "epoch": 0.69888,
+      "grad_norm": 0.35644227266311646,
+      "learning_rate": 0.0001611958762886598,
+      "loss": 0.6149,
+      "step": 1092
+    },
+    {
+      "epoch": 0.69952,
+      "grad_norm": 0.3259362280368805,
+      "learning_rate": 0.00016115463917525775,
+      "loss": 0.7277,
+      "step": 1093
+    },
+    {
+      "epoch": 0.70016,
+      "grad_norm": 0.32481932640075684,
+      "learning_rate": 0.00016111340206185568,
+      "loss": 0.6586,
+      "step": 1094
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3192051649093628,
+      "learning_rate": 0.0001610721649484536,
+      "loss": 0.7559,
+      "step": 1095
+    },
+    {
+      "epoch": 0.70144,
+      "grad_norm": 0.3211449980735779,
+      "learning_rate": 0.00016103092783505157,
+      "loss": 0.7427,
+      "step": 1096
+    },
+    {
+      "epoch": 0.70208,
+      "grad_norm": 0.345618337392807,
+      "learning_rate": 0.0001609896907216495,
+      "loss": 0.7099,
+      "step": 1097
+    },
+    {
+      "epoch": 0.70272,
+      "grad_norm": 0.31711891293525696,
+      "learning_rate": 0.00016094845360824743,
+      "loss": 0.6554,
+      "step": 1098
+    },
+    {
+      "epoch": 0.70336,
+      "grad_norm": 0.32048436999320984,
+      "learning_rate": 0.00016090721649484536,
+      "loss": 0.6337,
+      "step": 1099
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3331568241119385,
+      "learning_rate": 0.0001608659793814433,
+      "loss": 0.8142,
+      "step": 1100
+    },
+    {
+      "epoch": 0.70464,
+      "grad_norm": 0.33839017152786255,
+      "learning_rate": 0.00016082474226804125,
+      "loss": 0.7141,
+      "step": 1101
+    },
+    {
+      "epoch": 0.70528,
+      "grad_norm": 0.33948853611946106,
+      "learning_rate": 0.00016078350515463918,
+      "loss": 0.7301,
+      "step": 1102
+    },
+    {
+      "epoch": 0.70592,
+      "grad_norm": 0.3080214560031891,
+      "learning_rate": 0.00016074226804123714,
+      "loss": 0.7078,
+      "step": 1103
+    },
+    {
+      "epoch": 0.70656,
+      "grad_norm": 0.3655720055103302,
+      "learning_rate": 0.00016070103092783507,
+      "loss": 0.6938,
+      "step": 1104
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3072998821735382,
+      "learning_rate": 0.00016065979381443298,
+      "loss": 0.6262,
+      "step": 1105
+    },
+    {
+      "epoch": 0.70784,
+      "grad_norm": 0.3594801723957062,
+      "learning_rate": 0.00016061855670103094,
+      "loss": 0.7591,
+      "step": 1106
+    },
+    {
+      "epoch": 0.70848,
+      "grad_norm": 0.30161359906196594,
+      "learning_rate": 0.00016057731958762887,
+      "loss": 0.7809,
+      "step": 1107
+    },
+    {
+      "epoch": 0.70912,
+      "grad_norm": 0.34191441535949707,
+      "learning_rate": 0.0001605360824742268,
+      "loss": 0.6935,
+      "step": 1108
+    },
+    {
+      "epoch": 0.70976,
+      "grad_norm": 0.30140718817710876,
+      "learning_rate": 0.00016049484536082476,
+      "loss": 0.6536,
+      "step": 1109
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3341141641139984,
+      "learning_rate": 0.0001604536082474227,
+      "loss": 0.7743,
+      "step": 1110
+    },
+    {
+      "epoch": 0.71104,
+      "grad_norm": 0.3386651277542114,
+      "learning_rate": 0.00016041237113402065,
+      "loss": 0.7675,
+      "step": 1111
+    },
+    {
+      "epoch": 0.71168,
+      "grad_norm": 0.30079570412635803,
+      "learning_rate": 0.00016037113402061855,
+      "loss": 0.6616,
+      "step": 1112
+    },
+    {
+      "epoch": 0.71232,
+      "grad_norm": 0.32636722922325134,
+      "learning_rate": 0.00016032989690721648,
+      "loss": 0.7389,
+      "step": 1113
+    },
+    {
+      "epoch": 0.71296,
+      "grad_norm": 0.33846405148506165,
+      "learning_rate": 0.00016028865979381444,
+      "loss": 0.6944,
+      "step": 1114
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.30079424381256104,
+      "learning_rate": 0.00016024742268041237,
+      "loss": 0.6936,
+      "step": 1115
+    },
+    {
+      "epoch": 0.71424,
+      "grad_norm": 0.3410959839820862,
+      "learning_rate": 0.00016020618556701033,
+      "loss": 0.5932,
+      "step": 1116
+    },
+    {
+      "epoch": 0.71488,
+      "grad_norm": 0.3442777395248413,
+      "learning_rate": 0.00016016494845360826,
+      "loss": 0.7962,
+      "step": 1117
+    },
+    {
+      "epoch": 0.71552,
+      "grad_norm": 0.3035128116607666,
+      "learning_rate": 0.0001601237113402062,
+      "loss": 0.5364,
+      "step": 1118
+    },
+    {
+      "epoch": 0.71616,
+      "grad_norm": 0.34147998690605164,
+      "learning_rate": 0.00016008247422680412,
+      "loss": 0.6958,
+      "step": 1119
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.34075674414634705,
+      "learning_rate": 0.00016004123711340205,
+      "loss": 0.7481,
+      "step": 1120
+    },
+    {
+      "epoch": 0.71744,
+      "grad_norm": 0.32701680064201355,
+      "learning_rate": 0.00016,
+      "loss": 0.7091,
+      "step": 1121
+    },
+    {
+      "epoch": 0.71808,
+      "grad_norm": 0.32415634393692017,
+      "learning_rate": 0.00015995876288659794,
+      "loss": 0.652,
+      "step": 1122
+    },
+    {
+      "epoch": 0.71872,
+      "grad_norm": 0.35673046112060547,
+      "learning_rate": 0.00015991752577319588,
+      "loss": 0.7243,
+      "step": 1123
+    },
+    {
+      "epoch": 0.71936,
+      "grad_norm": 0.30253833532333374,
+      "learning_rate": 0.00015987628865979383,
+      "loss": 0.6633,
+      "step": 1124
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3327553868293762,
+      "learning_rate": 0.00015983505154639177,
+      "loss": 0.7344,
+      "step": 1125
+    },
+    {
+      "epoch": 0.72064,
+      "grad_norm": 0.37562182545661926,
+      "learning_rate": 0.0001597938144329897,
+      "loss": 0.7011,
+      "step": 1126
+    },
+    {
+      "epoch": 0.72128,
+      "grad_norm": 0.3346128463745117,
+      "learning_rate": 0.00015975257731958763,
+      "loss": 0.8312,
+      "step": 1127
+    },
+    {
+      "epoch": 0.72192,
+      "grad_norm": 0.3139009475708008,
+      "learning_rate": 0.00015971134020618556,
+      "loss": 0.6953,
+      "step": 1128
+    },
+    {
+      "epoch": 0.72256,
+      "grad_norm": 0.31798607110977173,
+      "learning_rate": 0.00015967010309278352,
+      "loss": 0.6581,
+      "step": 1129
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.35555124282836914,
+      "learning_rate": 0.00015962886597938145,
+      "loss": 0.6822,
+      "step": 1130
+    },
+    {
+      "epoch": 0.72384,
+      "grad_norm": 0.35806533694267273,
+      "learning_rate": 0.0001595876288659794,
+      "loss": 0.8015,
+      "step": 1131
+    },
+    {
+      "epoch": 0.72448,
+      "grad_norm": 0.2988489866256714,
+      "learning_rate": 0.00015954639175257734,
+      "loss": 0.5956,
+      "step": 1132
+    },
+    {
+      "epoch": 0.72512,
+      "grad_norm": 0.32034188508987427,
+      "learning_rate": 0.00015950515463917527,
+      "loss": 0.6814,
+      "step": 1133
+    },
+    {
+      "epoch": 0.72576,
+      "grad_norm": 0.34971657395362854,
+      "learning_rate": 0.0001594639175257732,
+      "loss": 0.6364,
+      "step": 1134
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3323684632778168,
+      "learning_rate": 0.00015942268041237113,
+      "loss": 0.5821,
+      "step": 1135
+    },
+    {
+      "epoch": 0.72704,
+      "grad_norm": 0.3544963598251343,
+      "learning_rate": 0.0001593814432989691,
+      "loss": 0.6273,
+      "step": 1136
+    },
+    {
+      "epoch": 0.72768,
+      "grad_norm": 0.3518659770488739,
+      "learning_rate": 0.00015934020618556702,
+      "loss": 0.687,
+      "step": 1137
+    },
+    {
+      "epoch": 0.72832,
+      "grad_norm": 0.325310617685318,
+      "learning_rate": 0.00015929896907216495,
+      "loss": 0.7532,
+      "step": 1138
+    },
+    {
+      "epoch": 0.72896,
+      "grad_norm": 0.3553629517555237,
+      "learning_rate": 0.0001592577319587629,
+      "loss": 0.6296,
+      "step": 1139
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.34940364956855774,
+      "learning_rate": 0.00015921649484536084,
+      "loss": 0.7721,
+      "step": 1140
+    },
+    {
+      "epoch": 0.73024,
+      "grad_norm": 0.31204769015312195,
+      "learning_rate": 0.00015917525773195875,
+      "loss": 0.6751,
+      "step": 1141
+    },
+    {
+      "epoch": 0.73088,
+      "grad_norm": 0.38105154037475586,
+      "learning_rate": 0.0001591340206185567,
+      "loss": 0.6803,
+      "step": 1142
+    },
+    {
+      "epoch": 0.73152,
+      "grad_norm": 0.32924070954322815,
+      "learning_rate": 0.00015909278350515464,
+      "loss": 0.7063,
+      "step": 1143
+    },
+    {
+      "epoch": 0.73216,
+      "grad_norm": 0.32752180099487305,
+      "learning_rate": 0.0001590515463917526,
+      "loss": 0.6466,
+      "step": 1144
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3205970227718353,
+      "learning_rate": 0.00015901030927835053,
+      "loss": 0.6894,
+      "step": 1145
+    },
+    {
+      "epoch": 0.73344,
+      "grad_norm": 0.3504909873008728,
+      "learning_rate": 0.00015896907216494846,
+      "loss": 0.6023,
+      "step": 1146
+    },
+    {
+      "epoch": 0.73408,
+      "grad_norm": 0.35501784086227417,
+      "learning_rate": 0.00015892783505154642,
+      "loss": 0.6941,
+      "step": 1147
+    },
+    {
+      "epoch": 0.73472,
+      "grad_norm": 0.3443588316440582,
+      "learning_rate": 0.00015888659793814432,
+      "loss": 0.6583,
+      "step": 1148
+    },
+    {
+      "epoch": 0.73536,
+      "grad_norm": 0.33824223279953003,
+      "learning_rate": 0.00015884536082474228,
+      "loss": 0.6559,
+      "step": 1149
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.34292182326316833,
+      "learning_rate": 0.0001588041237113402,
+      "loss": 0.707,
+      "step": 1150
+    },
+    {
+      "epoch": 0.73664,
+      "grad_norm": 0.34511563181877136,
+      "learning_rate": 0.00015876288659793814,
+      "loss": 0.7009,
+      "step": 1151
+    },
+    {
+      "epoch": 0.73728,
+      "grad_norm": 0.3116765022277832,
+      "learning_rate": 0.0001587216494845361,
+      "loss": 0.7231,
+      "step": 1152
+    },
+    {
+      "epoch": 0.73792,
+      "grad_norm": 0.300601065158844,
+      "learning_rate": 0.00015868041237113403,
+      "loss": 0.7189,
+      "step": 1153
+    },
+    {
+      "epoch": 0.73856,
+      "grad_norm": 0.342246949672699,
+      "learning_rate": 0.000158639175257732,
+      "loss": 0.728,
+      "step": 1154
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3085081875324249,
+      "learning_rate": 0.0001585979381443299,
+      "loss": 0.8356,
+      "step": 1155
+    },
+    {
+      "epoch": 0.73984,
+      "grad_norm": 0.3277675211429596,
+      "learning_rate": 0.00015855670103092782,
+      "loss": 0.6698,
+      "step": 1156
+    },
+    {
+      "epoch": 0.74048,
+      "grad_norm": 0.3427579998970032,
+      "learning_rate": 0.00015851546391752578,
+      "loss": 0.7424,
+      "step": 1157
+    },
+    {
+      "epoch": 0.74112,
+      "grad_norm": 0.32654234766960144,
+      "learning_rate": 0.00015847422680412371,
+      "loss": 0.7195,
+      "step": 1158
+    },
+    {
+      "epoch": 0.74176,
+      "grad_norm": 0.3283117413520813,
+      "learning_rate": 0.00015843298969072167,
+      "loss": 0.7169,
+      "step": 1159
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.34052881598472595,
+      "learning_rate": 0.0001583917525773196,
+      "loss": 0.708,
+      "step": 1160
+    },
+    {
+      "epoch": 0.74304,
+      "grad_norm": 0.3213071823120117,
+      "learning_rate": 0.00015835051546391754,
+      "loss": 0.8579,
+      "step": 1161
+    },
+    {
+      "epoch": 0.74368,
+      "grad_norm": 0.33129480481147766,
+      "learning_rate": 0.00015830927835051547,
+      "loss": 0.8045,
+      "step": 1162
+    },
+    {
+      "epoch": 0.74432,
+      "grad_norm": 0.3442804515361786,
+      "learning_rate": 0.0001582680412371134,
+      "loss": 0.7048,
+      "step": 1163
+    },
+    {
+      "epoch": 0.74496,
+      "grad_norm": 0.3604481816291809,
+      "learning_rate": 0.00015822680412371136,
+      "loss": 0.6648,
+      "step": 1164
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3192715048789978,
+      "learning_rate": 0.0001581855670103093,
+      "loss": 0.649,
+      "step": 1165
+    },
+    {
+      "epoch": 0.74624,
+      "grad_norm": 0.31993117928504944,
+      "learning_rate": 0.00015814432989690722,
+      "loss": 0.755,
+      "step": 1166
+    },
+    {
+      "epoch": 0.74688,
+      "grad_norm": 0.33207136392593384,
+      "learning_rate": 0.00015810309278350518,
+      "loss": 0.5883,
+      "step": 1167
+    },
+    {
+      "epoch": 0.74752,
+      "grad_norm": 0.3333304226398468,
+      "learning_rate": 0.0001580618556701031,
+      "loss": 0.7162,
+      "step": 1168
+    },
+    {
+      "epoch": 0.74816,
+      "grad_norm": 0.35774415731430054,
+      "learning_rate": 0.00015802061855670104,
+      "loss": 0.6844,
+      "step": 1169
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.35619139671325684,
+      "learning_rate": 0.00015797938144329897,
+      "loss": 0.6539,
+      "step": 1170
+    },
+    {
+      "epoch": 0.74944,
+      "grad_norm": 0.29124128818511963,
+      "learning_rate": 0.0001579381443298969,
+      "loss": 0.5777,
+      "step": 1171
+    },
+    {
+      "epoch": 0.75008,
+      "grad_norm": 0.35174891352653503,
+      "learning_rate": 0.00015789690721649486,
+      "loss": 0.6984,
+      "step": 1172
+    },
+    {
+      "epoch": 0.75072,
+      "grad_norm": 0.3309306502342224,
+      "learning_rate": 0.0001578556701030928,
+      "loss": 0.8359,
+      "step": 1173
+    },
+    {
+      "epoch": 0.75136,
+      "grad_norm": 0.3367566764354706,
+      "learning_rate": 0.00015781443298969072,
+      "loss": 0.7528,
+      "step": 1174
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3369824290275574,
+      "learning_rate": 0.00015777319587628868,
+      "loss": 0.6845,
+      "step": 1175
+    },
+    {
+      "epoch": 0.75264,
+      "grad_norm": 0.3255024254322052,
+      "learning_rate": 0.0001577319587628866,
+      "loss": 0.6784,
+      "step": 1176
+    },
+    {
+      "epoch": 0.75328,
+      "grad_norm": 0.3489122688770294,
+      "learning_rate": 0.00015769072164948454,
+      "loss": 0.6216,
+      "step": 1177
+    },
+    {
+      "epoch": 0.75392,
+      "grad_norm": 0.34151795506477356,
+      "learning_rate": 0.00015764948453608248,
+      "loss": 0.7726,
+      "step": 1178
+    },
+    {
+      "epoch": 0.75456,
+      "grad_norm": 0.3039149343967438,
+      "learning_rate": 0.0001576082474226804,
+      "loss": 0.6658,
+      "step": 1179
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3411910831928253,
+      "learning_rate": 0.00015756701030927836,
+      "loss": 0.6425,
+      "step": 1180
+    },
+    {
+      "epoch": 0.75584,
+      "grad_norm": 0.3341788649559021,
+      "learning_rate": 0.0001575257731958763,
+      "loss": 0.7353,
+      "step": 1181
+    },
+    {
+      "epoch": 0.75648,
+      "grad_norm": 0.325399786233902,
+      "learning_rate": 0.00015748453608247425,
+      "loss": 0.712,
+      "step": 1182
+    },
+    {
+      "epoch": 0.75712,
+      "grad_norm": 0.3240582048892975,
+      "learning_rate": 0.00015744329896907219,
+      "loss": 0.7259,
+      "step": 1183
+    },
+    {
+      "epoch": 0.75776,
+      "grad_norm": 0.30633383989334106,
+      "learning_rate": 0.0001574020618556701,
+      "loss": 0.6759,
+      "step": 1184
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.31575968861579895,
+      "learning_rate": 0.00015736082474226805,
+      "loss": 0.7147,
+      "step": 1185
+    },
+    {
+      "epoch": 0.75904,
+      "grad_norm": 0.3075871765613556,
+      "learning_rate": 0.00015731958762886598,
+      "loss": 0.6546,
+      "step": 1186
+    },
+    {
+      "epoch": 0.75968,
+      "grad_norm": 0.3417414724826813,
+      "learning_rate": 0.00015727835051546394,
+      "loss": 0.7958,
+      "step": 1187
+    },
+    {
+      "epoch": 0.76032,
+      "grad_norm": 0.34046420454978943,
+      "learning_rate": 0.00015723711340206187,
+      "loss": 0.6942,
+      "step": 1188
+    },
+    {
+      "epoch": 0.76096,
+      "grad_norm": 0.32114773988723755,
+      "learning_rate": 0.0001571958762886598,
+      "loss": 0.7141,
+      "step": 1189
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3468434512615204,
+      "learning_rate": 0.00015715463917525773,
+      "loss": 0.7171,
+      "step": 1190
+    },
+    {
+      "epoch": 0.76224,
+      "grad_norm": 0.3526851534843445,
+      "learning_rate": 0.00015711340206185566,
+      "loss": 0.7194,
+      "step": 1191
+    },
+    {
+      "epoch": 0.76288,
+      "grad_norm": 0.3432551920413971,
+      "learning_rate": 0.00015707216494845362,
+      "loss": 0.801,
+      "step": 1192
+    },
+    {
+      "epoch": 0.76352,
+      "grad_norm": 0.3205793797969818,
+      "learning_rate": 0.00015703092783505155,
+      "loss": 0.5951,
+      "step": 1193
+    },
+    {
+      "epoch": 0.76416,
+      "grad_norm": 0.3824326694011688,
+      "learning_rate": 0.00015698969072164948,
+      "loss": 0.6591,
+      "step": 1194
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3394090235233307,
+      "learning_rate": 0.00015694845360824744,
+      "loss": 0.566,
+      "step": 1195
+    },
+    {
+      "epoch": 0.76544,
+      "grad_norm": 0.3150940537452698,
+      "learning_rate": 0.00015690721649484537,
+      "loss": 0.6772,
+      "step": 1196
+    },
+    {
+      "epoch": 0.76608,
+      "grad_norm": 0.31789594888687134,
+      "learning_rate": 0.0001568659793814433,
+      "loss": 0.6516,
+      "step": 1197
+    },
+    {
+      "epoch": 0.76672,
+      "grad_norm": 0.30661118030548096,
+      "learning_rate": 0.00015682474226804124,
+      "loss": 0.7465,
+      "step": 1198
+    },
+    {
+      "epoch": 0.76736,
+      "grad_norm": 0.3528458774089813,
+      "learning_rate": 0.00015678350515463917,
+      "loss": 0.7764,
+      "step": 1199
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.28390195965766907,
+      "learning_rate": 0.00015674226804123713,
+      "loss": 0.6395,
+      "step": 1200
+    },
+    {
+      "epoch": 0.76864,
+      "grad_norm": 0.3530828058719635,
+      "learning_rate": 0.00015670103092783506,
+      "loss": 0.7229,
+      "step": 1201
+    },
+    {
+      "epoch": 0.76928,
+      "grad_norm": 0.3125981092453003,
+      "learning_rate": 0.000156659793814433,
+      "loss": 0.5549,
+      "step": 1202
+    },
+    {
+      "epoch": 0.76992,
+      "grad_norm": 0.3444112539291382,
+      "learning_rate": 0.00015661855670103095,
+      "loss": 0.7073,
+      "step": 1203
+    },
+    {
+      "epoch": 0.77056,
+      "grad_norm": 0.3122750520706177,
+      "learning_rate": 0.00015657731958762888,
+      "loss": 0.7765,
+      "step": 1204
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3220624029636383,
+      "learning_rate": 0.0001565360824742268,
+      "loss": 0.588,
+      "step": 1205
+    },
+    {
+      "epoch": 0.77184,
+      "grad_norm": 0.31668856739997864,
+      "learning_rate": 0.00015649484536082474,
+      "loss": 0.6407,
+      "step": 1206
+    },
+    {
+      "epoch": 0.77248,
+      "grad_norm": 0.3321409523487091,
+      "learning_rate": 0.00015645360824742267,
+      "loss": 0.7278,
+      "step": 1207
+    },
+    {
+      "epoch": 0.77312,
+      "grad_norm": 0.34889960289001465,
+      "learning_rate": 0.00015641237113402063,
+      "loss": 0.7275,
+      "step": 1208
+    },
+    {
+      "epoch": 0.77376,
+      "grad_norm": 0.32358673214912415,
+      "learning_rate": 0.00015637113402061856,
+      "loss": 0.6778,
+      "step": 1209
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3390035927295685,
+      "learning_rate": 0.00015632989690721652,
+      "loss": 0.6715,
+      "step": 1210
+    },
+    {
+      "epoch": 0.77504,
+      "grad_norm": 0.3396976590156555,
+      "learning_rate": 0.00015628865979381445,
+      "loss": 0.6658,
+      "step": 1211
+    },
+    {
+      "epoch": 0.77568,
+      "grad_norm": 0.31868866086006165,
+      "learning_rate": 0.00015624742268041238,
+      "loss": 0.7042,
+      "step": 1212
+    },
+    {
+      "epoch": 0.77632,
+      "grad_norm": 0.32745471596717834,
+      "learning_rate": 0.00015620618556701031,
+      "loss": 0.7175,
+      "step": 1213
+    },
+    {
+      "epoch": 0.77696,
+      "grad_norm": 0.3370881676673889,
+      "learning_rate": 0.00015616494845360824,
+      "loss": 0.7008,
+      "step": 1214
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3745781481266022,
+      "learning_rate": 0.0001561237113402062,
+      "loss": 0.6672,
+      "step": 1215
+    },
+    {
+      "epoch": 0.77824,
+      "grad_norm": 0.3616238534450531,
+      "learning_rate": 0.00015608247422680413,
+      "loss": 0.7123,
+      "step": 1216
+    },
+    {
+      "epoch": 0.77888,
+      "grad_norm": 0.34303146600723267,
+      "learning_rate": 0.00015604123711340207,
+      "loss": 0.7754,
+      "step": 1217
+    },
+    {
+      "epoch": 0.77952,
+      "grad_norm": 0.3371765613555908,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 0.701,
+      "step": 1218
+    },
+    {
+      "epoch": 0.78016,
+      "grad_norm": 0.3607301414012909,
+      "learning_rate": 0.00015595876288659796,
+      "loss": 0.7118,
+      "step": 1219
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.2974405884742737,
+      "learning_rate": 0.0001559175257731959,
+      "loss": 0.7023,
+      "step": 1220
+    },
+    {
+      "epoch": 0.78144,
+      "grad_norm": 0.33611080050468445,
+      "learning_rate": 0.00015587628865979382,
+      "loss": 0.8107,
+      "step": 1221
+    },
+    {
+      "epoch": 0.78208,
+      "grad_norm": 0.34894636273384094,
+      "learning_rate": 0.00015583505154639175,
+      "loss": 0.7945,
+      "step": 1222
+    },
+    {
+      "epoch": 0.78272,
+      "grad_norm": 0.32243672013282776,
+      "learning_rate": 0.0001557938144329897,
+      "loss": 0.7296,
+      "step": 1223
+    },
+    {
+      "epoch": 0.78336,
+      "grad_norm": 0.33610931038856506,
+      "learning_rate": 0.00015575257731958764,
+      "loss": 0.5785,
+      "step": 1224
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.355332612991333,
+      "learning_rate": 0.0001557113402061856,
+      "loss": 0.8286,
+      "step": 1225
+    },
+    {
+      "epoch": 0.78464,
+      "grad_norm": 0.3429398536682129,
+      "learning_rate": 0.0001556701030927835,
+      "loss": 0.7971,
+      "step": 1226
+    },
+    {
+      "epoch": 0.78528,
+      "grad_norm": 0.30152320861816406,
+      "learning_rate": 0.00015562886597938143,
+      "loss": 0.6758,
+      "step": 1227
+    },
+    {
+      "epoch": 0.78592,
+      "grad_norm": 0.3071718215942383,
+      "learning_rate": 0.0001555876288659794,
+      "loss": 0.618,
+      "step": 1228
+    },
+    {
+      "epoch": 0.78656,
+      "grad_norm": 0.3621586263179779,
+      "learning_rate": 0.00015554639175257732,
+      "loss": 0.6188,
+      "step": 1229
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3256191909313202,
+      "learning_rate": 0.00015550515463917525,
+      "loss": 0.7286,
+      "step": 1230
+    },
+    {
+      "epoch": 0.78784,
+      "grad_norm": 0.33490487933158875,
+      "learning_rate": 0.0001554639175257732,
+      "loss": 0.6898,
+      "step": 1231
+    },
+    {
+      "epoch": 0.78848,
+      "grad_norm": 0.3517960011959076,
+      "learning_rate": 0.00015542268041237114,
+      "loss": 0.7475,
+      "step": 1232
+    },
+    {
+      "epoch": 0.78912,
+      "grad_norm": 0.3338039219379425,
+      "learning_rate": 0.00015538144329896907,
+      "loss": 0.7257,
+      "step": 1233
+    },
+    {
+      "epoch": 0.78976,
+      "grad_norm": 0.3427311182022095,
+      "learning_rate": 0.000155340206185567,
+      "loss": 0.7267,
+      "step": 1234
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3468652069568634,
+      "learning_rate": 0.00015529896907216494,
+      "loss": 0.6463,
+      "step": 1235
+    },
+    {
+      "epoch": 0.79104,
+      "grad_norm": 0.35140207409858704,
+      "learning_rate": 0.0001552577319587629,
+      "loss": 0.7349,
+      "step": 1236
+    },
+    {
+      "epoch": 0.79168,
+      "grad_norm": 0.33990204334259033,
+      "learning_rate": 0.00015521649484536083,
+      "loss": 0.6885,
+      "step": 1237
+    },
+    {
+      "epoch": 0.79232,
+      "grad_norm": 0.3166777193546295,
+      "learning_rate": 0.00015517525773195879,
+      "loss": 0.7543,
+      "step": 1238
+    },
+    {
+      "epoch": 0.79296,
+      "grad_norm": 0.3637222945690155,
+      "learning_rate": 0.00015513402061855672,
+      "loss": 0.6794,
+      "step": 1239
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.33889108896255493,
+      "learning_rate": 0.00015509278350515465,
+      "loss": 0.7471,
+      "step": 1240
+    },
+    {
+      "epoch": 0.79424,
+      "grad_norm": 0.36119160056114197,
+      "learning_rate": 0.00015505154639175258,
+      "loss": 0.7139,
+      "step": 1241
+    },
+    {
+      "epoch": 0.79488,
+      "grad_norm": 0.343185156583786,
+      "learning_rate": 0.0001550103092783505,
+      "loss": 0.6431,
+      "step": 1242
+    },
+    {
+      "epoch": 0.79552,
+      "grad_norm": 0.34966182708740234,
+      "learning_rate": 0.00015496907216494847,
+      "loss": 0.613,
+      "step": 1243
+    },
+    {
+      "epoch": 0.79616,
+      "grad_norm": 0.35317862033843994,
+      "learning_rate": 0.0001549278350515464,
+      "loss": 0.6098,
+      "step": 1244
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3372093439102173,
+      "learning_rate": 0.00015488659793814433,
+      "loss": 0.6748,
+      "step": 1245
+    },
+    {
+      "epoch": 0.79744,
+      "grad_norm": 0.32267236709594727,
+      "learning_rate": 0.0001548453608247423,
+      "loss": 0.7122,
+      "step": 1246
+    },
+    {
+      "epoch": 0.79808,
+      "grad_norm": 0.3571048974990845,
+      "learning_rate": 0.00015480412371134022,
+      "loss": 0.7168,
+      "step": 1247
+    },
+    {
+      "epoch": 0.79872,
+      "grad_norm": 0.3830585777759552,
+      "learning_rate": 0.00015476288659793815,
+      "loss": 0.7252,
+      "step": 1248
+    },
+    {
+      "epoch": 0.79936,
+      "grad_norm": 0.3394557535648346,
+      "learning_rate": 0.00015472164948453608,
+      "loss": 0.7278,
+      "step": 1249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3088328242301941,
+      "learning_rate": 0.00015468041237113401,
+      "loss": 0.6761,
+      "step": 1250
+    },
+    {
+      "epoch": 0.80064,
+      "grad_norm": 0.341014564037323,
+      "learning_rate": 0.00015463917525773197,
+      "loss": 0.6595,
+      "step": 1251
+    },
+    {
+      "epoch": 0.80128,
+      "grad_norm": 0.32454901933670044,
+      "learning_rate": 0.0001545979381443299,
+      "loss": 0.6721,
+      "step": 1252
+    },
+    {
+      "epoch": 0.80192,
+      "grad_norm": 0.32738196849823,
+      "learning_rate": 0.00015455670103092786,
+      "loss": 0.7115,
+      "step": 1253
+    },
+    {
+      "epoch": 0.80256,
+      "grad_norm": 0.3569195866584778,
+      "learning_rate": 0.0001545154639175258,
+      "loss": 0.6786,
+      "step": 1254
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.34522199630737305,
+      "learning_rate": 0.0001544742268041237,
+      "loss": 0.6386,
+      "step": 1255
+    },
+    {
+      "epoch": 0.80384,
+      "grad_norm": 0.3232608437538147,
+      "learning_rate": 0.00015443298969072166,
+      "loss": 0.6268,
+      "step": 1256
+    },
+    {
+      "epoch": 0.80448,
+      "grad_norm": 0.32131457328796387,
+      "learning_rate": 0.0001543917525773196,
+      "loss": 0.6339,
+      "step": 1257
+    },
+    {
+      "epoch": 0.80512,
+      "grad_norm": 0.33605554699897766,
+      "learning_rate": 0.00015435051546391752,
+      "loss": 0.6948,
+      "step": 1258
+    },
+    {
+      "epoch": 0.80576,
+      "grad_norm": 0.35704505443573,
+      "learning_rate": 0.00015430927835051548,
+      "loss": 0.6756,
+      "step": 1259
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.35005655884742737,
+      "learning_rate": 0.0001542680412371134,
+      "loss": 0.693,
+      "step": 1260
+    },
+    {
+      "epoch": 0.80704,
+      "grad_norm": 0.3098815083503723,
+      "learning_rate": 0.00015422680412371137,
+      "loss": 0.6607,
+      "step": 1261
+    },
+    {
+      "epoch": 0.80768,
+      "grad_norm": 0.3409726321697235,
+      "learning_rate": 0.00015418556701030927,
+      "loss": 0.6068,
+      "step": 1262
+    },
+    {
+      "epoch": 0.80832,
+      "grad_norm": 0.3146200180053711,
+      "learning_rate": 0.0001541443298969072,
+      "loss": 0.6604,
+      "step": 1263
+    },
+    {
+      "epoch": 0.80896,
+      "grad_norm": 0.30577483773231506,
+      "learning_rate": 0.00015410309278350516,
+      "loss": 0.5952,
+      "step": 1264
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3423594534397125,
+      "learning_rate": 0.0001540618556701031,
+      "loss": 0.7195,
+      "step": 1265
+    },
+    {
+      "epoch": 0.81024,
+      "grad_norm": 0.3428061008453369,
+      "learning_rate": 0.00015402061855670105,
+      "loss": 0.6294,
+      "step": 1266
+    },
+    {
+      "epoch": 0.81088,
+      "grad_norm": 0.33427393436431885,
+      "learning_rate": 0.00015397938144329898,
+      "loss": 0.6692,
+      "step": 1267
+    },
+    {
+      "epoch": 0.81152,
+      "grad_norm": 0.3287568688392639,
+      "learning_rate": 0.0001539381443298969,
+      "loss": 0.6969,
+      "step": 1268
+    },
+    {
+      "epoch": 0.81216,
+      "grad_norm": 0.30522775650024414,
+      "learning_rate": 0.00015389690721649484,
+      "loss": 0.5871,
+      "step": 1269
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3650021553039551,
+      "learning_rate": 0.00015385567010309278,
+      "loss": 0.6975,
+      "step": 1270
+    },
+    {
+      "epoch": 0.81344,
+      "grad_norm": 0.3321733772754669,
+      "learning_rate": 0.00015381443298969073,
+      "loss": 0.7732,
+      "step": 1271
+    },
+    {
+      "epoch": 0.81408,
+      "grad_norm": 0.3221074342727661,
+      "learning_rate": 0.00015377319587628867,
+      "loss": 0.7357,
+      "step": 1272
+    },
+    {
+      "epoch": 0.81472,
+      "grad_norm": 0.3592389225959778,
+      "learning_rate": 0.0001537319587628866,
+      "loss": 0.6015,
+      "step": 1273
+    },
+    {
+      "epoch": 0.81536,
+      "grad_norm": 0.3137587606906891,
+      "learning_rate": 0.00015369072164948456,
+      "loss": 0.6997,
+      "step": 1274
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.33504366874694824,
+      "learning_rate": 0.00015364948453608249,
+      "loss": 0.7695,
+      "step": 1275
+    },
+    {
+      "epoch": 0.81664,
+      "grad_norm": 0.34564894437789917,
+      "learning_rate": 0.00015360824742268042,
+      "loss": 0.6743,
+      "step": 1276
+    },
+    {
+      "epoch": 0.81728,
+      "grad_norm": 0.3643783628940582,
+      "learning_rate": 0.00015356701030927835,
+      "loss": 0.7048,
+      "step": 1277
+    },
+    {
+      "epoch": 0.81792,
+      "grad_norm": 0.3302707076072693,
+      "learning_rate": 0.00015352577319587628,
+      "loss": 0.7178,
+      "step": 1278
+    },
+    {
+      "epoch": 0.81856,
+      "grad_norm": 0.33454573154449463,
+      "learning_rate": 0.00015348453608247424,
+      "loss": 0.7421,
+      "step": 1279
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.38336557149887085,
+      "learning_rate": 0.00015344329896907217,
+      "loss": 0.647,
+      "step": 1280
+    },
+    {
+      "epoch": 0.81984,
+      "grad_norm": 0.34019625186920166,
+      "learning_rate": 0.00015340206185567013,
+      "loss": 0.6481,
+      "step": 1281
+    },
+    {
+      "epoch": 0.82048,
+      "grad_norm": 0.36370283365249634,
+      "learning_rate": 0.00015336082474226806,
+      "loss": 0.7126,
+      "step": 1282
+    },
+    {
+      "epoch": 0.82112,
+      "grad_norm": 0.33676454424858093,
+      "learning_rate": 0.000153319587628866,
+      "loss": 0.5802,
+      "step": 1283
+    },
+    {
+      "epoch": 0.82176,
+      "grad_norm": 0.35506629943847656,
+      "learning_rate": 0.00015327835051546392,
+      "loss": 0.6485,
+      "step": 1284
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3331254720687866,
+      "learning_rate": 0.00015323711340206185,
+      "loss": 0.6071,
+      "step": 1285
+    },
+    {
+      "epoch": 0.82304,
+      "grad_norm": 0.3069826364517212,
+      "learning_rate": 0.0001531958762886598,
+      "loss": 0.6447,
+      "step": 1286
+    },
+    {
+      "epoch": 0.82368,
+      "grad_norm": 0.31817129254341125,
+      "learning_rate": 0.00015315463917525774,
+      "loss": 0.7145,
+      "step": 1287
+    },
+    {
+      "epoch": 0.82432,
+      "grad_norm": 0.3250603973865509,
+      "learning_rate": 0.00015311340206185567,
+      "loss": 0.8468,
+      "step": 1288
+    },
+    {
+      "epoch": 0.82496,
+      "grad_norm": 0.34493181109428406,
+      "learning_rate": 0.00015307216494845363,
+      "loss": 0.7208,
+      "step": 1289
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.35121843218803406,
+      "learning_rate": 0.00015303092783505156,
+      "loss": 0.6381,
+      "step": 1290
+    },
+    {
+      "epoch": 0.82624,
+      "grad_norm": 0.35016465187072754,
+      "learning_rate": 0.00015298969072164947,
+      "loss": 0.7521,
+      "step": 1291
+    },
+    {
+      "epoch": 0.82688,
+      "grad_norm": 0.3205499053001404,
+      "learning_rate": 0.00015294845360824743,
+      "loss": 0.5734,
+      "step": 1292
+    },
+    {
+      "epoch": 0.82752,
+      "grad_norm": 0.3265836834907532,
+      "learning_rate": 0.00015290721649484536,
+      "loss": 0.7054,
+      "step": 1293
+    },
+    {
+      "epoch": 0.82816,
+      "grad_norm": 0.3408982753753662,
+      "learning_rate": 0.00015286597938144332,
+      "loss": 0.6329,
+      "step": 1294
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3369613289833069,
+      "learning_rate": 0.00015282474226804125,
+      "loss": 0.6356,
+      "step": 1295
+    },
+    {
+      "epoch": 0.82944,
+      "grad_norm": 0.32483482360839844,
+      "learning_rate": 0.00015278350515463918,
+      "loss": 0.6851,
+      "step": 1296
+    },
+    {
+      "epoch": 0.83008,
+      "grad_norm": 0.35428106784820557,
+      "learning_rate": 0.00015274226804123714,
+      "loss": 0.7548,
+      "step": 1297
+    },
+    {
+      "epoch": 0.83072,
+      "grad_norm": 0.3151598572731018,
+      "learning_rate": 0.00015270103092783504,
+      "loss": 0.6278,
+      "step": 1298
+    },
+    {
+      "epoch": 0.83136,
+      "grad_norm": 0.3077206313610077,
+      "learning_rate": 0.000152659793814433,
+      "loss": 0.5626,
+      "step": 1299
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3161257207393646,
+      "learning_rate": 0.00015261855670103093,
+      "loss": 0.6943,
+      "step": 1300
+    },
+    {
+      "epoch": 0.83264,
+      "grad_norm": 0.3148772418498993,
+      "learning_rate": 0.00015257731958762886,
+      "loss": 0.7124,
+      "step": 1301
+    },
+    {
+      "epoch": 0.83328,
+      "grad_norm": 0.33965301513671875,
+      "learning_rate": 0.00015253608247422682,
+      "loss": 0.5917,
+      "step": 1302
+    },
+    {
+      "epoch": 0.83392,
+      "grad_norm": 0.31518319249153137,
+      "learning_rate": 0.00015249484536082475,
+      "loss": 0.7095,
+      "step": 1303
+    },
+    {
+      "epoch": 0.83456,
+      "grad_norm": 0.3609430491924286,
+      "learning_rate": 0.0001524536082474227,
+      "loss": 0.7086,
+      "step": 1304
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.345872163772583,
+      "learning_rate": 0.00015241237113402061,
+      "loss": 0.8028,
+      "step": 1305
+    },
+    {
+      "epoch": 0.83584,
+      "grad_norm": 0.3626687228679657,
+      "learning_rate": 0.00015237113402061855,
+      "loss": 0.6658,
+      "step": 1306
+    },
+    {
+      "epoch": 0.83648,
+      "grad_norm": 0.335816353559494,
+      "learning_rate": 0.0001523298969072165,
+      "loss": 0.7369,
+      "step": 1307
+    },
+    {
+      "epoch": 0.83712,
+      "grad_norm": 0.34543803334236145,
+      "learning_rate": 0.00015228865979381444,
+      "loss": 0.6675,
+      "step": 1308
+    },
+    {
+      "epoch": 0.83776,
+      "grad_norm": 0.3549405336380005,
+      "learning_rate": 0.0001522474226804124,
+      "loss": 0.6618,
+      "step": 1309
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.35000523924827576,
+      "learning_rate": 0.00015220618556701032,
+      "loss": 0.698,
+      "step": 1310
+    },
+    {
+      "epoch": 0.83904,
+      "grad_norm": 0.31434980034828186,
+      "learning_rate": 0.00015216494845360826,
+      "loss": 0.6845,
+      "step": 1311
+    },
+    {
+      "epoch": 0.83968,
+      "grad_norm": 0.35832491517066956,
+      "learning_rate": 0.0001521237113402062,
+      "loss": 0.7112,
+      "step": 1312
+    },
+    {
+      "epoch": 0.84032,
+      "grad_norm": 0.31477558612823486,
+      "learning_rate": 0.00015208247422680412,
+      "loss": 0.6842,
+      "step": 1313
+    },
+    {
+      "epoch": 0.84096,
+      "grad_norm": 0.3472610414028168,
+      "learning_rate": 0.00015204123711340208,
+      "loss": 0.767,
+      "step": 1314
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.33760467171669006,
+      "learning_rate": 0.000152,
+      "loss": 0.664,
+      "step": 1315
+    },
+    {
+      "epoch": 0.84224,
+      "grad_norm": 0.31780970096588135,
+      "learning_rate": 0.00015195876288659794,
+      "loss": 0.6731,
+      "step": 1316
+    },
+    {
+      "epoch": 0.84288,
+      "grad_norm": 0.3356291949748993,
+      "learning_rate": 0.0001519175257731959,
+      "loss": 0.7355,
+      "step": 1317
+    },
+    {
+      "epoch": 0.84352,
+      "grad_norm": 0.34692996740341187,
+      "learning_rate": 0.00015187628865979383,
+      "loss": 0.7219,
+      "step": 1318
+    },
+    {
+      "epoch": 0.84416,
+      "grad_norm": 0.3282603323459625,
+      "learning_rate": 0.00015183505154639176,
+      "loss": 0.5774,
+      "step": 1319
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.35520240664482117,
+      "learning_rate": 0.0001517938144329897,
+      "loss": 0.7591,
+      "step": 1320
+    },
+    {
+      "epoch": 0.84544,
+      "grad_norm": 0.3423345386981964,
+      "learning_rate": 0.00015175257731958762,
+      "loss": 0.7019,
+      "step": 1321
+    },
+    {
+      "epoch": 0.84608,
+      "grad_norm": 0.3333418667316437,
+      "learning_rate": 0.00015171134020618558,
+      "loss": 0.6563,
+      "step": 1322
+    },
+    {
+      "epoch": 0.84672,
+      "grad_norm": 0.3541411757469177,
+      "learning_rate": 0.0001516701030927835,
+      "loss": 0.7872,
+      "step": 1323
+    },
+    {
+      "epoch": 0.84736,
+      "grad_norm": 0.34967565536499023,
+      "learning_rate": 0.00015162886597938144,
+      "loss": 0.679,
+      "step": 1324
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3362700343132019,
+      "learning_rate": 0.0001515876288659794,
+      "loss": 0.7039,
+      "step": 1325
+    },
+    {
+      "epoch": 0.84864,
+      "grad_norm": 0.31964215636253357,
+      "learning_rate": 0.00015154639175257733,
+      "loss": 0.659,
+      "step": 1326
+    },
+    {
+      "epoch": 0.84928,
+      "grad_norm": 0.33980798721313477,
+      "learning_rate": 0.00015150515463917526,
+      "loss": 0.7693,
+      "step": 1327
+    },
+    {
+      "epoch": 0.84992,
+      "grad_norm": 0.32150471210479736,
+      "learning_rate": 0.0001514639175257732,
+      "loss": 0.6103,
+      "step": 1328
+    },
+    {
+      "epoch": 0.85056,
+      "grad_norm": 0.3334615230560303,
+      "learning_rate": 0.00015142268041237113,
+      "loss": 0.6614,
+      "step": 1329
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3384806215763092,
+      "learning_rate": 0.00015138144329896909,
+      "loss": 0.7253,
+      "step": 1330
+    },
+    {
+      "epoch": 0.85184,
+      "grad_norm": 0.3273143768310547,
+      "learning_rate": 0.00015134020618556702,
+      "loss": 0.7423,
+      "step": 1331
+    },
+    {
+      "epoch": 0.85248,
+      "grad_norm": 0.37515416741371155,
+      "learning_rate": 0.00015129896907216498,
+      "loss": 0.7679,
+      "step": 1332
+    },
+    {
+      "epoch": 0.85312,
+      "grad_norm": 0.3597499430179596,
+      "learning_rate": 0.0001512577319587629,
+      "loss": 0.6895,
+      "step": 1333
+    },
+    {
+      "epoch": 0.85376,
+      "grad_norm": 0.4030213952064514,
+      "learning_rate": 0.0001512164948453608,
+      "loss": 0.7709,
+      "step": 1334
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3496096134185791,
+      "learning_rate": 0.00015117525773195877,
+      "loss": 0.6554,
+      "step": 1335
+    },
+    {
+      "epoch": 0.85504,
+      "grad_norm": 0.35449036955833435,
+      "learning_rate": 0.0001511340206185567,
+      "loss": 0.687,
+      "step": 1336
+    },
+    {
+      "epoch": 0.85568,
+      "grad_norm": 0.3157009780406952,
+      "learning_rate": 0.00015109278350515466,
+      "loss": 0.6022,
+      "step": 1337
+    },
+    {
+      "epoch": 0.85632,
+      "grad_norm": 0.3934071958065033,
+      "learning_rate": 0.0001510515463917526,
+      "loss": 0.7945,
+      "step": 1338
+    },
+    {
+      "epoch": 0.85696,
+      "grad_norm": 0.31729912757873535,
+      "learning_rate": 0.00015101030927835052,
+      "loss": 0.6495,
+      "step": 1339
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3164269030094147,
+      "learning_rate": 0.00015096907216494845,
+      "loss": 0.6136,
+      "step": 1340
+    },
+    {
+      "epoch": 0.85824,
+      "grad_norm": 0.34435635805130005,
+      "learning_rate": 0.00015092783505154638,
+      "loss": 0.6914,
+      "step": 1341
+    },
+    {
+      "epoch": 0.85888,
+      "grad_norm": 0.3308452367782593,
+      "learning_rate": 0.00015088659793814434,
+      "loss": 0.5995,
+      "step": 1342
+    },
+    {
+      "epoch": 0.85952,
+      "grad_norm": 0.31508561968803406,
+      "learning_rate": 0.00015084536082474227,
+      "loss": 0.7071,
+      "step": 1343
+    },
+    {
+      "epoch": 0.86016,
+      "grad_norm": 0.32383549213409424,
+      "learning_rate": 0.0001508041237113402,
+      "loss": 0.7333,
+      "step": 1344
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3585171103477478,
+      "learning_rate": 0.00015076288659793816,
+      "loss": 0.6451,
+      "step": 1345
+    },
+    {
+      "epoch": 0.86144,
+      "grad_norm": 0.340991735458374,
+      "learning_rate": 0.0001507216494845361,
+      "loss": 0.7194,
+      "step": 1346
+    },
+    {
+      "epoch": 0.86208,
+      "grad_norm": 0.3153185248374939,
+      "learning_rate": 0.00015068041237113403,
+      "loss": 0.775,
+      "step": 1347
+    },
+    {
+      "epoch": 0.86272,
+      "grad_norm": 0.3068450093269348,
+      "learning_rate": 0.00015063917525773196,
+      "loss": 0.5745,
+      "step": 1348
+    },
+    {
+      "epoch": 0.86336,
+      "grad_norm": 0.33111587166786194,
+      "learning_rate": 0.0001505979381443299,
+      "loss": 0.8119,
+      "step": 1349
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.32656314969062805,
+      "learning_rate": 0.00015055670103092785,
+      "loss": 0.7342,
+      "step": 1350
+    },
+    {
+      "epoch": 0.86464,
+      "grad_norm": 0.3177049458026886,
+      "learning_rate": 0.00015051546391752578,
+      "loss": 0.7826,
+      "step": 1351
+    },
+    {
+      "epoch": 0.86528,
+      "grad_norm": 0.33048197627067566,
+      "learning_rate": 0.0001504742268041237,
+      "loss": 0.6289,
+      "step": 1352
+    },
+    {
+      "epoch": 0.86592,
+      "grad_norm": 0.33547744154930115,
+      "learning_rate": 0.00015043298969072167,
+      "loss": 0.6796,
+      "step": 1353
+    },
+    {
+      "epoch": 0.86656,
+      "grad_norm": 0.30461347103118896,
+      "learning_rate": 0.0001503917525773196,
+      "loss": 0.6702,
+      "step": 1354
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.31267422437667847,
+      "learning_rate": 0.00015035051546391753,
+      "loss": 0.6743,
+      "step": 1355
+    },
+    {
+      "epoch": 0.86784,
+      "grad_norm": 0.37094438076019287,
+      "learning_rate": 0.00015030927835051546,
+      "loss": 0.7085,
+      "step": 1356
+    },
+    {
+      "epoch": 0.86848,
+      "grad_norm": 0.3691127598285675,
+      "learning_rate": 0.0001502680412371134,
+      "loss": 0.7439,
+      "step": 1357
+    },
+    {
+      "epoch": 0.86912,
+      "grad_norm": 0.32903164625167847,
+      "learning_rate": 0.00015022680412371135,
+      "loss": 0.7332,
+      "step": 1358
+    },
+    {
+      "epoch": 0.86976,
+      "grad_norm": 0.35645338892936707,
+      "learning_rate": 0.00015018556701030928,
+      "loss": 0.7469,
+      "step": 1359
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3341512382030487,
+      "learning_rate": 0.00015014432989690724,
+      "loss": 0.6653,
+      "step": 1360
+    },
+    {
+      "epoch": 0.87104,
+      "grad_norm": 0.33617594838142395,
+      "learning_rate": 0.00015010309278350517,
+      "loss": 0.6331,
+      "step": 1361
+    },
+    {
+      "epoch": 0.87168,
+      "grad_norm": 0.3406195044517517,
+      "learning_rate": 0.0001500618556701031,
+      "loss": 0.6222,
+      "step": 1362
+    },
+    {
+      "epoch": 0.87232,
+      "grad_norm": 0.3233450949192047,
+      "learning_rate": 0.00015002061855670103,
+      "loss": 0.7018,
+      "step": 1363
+    },
+    {
+      "epoch": 0.87296,
+      "grad_norm": 0.3225351572036743,
+      "learning_rate": 0.00014997938144329897,
+      "loss": 0.6305,
+      "step": 1364
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.327679842710495,
+      "learning_rate": 0.00014993814432989692,
+      "loss": 0.7385,
+      "step": 1365
+    },
+    {
+      "epoch": 0.87424,
+      "grad_norm": 0.3171643614768982,
+      "learning_rate": 0.00014989690721649486,
+      "loss": 0.7524,
+      "step": 1366
+    },
+    {
+      "epoch": 0.87488,
+      "grad_norm": 0.3197685480117798,
+      "learning_rate": 0.0001498556701030928,
+      "loss": 0.6118,
+      "step": 1367
+    },
+    {
+      "epoch": 0.87552,
+      "grad_norm": 0.355956107378006,
+      "learning_rate": 0.00014981443298969075,
+      "loss": 0.6588,
+      "step": 1368
+    },
+    {
+      "epoch": 0.87616,
+      "grad_norm": 0.3711245656013489,
+      "learning_rate": 0.00014977319587628868,
+      "loss": 0.7513,
+      "step": 1369
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.344751238822937,
+      "learning_rate": 0.0001497319587628866,
+      "loss": 0.7373,
+      "step": 1370
+    },
+    {
+      "epoch": 0.87744,
+      "grad_norm": 0.37006598711013794,
+      "learning_rate": 0.00014969072164948454,
+      "loss": 0.7385,
+      "step": 1371
+    },
+    {
+      "epoch": 0.87808,
+      "grad_norm": 0.34881147742271423,
+      "learning_rate": 0.00014964948453608247,
+      "loss": 0.8121,
+      "step": 1372
+    },
+    {
+      "epoch": 0.87872,
+      "grad_norm": 0.33570396900177,
+      "learning_rate": 0.00014960824742268043,
+      "loss": 0.7867,
+      "step": 1373
+    },
+    {
+      "epoch": 0.87936,
+      "grad_norm": 0.3271861970424652,
+      "learning_rate": 0.00014956701030927836,
+      "loss": 0.6926,
+      "step": 1374
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3356350362300873,
+      "learning_rate": 0.00014952577319587632,
+      "loss": 0.7701,
+      "step": 1375
+    },
+    {
+      "epoch": 0.88064,
+      "grad_norm": 0.344028502702713,
+      "learning_rate": 0.00014948453608247422,
+      "loss": 0.7646,
+      "step": 1376
+    },
+    {
+      "epoch": 0.88128,
+      "grad_norm": 0.34623923897743225,
+      "learning_rate": 0.00014944329896907215,
+      "loss": 0.7408,
+      "step": 1377
+    },
+    {
+      "epoch": 0.88192,
+      "grad_norm": 0.3717415928840637,
+      "learning_rate": 0.0001494020618556701,
+      "loss": 0.7161,
+      "step": 1378
+    },
+    {
+      "epoch": 0.88256,
+      "grad_norm": 0.3636385500431061,
+      "learning_rate": 0.00014936082474226804,
+      "loss": 0.7056,
+      "step": 1379
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.354122519493103,
+      "learning_rate": 0.00014931958762886597,
+      "loss": 0.5995,
+      "step": 1380
+    },
+    {
+      "epoch": 0.88384,
+      "grad_norm": 0.3554580509662628,
+      "learning_rate": 0.00014927835051546393,
+      "loss": 0.7153,
+      "step": 1381
+    },
+    {
+      "epoch": 0.88448,
+      "grad_norm": 0.36315762996673584,
+      "learning_rate": 0.00014923711340206186,
+      "loss": 0.8086,
+      "step": 1382
+    },
+    {
+      "epoch": 0.88512,
+      "grad_norm": 0.3279130756855011,
+      "learning_rate": 0.0001491958762886598,
+      "loss": 0.6946,
+      "step": 1383
+    },
+    {
+      "epoch": 0.88576,
+      "grad_norm": 0.3363383710384369,
+      "learning_rate": 0.00014915463917525773,
+      "loss": 0.6937,
+      "step": 1384
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.359523206949234,
+      "learning_rate": 0.00014911340206185566,
+      "loss": 0.675,
+      "step": 1385
+    },
+    {
+      "epoch": 0.88704,
+      "grad_norm": 0.32081571221351624,
+      "learning_rate": 0.00014907216494845362,
+      "loss": 0.6097,
+      "step": 1386
+    },
+    {
+      "epoch": 0.88768,
+      "grad_norm": 0.3369751572608948,
+      "learning_rate": 0.00014903092783505155,
+      "loss": 0.6011,
+      "step": 1387
+    },
+    {
+      "epoch": 0.88832,
+      "grad_norm": 0.3111063838005066,
+      "learning_rate": 0.0001489896907216495,
+      "loss": 0.6023,
+      "step": 1388
+    },
+    {
+      "epoch": 0.88896,
+      "grad_norm": 0.31701961159706116,
+      "learning_rate": 0.00014894845360824744,
+      "loss": 0.5472,
+      "step": 1389
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3474489152431488,
+      "learning_rate": 0.00014890721649484537,
+      "loss": 0.7134,
+      "step": 1390
+    },
+    {
+      "epoch": 0.89024,
+      "grad_norm": 0.36610841751098633,
+      "learning_rate": 0.0001488659793814433,
+      "loss": 0.7141,
+      "step": 1391
+    },
+    {
+      "epoch": 0.89088,
+      "grad_norm": 0.3990553021430969,
+      "learning_rate": 0.00014882474226804123,
+      "loss": 0.752,
+      "step": 1392
+    },
+    {
+      "epoch": 0.89152,
+      "grad_norm": 0.3637222647666931,
+      "learning_rate": 0.0001487835051546392,
+      "loss": 0.747,
+      "step": 1393
+    },
+    {
+      "epoch": 0.89216,
+      "grad_norm": 0.3554556369781494,
+      "learning_rate": 0.00014874226804123712,
+      "loss": 0.6925,
+      "step": 1394
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.35358330607414246,
+      "learning_rate": 0.00014870103092783505,
+      "loss": 0.752,
+      "step": 1395
+    },
+    {
+      "epoch": 0.89344,
+      "grad_norm": 0.38265809416770935,
+      "learning_rate": 0.000148659793814433,
+      "loss": 0.7462,
+      "step": 1396
+    },
+    {
+      "epoch": 0.89408,
+      "grad_norm": 0.33013471961021423,
+      "learning_rate": 0.00014861855670103094,
+      "loss": 0.663,
+      "step": 1397
+    },
+    {
+      "epoch": 0.89472,
+      "grad_norm": 0.3024407625198364,
+      "learning_rate": 0.00014857731958762887,
+      "loss": 0.7151,
+      "step": 1398
+    },
+    {
+      "epoch": 0.89536,
+      "grad_norm": 0.33110880851745605,
+      "learning_rate": 0.0001485360824742268,
+      "loss": 0.6614,
+      "step": 1399
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3051261305809021,
+      "learning_rate": 0.00014849484536082474,
+      "loss": 0.7265,
+      "step": 1400
+    },
+    {
+      "epoch": 0.89664,
+      "grad_norm": 0.3123985826969147,
+      "learning_rate": 0.0001484536082474227,
+      "loss": 0.6869,
+      "step": 1401
+    },
+    {
+      "epoch": 0.89728,
+      "grad_norm": 0.36997851729393005,
+      "learning_rate": 0.00014841237113402063,
+      "loss": 0.7789,
+      "step": 1402
+    },
+    {
+      "epoch": 0.89792,
+      "grad_norm": 0.36322706937789917,
+      "learning_rate": 0.00014837113402061858,
+      "loss": 0.7852,
+      "step": 1403
+    },
+    {
+      "epoch": 0.89856,
+      "grad_norm": 0.3465798795223236,
+      "learning_rate": 0.00014832989690721652,
+      "loss": 0.6503,
+      "step": 1404
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.32866960763931274,
+      "learning_rate": 0.00014828865979381442,
+      "loss": 0.6214,
+      "step": 1405
+    },
+    {
+      "epoch": 0.89984,
+      "grad_norm": 0.31216898560523987,
+      "learning_rate": 0.00014824742268041238,
+      "loss": 0.738,
+      "step": 1406
+    },
+    {
+      "epoch": 0.90048,
+      "grad_norm": 0.33529841899871826,
+      "learning_rate": 0.0001482061855670103,
+      "loss": 0.6818,
+      "step": 1407
+    },
+    {
+      "epoch": 0.90112,
+      "grad_norm": 0.32807934284210205,
+      "learning_rate": 0.00014816494845360827,
+      "loss": 0.5745,
+      "step": 1408
+    },
+    {
+      "epoch": 0.90176,
+      "grad_norm": 0.3438502252101898,
+      "learning_rate": 0.0001481237113402062,
+      "loss": 0.7076,
+      "step": 1409
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.35320690274238586,
+      "learning_rate": 0.00014808247422680413,
+      "loss": 0.7791,
+      "step": 1410
+    },
+    {
+      "epoch": 0.90304,
+      "grad_norm": 0.35281550884246826,
+      "learning_rate": 0.0001480412371134021,
+      "loss": 0.7301,
+      "step": 1411
+    },
+    {
+      "epoch": 0.90368,
+      "grad_norm": 0.33414119482040405,
+      "learning_rate": 0.000148,
+      "loss": 0.6999,
+      "step": 1412
+    },
+    {
+      "epoch": 0.90432,
+      "grad_norm": 0.32009291648864746,
+      "learning_rate": 0.00014795876288659792,
+      "loss": 0.7528,
+      "step": 1413
+    },
+    {
+      "epoch": 0.90496,
+      "grad_norm": 0.340766042470932,
+      "learning_rate": 0.00014791752577319588,
+      "loss": 0.7402,
+      "step": 1414
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3443303108215332,
+      "learning_rate": 0.0001478762886597938,
+      "loss": 0.6497,
+      "step": 1415
+    },
+    {
+      "epoch": 0.90624,
+      "grad_norm": 0.3252516984939575,
+      "learning_rate": 0.00014783505154639177,
+      "loss": 0.7553,
+      "step": 1416
+    },
+    {
+      "epoch": 0.90688,
+      "grad_norm": 0.33195751905441284,
+      "learning_rate": 0.0001477938144329897,
+      "loss": 0.7265,
+      "step": 1417
+    },
+    {
+      "epoch": 0.90752,
+      "grad_norm": 0.3498607277870178,
+      "learning_rate": 0.00014775257731958763,
+      "loss": 0.7438,
+      "step": 1418
+    },
+    {
+      "epoch": 0.90816,
+      "grad_norm": 0.3379184603691101,
+      "learning_rate": 0.00014771134020618557,
+      "loss": 0.656,
+      "step": 1419
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3491291105747223,
+      "learning_rate": 0.0001476701030927835,
+      "loss": 0.7993,
+      "step": 1420
+    },
+    {
+      "epoch": 0.90944,
+      "grad_norm": 0.3398728370666504,
+      "learning_rate": 0.00014762886597938146,
+      "loss": 0.6936,
+      "step": 1421
+    },
+    {
+      "epoch": 0.91008,
+      "grad_norm": 0.32713577151298523,
+      "learning_rate": 0.0001475876288659794,
+      "loss": 0.7153,
+      "step": 1422
+    },
+    {
+      "epoch": 0.91072,
+      "grad_norm": 0.30126509070396423,
+      "learning_rate": 0.00014754639175257732,
+      "loss": 0.5814,
+      "step": 1423
+    },
+    {
+      "epoch": 0.91136,
+      "grad_norm": 0.35304003953933716,
+      "learning_rate": 0.00014750515463917528,
+      "loss": 0.7117,
+      "step": 1424
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3445260524749756,
+      "learning_rate": 0.0001474639175257732,
+      "loss": 0.7359,
+      "step": 1425
+    },
+    {
+      "epoch": 0.91264,
+      "grad_norm": 0.32121333479881287,
+      "learning_rate": 0.00014742268041237114,
+      "loss": 0.6745,
+      "step": 1426
+    },
+    {
+      "epoch": 0.91328,
+      "grad_norm": 0.3413887321949005,
+      "learning_rate": 0.00014738144329896907,
+      "loss": 0.7379,
+      "step": 1427
+    },
+    {
+      "epoch": 0.91392,
+      "grad_norm": 0.3193953335285187,
+      "learning_rate": 0.000147340206185567,
+      "loss": 0.6312,
+      "step": 1428
+    },
+    {
+      "epoch": 0.91456,
+      "grad_norm": 0.34085628390312195,
+      "learning_rate": 0.00014729896907216496,
+      "loss": 0.7229,
+      "step": 1429
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3182644248008728,
+      "learning_rate": 0.0001472577319587629,
+      "loss": 0.6213,
+      "step": 1430
+    },
+    {
+      "epoch": 0.91584,
+      "grad_norm": 0.345705509185791,
+      "learning_rate": 0.00014721649484536085,
+      "loss": 0.6493,
+      "step": 1431
+    },
+    {
+      "epoch": 0.91648,
+      "grad_norm": 0.35458967089653015,
+      "learning_rate": 0.00014717525773195878,
+      "loss": 0.6547,
+      "step": 1432
+    },
+    {
+      "epoch": 0.91712,
+      "grad_norm": 0.3273255228996277,
+      "learning_rate": 0.0001471340206185567,
+      "loss": 0.6562,
+      "step": 1433
+    },
+    {
+      "epoch": 0.91776,
+      "grad_norm": 0.313321053981781,
+      "learning_rate": 0.00014709278350515464,
+      "loss": 0.663,
+      "step": 1434
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3528328835964203,
+      "learning_rate": 0.00014705154639175257,
+      "loss": 0.6845,
+      "step": 1435
+    },
+    {
+      "epoch": 0.91904,
+      "grad_norm": 0.3644863963127136,
+      "learning_rate": 0.00014701030927835053,
+      "loss": 0.7734,
+      "step": 1436
+    },
+    {
+      "epoch": 0.91968,
+      "grad_norm": 0.3365723788738251,
+      "learning_rate": 0.00014696907216494846,
+      "loss": 0.7171,
+      "step": 1437
+    },
+    {
+      "epoch": 0.92032,
+      "grad_norm": 0.3347156345844269,
+      "learning_rate": 0.0001469278350515464,
+      "loss": 0.6749,
+      "step": 1438
+    },
+    {
+      "epoch": 0.92096,
+      "grad_norm": 0.3630187511444092,
+      "learning_rate": 0.00014688659793814435,
+      "loss": 0.6841,
+      "step": 1439
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.31573957204818726,
+      "learning_rate": 0.00014684536082474228,
+      "loss": 0.6241,
+      "step": 1440
+    },
+    {
+      "epoch": 0.92224,
+      "grad_norm": 0.2965434491634369,
+      "learning_rate": 0.0001468041237113402,
+      "loss": 0.6478,
+      "step": 1441
+    },
+    {
+      "epoch": 0.92288,
+      "grad_norm": 0.3451821208000183,
+      "learning_rate": 0.00014676288659793815,
+      "loss": 0.7184,
+      "step": 1442
+    },
+    {
+      "epoch": 0.92352,
+      "grad_norm": 0.36585310101509094,
+      "learning_rate": 0.00014672164948453608,
+      "loss": 0.6723,
+      "step": 1443
+    },
+    {
+      "epoch": 0.92416,
+      "grad_norm": 0.34479328989982605,
+      "learning_rate": 0.00014668041237113404,
+      "loss": 0.6165,
+      "step": 1444
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.30933552980422974,
+      "learning_rate": 0.00014663917525773197,
+      "loss": 0.6633,
+      "step": 1445
+    },
+    {
+      "epoch": 0.92544,
+      "grad_norm": 0.2867521345615387,
+      "learning_rate": 0.0001465979381443299,
+      "loss": 0.5973,
+      "step": 1446
+    },
+    {
+      "epoch": 0.92608,
+      "grad_norm": 0.33106234669685364,
+      "learning_rate": 0.00014655670103092786,
+      "loss": 0.7109,
+      "step": 1447
+    },
+    {
+      "epoch": 0.92672,
+      "grad_norm": 0.3225803077220917,
+      "learning_rate": 0.00014651546391752576,
+      "loss": 0.5933,
+      "step": 1448
+    },
+    {
+      "epoch": 0.92736,
+      "grad_norm": 0.3555907607078552,
+      "learning_rate": 0.00014647422680412372,
+      "loss": 0.6711,
+      "step": 1449
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.35797473788261414,
+      "learning_rate": 0.00014643298969072165,
+      "loss": 0.656,
+      "step": 1450
+    },
+    {
+      "epoch": 0.92864,
+      "grad_norm": 0.33814266324043274,
+      "learning_rate": 0.00014639175257731958,
+      "loss": 0.8518,
+      "step": 1451
+    },
+    {
+      "epoch": 0.92928,
+      "grad_norm": 0.32533276081085205,
+      "learning_rate": 0.00014635051546391754,
+      "loss": 0.7056,
+      "step": 1452
+    },
+    {
+      "epoch": 0.92992,
+      "grad_norm": 0.3458595871925354,
+      "learning_rate": 0.00014630927835051547,
+      "loss": 0.7645,
+      "step": 1453
+    },
+    {
+      "epoch": 0.93056,
+      "grad_norm": 0.35336101055145264,
+      "learning_rate": 0.00014626804123711343,
+      "loss": 0.7675,
+      "step": 1454
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3128267526626587,
+      "learning_rate": 0.00014622680412371134,
+      "loss": 0.6103,
+      "step": 1455
+    },
+    {
+      "epoch": 0.93184,
+      "grad_norm": 0.3310684859752655,
+      "learning_rate": 0.00014618556701030927,
+      "loss": 0.7475,
+      "step": 1456
+    },
+    {
+      "epoch": 0.93248,
+      "grad_norm": 0.3515443801879883,
+      "learning_rate": 0.00014614432989690723,
+      "loss": 0.7845,
+      "step": 1457
+    },
+    {
+      "epoch": 0.93312,
+      "grad_norm": 0.31737762689590454,
+      "learning_rate": 0.00014610309278350516,
+      "loss": 0.7807,
+      "step": 1458
+    },
+    {
+      "epoch": 0.93376,
+      "grad_norm": 0.3264552056789398,
+      "learning_rate": 0.00014606185567010311,
+      "loss": 0.5944,
+      "step": 1459
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.35810723900794983,
+      "learning_rate": 0.00014602061855670105,
+      "loss": 0.7086,
+      "step": 1460
+    },
+    {
+      "epoch": 0.93504,
+      "grad_norm": 0.3738027811050415,
+      "learning_rate": 0.00014597938144329898,
+      "loss": 0.7887,
+      "step": 1461
+    },
+    {
+      "epoch": 0.93568,
+      "grad_norm": 0.3405448794364929,
+      "learning_rate": 0.0001459381443298969,
+      "loss": 0.7385,
+      "step": 1462
+    },
+    {
+      "epoch": 0.93632,
+      "grad_norm": 0.381549596786499,
+      "learning_rate": 0.00014589690721649484,
+      "loss": 0.7132,
+      "step": 1463
+    },
+    {
+      "epoch": 0.93696,
+      "grad_norm": 0.3560553193092346,
+      "learning_rate": 0.0001458556701030928,
+      "loss": 0.658,
+      "step": 1464
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.33707597851753235,
+      "learning_rate": 0.00014581443298969073,
+      "loss": 0.722,
+      "step": 1465
+    },
+    {
+      "epoch": 0.93824,
+      "grad_norm": 0.3352229595184326,
+      "learning_rate": 0.00014577319587628866,
+      "loss": 0.7734,
+      "step": 1466
+    },
+    {
+      "epoch": 0.93888,
+      "grad_norm": 0.30755874514579773,
+      "learning_rate": 0.00014573195876288662,
+      "loss": 0.6894,
+      "step": 1467
+    },
+    {
+      "epoch": 0.93952,
+      "grad_norm": 0.3388887941837311,
+      "learning_rate": 0.00014569072164948455,
+      "loss": 0.6426,
+      "step": 1468
+    },
+    {
+      "epoch": 0.94016,
+      "grad_norm": 0.32400229573249817,
+      "learning_rate": 0.00014564948453608248,
+      "loss": 0.5684,
+      "step": 1469
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3112936019897461,
+      "learning_rate": 0.0001456082474226804,
+      "loss": 0.7669,
+      "step": 1470
+    },
+    {
+      "epoch": 0.94144,
+      "grad_norm": 0.3637300431728363,
+      "learning_rate": 0.00014556701030927834,
+      "loss": 0.7408,
+      "step": 1471
+    },
+    {
+      "epoch": 0.94208,
+      "grad_norm": 0.3490108251571655,
+      "learning_rate": 0.0001455257731958763,
+      "loss": 0.6653,
+      "step": 1472
+    },
+    {
+      "epoch": 0.94272,
+      "grad_norm": 0.3674986660480499,
+      "learning_rate": 0.00014548453608247423,
+      "loss": 0.7763,
+      "step": 1473
+    },
+    {
+      "epoch": 0.94336,
+      "grad_norm": 0.3270869553089142,
+      "learning_rate": 0.00014544329896907217,
+      "loss": 0.7423,
+      "step": 1474
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.34406957030296326,
+      "learning_rate": 0.00014540206185567012,
+      "loss": 0.77,
+      "step": 1475
+    },
+    {
+      "epoch": 0.94464,
+      "grad_norm": 0.3693459928035736,
+      "learning_rate": 0.00014536082474226805,
+      "loss": 0.6565,
+      "step": 1476
+    },
+    {
+      "epoch": 0.94528,
+      "grad_norm": 0.35018038749694824,
+      "learning_rate": 0.00014531958762886599,
+      "loss": 0.6507,
+      "step": 1477
+    },
+    {
+      "epoch": 0.94592,
+      "grad_norm": 0.3434397280216217,
+      "learning_rate": 0.00014527835051546392,
+      "loss": 0.7195,
+      "step": 1478
+    },
+    {
+      "epoch": 0.94656,
+      "grad_norm": 0.3621327579021454,
+      "learning_rate": 0.00014523711340206185,
+      "loss": 0.7103,
+      "step": 1479
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.316039502620697,
+      "learning_rate": 0.0001451958762886598,
+      "loss": 0.6073,
+      "step": 1480
+    },
+    {
+      "epoch": 0.94784,
+      "grad_norm": 0.3471470773220062,
+      "learning_rate": 0.00014515463917525774,
+      "loss": 0.7357,
+      "step": 1481
+    },
+    {
+      "epoch": 0.94848,
+      "grad_norm": 0.3563861548900604,
+      "learning_rate": 0.0001451134020618557,
+      "loss": 0.7745,
+      "step": 1482
+    },
+    {
+      "epoch": 0.94912,
+      "grad_norm": 0.3513467013835907,
+      "learning_rate": 0.00014507216494845363,
+      "loss": 0.8306,
+      "step": 1483
+    },
+    {
+      "epoch": 0.94976,
+      "grad_norm": 0.32617536187171936,
+      "learning_rate": 0.00014503092783505153,
+      "loss": 0.7443,
+      "step": 1484
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3458118140697479,
+      "learning_rate": 0.0001449896907216495,
+      "loss": 0.7389,
+      "step": 1485
+    },
+    {
+      "epoch": 0.95104,
+      "grad_norm": 0.35420748591423035,
+      "learning_rate": 0.00014494845360824742,
+      "loss": 0.6689,
+      "step": 1486
+    },
+    {
+      "epoch": 0.95168,
+      "grad_norm": 0.32274532318115234,
+      "learning_rate": 0.00014490721649484538,
+      "loss": 0.6128,
+      "step": 1487
+    },
+    {
+      "epoch": 0.95232,
+      "grad_norm": 0.334571897983551,
+      "learning_rate": 0.0001448659793814433,
+      "loss": 0.653,
+      "step": 1488
+    },
+    {
+      "epoch": 0.95296,
+      "grad_norm": 0.3239440619945526,
+      "learning_rate": 0.00014482474226804124,
+      "loss": 0.6419,
+      "step": 1489
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.33303913474082947,
+      "learning_rate": 0.00014478350515463917,
+      "loss": 0.6425,
+      "step": 1490
+    },
+    {
+      "epoch": 0.95424,
+      "grad_norm": 0.35053551197052,
+      "learning_rate": 0.0001447422680412371,
+      "loss": 0.8042,
+      "step": 1491
+    },
+    {
+      "epoch": 0.95488,
+      "grad_norm": 0.3504735827445984,
+      "learning_rate": 0.00014470103092783506,
+      "loss": 0.7464,
+      "step": 1492
+    },
+    {
+      "epoch": 0.95552,
+      "grad_norm": 0.34028613567352295,
+      "learning_rate": 0.000144659793814433,
+      "loss": 0.6041,
+      "step": 1493
+    },
+    {
+      "epoch": 0.95616,
+      "grad_norm": 0.3301861882209778,
+      "learning_rate": 0.00014461855670103093,
+      "loss": 0.6822,
+      "step": 1494
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3272842466831207,
+      "learning_rate": 0.00014457731958762888,
+      "loss": 0.706,
+      "step": 1495
+    },
+    {
+      "epoch": 0.95744,
+      "grad_norm": 0.3220781981945038,
+      "learning_rate": 0.00014453608247422682,
+      "loss": 0.5884,
+      "step": 1496
+    },
+    {
+      "epoch": 0.95808,
+      "grad_norm": 0.3289218246936798,
+      "learning_rate": 0.00014449484536082475,
+      "loss": 0.6679,
+      "step": 1497
+    },
+    {
+      "epoch": 0.95872,
+      "grad_norm": 0.3308790922164917,
+      "learning_rate": 0.00014445360824742268,
+      "loss": 0.6896,
+      "step": 1498
+    },
+    {
+      "epoch": 0.95936,
+      "grad_norm": 0.31763625144958496,
+      "learning_rate": 0.0001444123711340206,
+      "loss": 0.6182,
+      "step": 1499
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3630064129829407,
+      "learning_rate": 0.00014437113402061857,
+      "loss": 0.6053,
+      "step": 1500
+    },
+    {
+      "epoch": 0.96064,
+      "grad_norm": 0.3612072169780731,
+      "learning_rate": 0.0001443298969072165,
+      "loss": 0.7454,
+      "step": 1501
+    },
+    {
+      "epoch": 0.96128,
+      "grad_norm": 0.38905102014541626,
+      "learning_rate": 0.00014428865979381443,
+      "loss": 0.7743,
+      "step": 1502
+    },
+    {
+      "epoch": 0.96192,
+      "grad_norm": 0.31242096424102783,
+      "learning_rate": 0.0001442474226804124,
+      "loss": 0.5985,
+      "step": 1503
+    },
+    {
+      "epoch": 0.96256,
+      "grad_norm": 0.3593393862247467,
+      "learning_rate": 0.00014420618556701032,
+      "loss": 0.7271,
+      "step": 1504
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3331391513347626,
+      "learning_rate": 0.00014416494845360825,
+      "loss": 0.5825,
+      "step": 1505
+    },
+    {
+      "epoch": 0.96384,
+      "grad_norm": 0.34404680132865906,
+      "learning_rate": 0.00014412371134020618,
+      "loss": 0.6196,
+      "step": 1506
+    },
+    {
+      "epoch": 0.96448,
+      "grad_norm": 0.34856563806533813,
+      "learning_rate": 0.00014408247422680411,
+      "loss": 0.7277,
+      "step": 1507
+    },
+    {
+      "epoch": 0.96512,
+      "grad_norm": 0.3219682574272156,
+      "learning_rate": 0.00014404123711340207,
+      "loss": 0.6949,
+      "step": 1508
+    },
+    {
+      "epoch": 0.96576,
+      "grad_norm": 0.34216082096099854,
+      "learning_rate": 0.000144,
+      "loss": 0.7733,
+      "step": 1509
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3214196264743805,
+      "learning_rate": 0.00014395876288659796,
+      "loss": 0.6478,
+      "step": 1510
+    },
+    {
+      "epoch": 0.96704,
+      "grad_norm": 0.3411322832107544,
+      "learning_rate": 0.0001439175257731959,
+      "loss": 0.6221,
+      "step": 1511
+    },
+    {
+      "epoch": 0.96768,
+      "grad_norm": 0.33544671535491943,
+      "learning_rate": 0.00014387628865979382,
+      "loss": 0.6342,
+      "step": 1512
+    },
+    {
+      "epoch": 0.96832,
+      "grad_norm": 0.3665071129798889,
+      "learning_rate": 0.00014383505154639176,
+      "loss": 0.6918,
+      "step": 1513
+    },
+    {
+      "epoch": 0.96896,
+      "grad_norm": 0.3559079170227051,
+      "learning_rate": 0.0001437938144329897,
+      "loss": 0.6973,
+      "step": 1514
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.34050458669662476,
+      "learning_rate": 0.00014375257731958765,
+      "loss": 0.6281,
+      "step": 1515
+    },
+    {
+      "epoch": 0.97024,
+      "grad_norm": 0.3335132300853729,
+      "learning_rate": 0.00014371134020618558,
+      "loss": 0.6622,
+      "step": 1516
+    },
+    {
+      "epoch": 0.97088,
+      "grad_norm": 0.34413525462150574,
+      "learning_rate": 0.0001436701030927835,
+      "loss": 0.6899,
+      "step": 1517
+    },
+    {
+      "epoch": 0.97152,
+      "grad_norm": 0.32614198327064514,
+      "learning_rate": 0.00014362886597938147,
+      "loss": 0.6784,
+      "step": 1518
+    },
+    {
+      "epoch": 0.97216,
+      "grad_norm": 0.3583051264286041,
+      "learning_rate": 0.0001435876288659794,
+      "loss": 0.7253,
+      "step": 1519
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.32303881645202637,
+      "learning_rate": 0.00014354639175257733,
+      "loss": 0.6762,
+      "step": 1520
+    },
+    {
+      "epoch": 0.97344,
+      "grad_norm": 0.36455729603767395,
+      "learning_rate": 0.00014350515463917526,
+      "loss": 0.6477,
+      "step": 1521
+    },
+    {
+      "epoch": 0.97408,
+      "grad_norm": 0.3207743465900421,
+      "learning_rate": 0.0001434639175257732,
+      "loss": 0.7027,
+      "step": 1522
+    },
+    {
+      "epoch": 0.97472,
+      "grad_norm": 0.34744077920913696,
+      "learning_rate": 0.00014342268041237115,
+      "loss": 0.6641,
+      "step": 1523
+    },
+    {
+      "epoch": 0.97536,
+      "grad_norm": 0.35359621047973633,
+      "learning_rate": 0.00014338144329896908,
+      "loss": 0.6712,
+      "step": 1524
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.32968953251838684,
+      "learning_rate": 0.00014334020618556704,
+      "loss": 0.7755,
+      "step": 1525
+    },
+    {
+      "epoch": 0.97664,
+      "grad_norm": 0.3231963515281677,
+      "learning_rate": 0.00014329896907216494,
+      "loss": 0.7248,
+      "step": 1526
+    },
+    {
+      "epoch": 0.97728,
+      "grad_norm": 0.3221093714237213,
+      "learning_rate": 0.00014325773195876287,
+      "loss": 0.6597,
+      "step": 1527
+    },
+    {
+      "epoch": 0.97792,
+      "grad_norm": 0.31836336851119995,
+      "learning_rate": 0.00014321649484536083,
+      "loss": 0.73,
+      "step": 1528
+    },
+    {
+      "epoch": 0.97856,
+      "grad_norm": 0.3228800892829895,
+      "learning_rate": 0.00014317525773195876,
+      "loss": 0.6581,
+      "step": 1529
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.29753103852272034,
+      "learning_rate": 0.00014313402061855672,
+      "loss": 0.6584,
+      "step": 1530
+    },
+    {
+      "epoch": 0.97984,
+      "grad_norm": 0.34255167841911316,
+      "learning_rate": 0.00014309278350515465,
+      "loss": 0.7127,
+      "step": 1531
+    },
+    {
+      "epoch": 0.98048,
+      "grad_norm": 0.3666214644908905,
+      "learning_rate": 0.00014305154639175259,
+      "loss": 0.7636,
+      "step": 1532
+    },
+    {
+      "epoch": 0.98112,
+      "grad_norm": 0.3017803132534027,
+      "learning_rate": 0.00014301030927835052,
+      "loss": 0.4781,
+      "step": 1533
+    },
+    {
+      "epoch": 0.98176,
+      "grad_norm": 0.34710705280303955,
+      "learning_rate": 0.00014296907216494845,
+      "loss": 0.7269,
+      "step": 1534
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3229653239250183,
+      "learning_rate": 0.00014292783505154638,
+      "loss": 0.6193,
+      "step": 1535
+    },
+    {
+      "epoch": 0.98304,
+      "grad_norm": 0.35263773798942566,
+      "learning_rate": 0.00014288659793814434,
+      "loss": 0.8413,
+      "step": 1536
+    },
+    {
+      "epoch": 0.98368,
+      "grad_norm": 0.3444972634315491,
+      "learning_rate": 0.00014284536082474227,
+      "loss": 0.7474,
+      "step": 1537
+    },
+    {
+      "epoch": 0.98432,
+      "grad_norm": 0.34435218572616577,
+      "learning_rate": 0.00014280412371134023,
+      "loss": 0.7323,
+      "step": 1538
+    },
+    {
+      "epoch": 0.98496,
+      "grad_norm": 0.363757848739624,
+      "learning_rate": 0.00014276288659793816,
+      "loss": 0.7352,
+      "step": 1539
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3740340769290924,
+      "learning_rate": 0.0001427216494845361,
+      "loss": 0.7309,
+      "step": 1540
+    },
+    {
+      "epoch": 0.98624,
+      "grad_norm": 0.377585232257843,
+      "learning_rate": 0.00014268041237113402,
+      "loss": 0.7167,
+      "step": 1541
+    },
+    {
+      "epoch": 0.98688,
+      "grad_norm": 0.3670336902141571,
+      "learning_rate": 0.00014263917525773195,
+      "loss": 0.732,
+      "step": 1542
+    },
+    {
+      "epoch": 0.98752,
+      "grad_norm": 0.28932520747184753,
+      "learning_rate": 0.0001425979381443299,
+      "loss": 0.6054,
+      "step": 1543
+    },
+    {
+      "epoch": 0.98816,
+      "grad_norm": 0.35115039348602295,
+      "learning_rate": 0.00014255670103092784,
+      "loss": 0.6665,
+      "step": 1544
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3191932439804077,
+      "learning_rate": 0.00014251546391752577,
+      "loss": 0.5987,
+      "step": 1545
+    },
+    {
+      "epoch": 0.98944,
+      "grad_norm": 0.3272586464881897,
+      "learning_rate": 0.00014247422680412373,
+      "loss": 0.6658,
+      "step": 1546
+    },
+    {
+      "epoch": 0.99008,
+      "grad_norm": 0.32529738545417786,
+      "learning_rate": 0.00014243298969072166,
+      "loss": 0.7015,
+      "step": 1547
+    },
+    {
+      "epoch": 0.99072,
+      "grad_norm": 0.33128103613853455,
+      "learning_rate": 0.0001423917525773196,
+      "loss": 0.7008,
+      "step": 1548
+    },
+    {
+      "epoch": 0.99136,
+      "grad_norm": 0.332308828830719,
+      "learning_rate": 0.00014235051546391753,
+      "loss": 0.6187,
+      "step": 1549
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3438853323459625,
+      "learning_rate": 0.00014230927835051546,
+      "loss": 0.8414,
+      "step": 1550
+    },
+    {
+      "epoch": 0.99264,
+      "grad_norm": 0.3244102895259857,
+      "learning_rate": 0.00014226804123711342,
+      "loss": 0.7177,
+      "step": 1551
+    },
+    {
+      "epoch": 0.99328,
+      "grad_norm": 0.3385954201221466,
+      "learning_rate": 0.00014222680412371135,
+      "loss": 0.727,
+      "step": 1552
+    },
+    {
+      "epoch": 0.99392,
+      "grad_norm": 0.36369264125823975,
+      "learning_rate": 0.0001421855670103093,
+      "loss": 0.6948,
+      "step": 1553
+    },
+    {
+      "epoch": 0.99456,
+      "grad_norm": 0.38467058539390564,
+      "learning_rate": 0.00014214432989690724,
+      "loss": 0.6766,
+      "step": 1554
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.33229145407676697,
+      "learning_rate": 0.00014210309278350517,
+      "loss": 0.6784,
+      "step": 1555
+    },
+    {
+      "epoch": 0.99584,
+      "grad_norm": 0.3291219472885132,
+      "learning_rate": 0.0001420618556701031,
+      "loss": 0.7022,
+      "step": 1556
+    },
+    {
+      "epoch": 0.99648,
+      "grad_norm": 0.3074395954608917,
+      "learning_rate": 0.00014202061855670103,
+      "loss": 0.687,
+      "step": 1557
+    },
+    {
+      "epoch": 0.99712,
+      "grad_norm": 0.3554990589618683,
+      "learning_rate": 0.000141979381443299,
+      "loss": 0.7246,
+      "step": 1558
+    },
+    {
+      "epoch": 0.99776,
+      "grad_norm": 0.33503812551498413,
+      "learning_rate": 0.00014193814432989692,
+      "loss": 0.7354,
+      "step": 1559
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3535533547401428,
+      "learning_rate": 0.00014189690721649485,
+      "loss": 0.6769,
+      "step": 1560
+    },
+    {
+      "epoch": 0.99904,
+      "grad_norm": 0.3338563144207001,
+      "learning_rate": 0.0001418556701030928,
+      "loss": 0.6049,
+      "step": 1561
+    },
+    {
+      "epoch": 0.99968,
+      "grad_norm": 0.3241831958293915,
+      "learning_rate": 0.0001418144329896907,
+      "loss": 0.5746,
+      "step": 1562
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4873449504375458,
+      "learning_rate": 0.00014177319587628864,
+      "loss": 0.7101,
+      "step": 1563
+    },
+    {
+      "epoch": 1.00064,
+      "grad_norm": 0.3582092821598053,
+      "learning_rate": 0.0001417319587628866,
+      "loss": 0.6504,
+      "step": 1564
+    },
+    {
+      "epoch": 1.00128,
+      "grad_norm": 0.34047731757164,
+      "learning_rate": 0.00014169072164948453,
+      "loss": 0.5911,
+      "step": 1565
+    },
+    {
+      "epoch": 1.00192,
+      "grad_norm": 0.29821503162384033,
+      "learning_rate": 0.0001416494845360825,
+      "loss": 0.6593,
+      "step": 1566
+    },
+    {
+      "epoch": 1.00256,
+      "grad_norm": 0.36149361729621887,
+      "learning_rate": 0.00014160824742268042,
+      "loss": 0.6476,
+      "step": 1567
+    },
+    {
+      "epoch": 1.0032,
+      "grad_norm": 0.33677470684051514,
+      "learning_rate": 0.00014156701030927836,
+      "loss": 0.6024,
+      "step": 1568
+    },
+    {
+      "epoch": 1.00384,
+      "grad_norm": 0.32472071051597595,
+      "learning_rate": 0.0001415257731958763,
+      "loss": 0.5563,
+      "step": 1569
+    },
+    {
+      "epoch": 1.00448,
+      "grad_norm": 0.41546696424484253,
+      "learning_rate": 0.00014148453608247422,
+      "loss": 0.6046,
+      "step": 1570
+    },
+    {
+      "epoch": 1.00512,
+      "grad_norm": 0.3531988263130188,
+      "learning_rate": 0.00014144329896907218,
+      "loss": 0.4613,
+      "step": 1571
+    },
+    {
+      "epoch": 1.00576,
+      "grad_norm": 0.38962408900260925,
+      "learning_rate": 0.0001414020618556701,
+      "loss": 0.532,
+      "step": 1572
+    },
+    {
+      "epoch": 1.0064,
+      "grad_norm": 0.35115158557891846,
+      "learning_rate": 0.00014136082474226804,
+      "loss": 0.5996,
+      "step": 1573
+    },
+    {
+      "epoch": 1.00704,
+      "grad_norm": 0.3636733293533325,
+      "learning_rate": 0.000141319587628866,
+      "loss": 0.5867,
+      "step": 1574
+    },
+    {
+      "epoch": 1.00768,
+      "grad_norm": 0.3544944226741791,
+      "learning_rate": 0.00014127835051546393,
+      "loss": 0.6231,
+      "step": 1575
+    },
+    {
+      "epoch": 1.00832,
+      "grad_norm": 0.33466288447380066,
+      "learning_rate": 0.00014123711340206186,
+      "loss": 0.4968,
+      "step": 1576
+    },
+    {
+      "epoch": 1.00896,
+      "grad_norm": 0.3898334205150604,
+      "learning_rate": 0.0001411958762886598,
+      "loss": 0.58,
+      "step": 1577
+    },
+    {
+      "epoch": 1.0096,
+      "grad_norm": 0.3533715605735779,
+      "learning_rate": 0.00014115463917525772,
+      "loss": 0.7001,
+      "step": 1578
+    },
+    {
+      "epoch": 1.01024,
+      "grad_norm": 0.36940985918045044,
+      "learning_rate": 0.00014111340206185568,
+      "loss": 0.5609,
+      "step": 1579
+    },
+    {
+      "epoch": 1.01088,
+      "grad_norm": 0.40379032492637634,
+      "learning_rate": 0.0001410721649484536,
+      "loss": 0.588,
+      "step": 1580
+    },
+    {
+      "epoch": 1.01152,
+      "grad_norm": 0.38169723749160767,
+      "learning_rate": 0.00014103092783505157,
+      "loss": 0.522,
+      "step": 1581
+    },
+    {
+      "epoch": 1.01216,
+      "grad_norm": 0.3294139802455902,
+      "learning_rate": 0.0001409896907216495,
+      "loss": 0.5767,
+      "step": 1582
+    },
+    {
+      "epoch": 1.0128,
+      "grad_norm": 0.37129107117652893,
+      "learning_rate": 0.00014094845360824743,
+      "loss": 0.6475,
+      "step": 1583
+    },
+    {
+      "epoch": 1.01344,
+      "grad_norm": 0.3353690207004547,
+      "learning_rate": 0.00014090721649484536,
+      "loss": 0.6361,
+      "step": 1584
+    },
+    {
+      "epoch": 1.01408,
+      "grad_norm": 0.3521464169025421,
+      "learning_rate": 0.0001408659793814433,
+      "loss": 0.5538,
+      "step": 1585
+    },
+    {
+      "epoch": 1.01472,
+      "grad_norm": 0.39704054594039917,
+      "learning_rate": 0.00014082474226804125,
+      "loss": 0.6819,
+      "step": 1586
+    },
+    {
+      "epoch": 1.01536,
+      "grad_norm": 0.3758016526699066,
+      "learning_rate": 0.00014078350515463919,
+      "loss": 0.6187,
+      "step": 1587
+    },
+    {
+      "epoch": 1.016,
+      "grad_norm": 0.33712366223335266,
+      "learning_rate": 0.00014074226804123712,
+      "loss": 0.5761,
+      "step": 1588
+    },
+    {
+      "epoch": 1.01664,
+      "grad_norm": 0.3787217140197754,
+      "learning_rate": 0.00014070103092783507,
+      "loss": 0.5494,
+      "step": 1589
+    },
+    {
+      "epoch": 1.01728,
+      "grad_norm": 0.35239607095718384,
+      "learning_rate": 0.000140659793814433,
+      "loss": 0.5799,
+      "step": 1590
+    },
+    {
+      "epoch": 1.01792,
+      "grad_norm": 0.3878360390663147,
+      "learning_rate": 0.0001406185567010309,
+      "loss": 0.625,
+      "step": 1591
+    },
+    {
+      "epoch": 1.01856,
+      "grad_norm": 0.4082742929458618,
+      "learning_rate": 0.00014057731958762887,
+      "loss": 0.7564,
+      "step": 1592
+    },
+    {
+      "epoch": 1.0192,
+      "grad_norm": 0.3483103811740875,
+      "learning_rate": 0.0001405360824742268,
+      "loss": 0.588,
+      "step": 1593
+    },
+    {
+      "epoch": 1.01984,
+      "grad_norm": 0.345406174659729,
+      "learning_rate": 0.00014049484536082476,
+      "loss": 0.6317,
+      "step": 1594
+    },
+    {
+      "epoch": 1.02048,
+      "grad_norm": 0.3764224052429199,
+      "learning_rate": 0.0001404536082474227,
+      "loss": 0.6833,
+      "step": 1595
+    },
+    {
+      "epoch": 1.02112,
+      "grad_norm": 0.33381062746047974,
+      "learning_rate": 0.00014041237113402062,
+      "loss": 0.5572,
+      "step": 1596
+    },
+    {
+      "epoch": 1.02176,
+      "grad_norm": 0.3805691599845886,
+      "learning_rate": 0.00014037113402061858,
+      "loss": 0.5933,
+      "step": 1597
+    },
+    {
+      "epoch": 1.0224,
+      "grad_norm": 0.3715429902076721,
+      "learning_rate": 0.00014032989690721648,
+      "loss": 0.5799,
+      "step": 1598
+    },
+    {
+      "epoch": 1.02304,
+      "grad_norm": 0.3666604459285736,
+      "learning_rate": 0.00014028865979381444,
+      "loss": 0.6649,
+      "step": 1599
+    },
+    {
+      "epoch": 1.02368,
+      "grad_norm": 0.38050490617752075,
+      "learning_rate": 0.00014024742268041237,
+      "loss": 0.5393,
+      "step": 1600
+    },
+    {
+      "epoch": 1.02432,
+      "grad_norm": 0.365047425031662,
+      "learning_rate": 0.0001402061855670103,
+      "loss": 0.7033,
+      "step": 1601
+    },
+    {
+      "epoch": 1.02496,
+      "grad_norm": 0.32319119572639465,
+      "learning_rate": 0.00014016494845360826,
+      "loss": 0.6234,
+      "step": 1602
+    },
+    {
+      "epoch": 1.0256,
+      "grad_norm": 0.3831627666950226,
+      "learning_rate": 0.0001401237113402062,
+      "loss": 0.6105,
+      "step": 1603
+    },
+    {
+      "epoch": 1.02624,
+      "grad_norm": 0.3453712463378906,
+      "learning_rate": 0.00014008247422680415,
+      "loss": 0.7051,
+      "step": 1604
+    },
+    {
+      "epoch": 1.02688,
+      "grad_norm": 0.35720542073249817,
+      "learning_rate": 0.00014004123711340206,
+      "loss": 0.6065,
+      "step": 1605
+    },
+    {
+      "epoch": 1.02752,
+      "grad_norm": 0.3653002083301544,
+      "learning_rate": 0.00014,
+      "loss": 0.5819,
+      "step": 1606
+    },
+    {
+      "epoch": 1.02816,
+      "grad_norm": 0.36896300315856934,
+      "learning_rate": 0.00013995876288659795,
+      "loss": 0.5667,
+      "step": 1607
+    },
+    {
+      "epoch": 1.0288,
+      "grad_norm": 0.41424256563186646,
+      "learning_rate": 0.00013991752577319588,
+      "loss": 0.5321,
+      "step": 1608
+    },
+    {
+      "epoch": 1.02944,
+      "grad_norm": 0.3553130328655243,
+      "learning_rate": 0.00013987628865979384,
+      "loss": 0.5159,
+      "step": 1609
+    },
+    {
+      "epoch": 1.03008,
+      "grad_norm": 0.3503171503543854,
+      "learning_rate": 0.00013983505154639177,
+      "loss": 0.5749,
+      "step": 1610
+    },
+    {
+      "epoch": 1.03072,
+      "grad_norm": 0.3715916872024536,
+      "learning_rate": 0.0001397938144329897,
+      "loss": 0.6229,
+      "step": 1611
+    },
+    {
+      "epoch": 1.03136,
+      "grad_norm": 0.3359808027744293,
+      "learning_rate": 0.00013975257731958763,
+      "loss": 0.4659,
+      "step": 1612
+    },
+    {
+      "epoch": 1.032,
+      "grad_norm": 0.37680912017822266,
+      "learning_rate": 0.00013971134020618556,
+      "loss": 0.5977,
+      "step": 1613
+    },
+    {
+      "epoch": 1.03264,
+      "grad_norm": 0.3844177722930908,
+      "learning_rate": 0.00013967010309278352,
+      "loss": 0.6201,
+      "step": 1614
+    },
+    {
+      "epoch": 1.03328,
+      "grad_norm": 0.3862740695476532,
+      "learning_rate": 0.00013962886597938145,
+      "loss": 0.5754,
+      "step": 1615
+    },
+    {
+      "epoch": 1.03392,
+      "grad_norm": 0.3973449170589447,
+      "learning_rate": 0.00013958762886597938,
+      "loss": 0.4929,
+      "step": 1616
+    },
+    {
+      "epoch": 1.03456,
+      "grad_norm": 0.3179456293582916,
+      "learning_rate": 0.00013954639175257734,
+      "loss": 0.5141,
+      "step": 1617
+    },
+    {
+      "epoch": 1.0352,
+      "grad_norm": 0.3669663071632385,
+      "learning_rate": 0.00013950515463917527,
+      "loss": 0.5807,
+      "step": 1618
+    },
+    {
+      "epoch": 1.03584,
+      "grad_norm": 0.3528003990650177,
+      "learning_rate": 0.0001394639175257732,
+      "loss": 0.4915,
+      "step": 1619
+    },
+    {
+      "epoch": 1.03648,
+      "grad_norm": 0.3512045741081238,
+      "learning_rate": 0.00013942268041237113,
+      "loss": 0.5493,
+      "step": 1620
+    },
+    {
+      "epoch": 1.03712,
+      "grad_norm": 0.339330792427063,
+      "learning_rate": 0.00013938144329896907,
+      "loss": 0.5518,
+      "step": 1621
+    },
+    {
+      "epoch": 1.03776,
+      "grad_norm": 0.34881454706192017,
+      "learning_rate": 0.00013934020618556702,
+      "loss": 0.6093,
+      "step": 1622
+    },
+    {
+      "epoch": 1.0384,
+      "grad_norm": 0.36878353357315063,
+      "learning_rate": 0.00013929896907216495,
+      "loss": 0.6323,
+      "step": 1623
+    },
+    {
+      "epoch": 1.03904,
+      "grad_norm": 0.3644968569278717,
+      "learning_rate": 0.00013925773195876289,
+      "loss": 0.5141,
+      "step": 1624
+    },
+    {
+      "epoch": 1.03968,
+      "grad_norm": 0.38777193427085876,
+      "learning_rate": 0.00013921649484536084,
+      "loss": 0.5938,
+      "step": 1625
+    },
+    {
+      "epoch": 1.04032,
+      "grad_norm": 0.37466245889663696,
+      "learning_rate": 0.00013917525773195878,
+      "loss": 0.6795,
+      "step": 1626
+    },
+    {
+      "epoch": 1.04096,
+      "grad_norm": 0.36173924803733826,
+      "learning_rate": 0.0001391340206185567,
+      "loss": 0.5851,
+      "step": 1627
+    },
+    {
+      "epoch": 1.0416,
+      "grad_norm": 0.3563174903392792,
+      "learning_rate": 0.00013909278350515464,
+      "loss": 0.6418,
+      "step": 1628
+    },
+    {
+      "epoch": 1.04224,
+      "grad_norm": 0.39220330119132996,
+      "learning_rate": 0.00013905154639175257,
+      "loss": 0.6428,
+      "step": 1629
+    },
+    {
+      "epoch": 1.04288,
+      "grad_norm": 0.3948117792606354,
+      "learning_rate": 0.00013901030927835053,
+      "loss": 0.6294,
+      "step": 1630
+    },
+    {
+      "epoch": 1.04352,
+      "grad_norm": 0.37853294610977173,
+      "learning_rate": 0.00013896907216494846,
+      "loss": 0.6395,
+      "step": 1631
+    },
+    {
+      "epoch": 1.04416,
+      "grad_norm": 0.42039862275123596,
+      "learning_rate": 0.00013892783505154642,
+      "loss": 0.5903,
+      "step": 1632
+    },
+    {
+      "epoch": 1.0448,
+      "grad_norm": 0.3778029978275299,
+      "learning_rate": 0.00013888659793814435,
+      "loss": 0.5811,
+      "step": 1633
+    },
+    {
+      "epoch": 1.04544,
+      "grad_norm": 0.3506350815296173,
+      "learning_rate": 0.00013884536082474225,
+      "loss": 0.6402,
+      "step": 1634
+    },
+    {
+      "epoch": 1.04608,
+      "grad_norm": 0.3886317014694214,
+      "learning_rate": 0.0001388041237113402,
+      "loss": 0.642,
+      "step": 1635
+    },
+    {
+      "epoch": 1.04672,
+      "grad_norm": 0.33783188462257385,
+      "learning_rate": 0.00013876288659793814,
+      "loss": 0.4852,
+      "step": 1636
+    },
+    {
+      "epoch": 1.04736,
+      "grad_norm": 0.33383408188819885,
+      "learning_rate": 0.0001387216494845361,
+      "loss": 0.5621,
+      "step": 1637
+    },
+    {
+      "epoch": 1.048,
+      "grad_norm": 0.3467686176300049,
+      "learning_rate": 0.00013868041237113403,
+      "loss": 0.5509,
+      "step": 1638
+    },
+    {
+      "epoch": 1.04864,
+      "grad_norm": 0.3499736487865448,
+      "learning_rate": 0.00013863917525773196,
+      "loss": 0.5857,
+      "step": 1639
+    },
+    {
+      "epoch": 1.04928,
+      "grad_norm": 0.3642216622829437,
+      "learning_rate": 0.0001385979381443299,
+      "loss": 0.5341,
+      "step": 1640
+    },
+    {
+      "epoch": 1.04992,
+      "grad_norm": 0.3752499520778656,
+      "learning_rate": 0.00013855670103092783,
+      "loss": 0.6762,
+      "step": 1641
+    },
+    {
+      "epoch": 1.05056,
+      "grad_norm": 0.37471890449523926,
+      "learning_rate": 0.00013851546391752578,
+      "loss": 0.5817,
+      "step": 1642
+    },
+    {
+      "epoch": 1.0512,
+      "grad_norm": 0.36779728531837463,
+      "learning_rate": 0.00013847422680412372,
+      "loss": 0.6125,
+      "step": 1643
+    },
+    {
+      "epoch": 1.0518399999999999,
+      "grad_norm": 0.37170174717903137,
+      "learning_rate": 0.00013843298969072165,
+      "loss": 0.6245,
+      "step": 1644
+    },
+    {
+      "epoch": 1.05248,
+      "grad_norm": 0.38504675030708313,
+      "learning_rate": 0.0001383917525773196,
+      "loss": 0.5763,
+      "step": 1645
+    },
+    {
+      "epoch": 1.05312,
+      "grad_norm": 0.3785061538219452,
+      "learning_rate": 0.00013835051546391754,
+      "loss": 0.6072,
+      "step": 1646
+    },
+    {
+      "epoch": 1.05376,
+      "grad_norm": 0.3451640009880066,
+      "learning_rate": 0.00013830927835051547,
+      "loss": 0.5257,
+      "step": 1647
+    },
+    {
+      "epoch": 1.0544,
+      "grad_norm": 0.362108439207077,
+      "learning_rate": 0.0001382680412371134,
+      "loss": 0.555,
+      "step": 1648
+    },
+    {
+      "epoch": 1.05504,
+      "grad_norm": 0.35216209292411804,
+      "learning_rate": 0.00013822680412371133,
+      "loss": 0.593,
+      "step": 1649
+    },
+    {
+      "epoch": 1.05568,
+      "grad_norm": 0.3748760521411896,
+      "learning_rate": 0.0001381855670103093,
+      "loss": 0.701,
+      "step": 1650
+    },
+    {
+      "epoch": 1.05632,
+      "grad_norm": 0.38456958532333374,
+      "learning_rate": 0.00013814432989690722,
+      "loss": 0.6053,
+      "step": 1651
+    },
+    {
+      "epoch": 1.05696,
+      "grad_norm": 0.3533494770526886,
+      "learning_rate": 0.00013810309278350515,
+      "loss": 0.6423,
+      "step": 1652
+    },
+    {
+      "epoch": 1.0576,
+      "grad_norm": 0.3906964063644409,
+      "learning_rate": 0.0001380618556701031,
+      "loss": 0.6442,
+      "step": 1653
+    },
+    {
+      "epoch": 1.05824,
+      "grad_norm": 0.3915063440799713,
+      "learning_rate": 0.00013802061855670104,
+      "loss": 0.5803,
+      "step": 1654
+    },
+    {
+      "epoch": 1.05888,
+      "grad_norm": 0.3616456091403961,
+      "learning_rate": 0.00013797938144329897,
+      "loss": 0.6542,
+      "step": 1655
+    },
+    {
+      "epoch": 1.05952,
+      "grad_norm": 0.3633642792701721,
+      "learning_rate": 0.0001379381443298969,
+      "loss": 0.5703,
+      "step": 1656
+    },
+    {
+      "epoch": 1.06016,
+      "grad_norm": 0.42691540718078613,
+      "learning_rate": 0.00013789690721649484,
+      "loss": 0.5512,
+      "step": 1657
+    },
+    {
+      "epoch": 1.0608,
+      "grad_norm": 0.3361470103263855,
+      "learning_rate": 0.0001378556701030928,
+      "loss": 0.5057,
+      "step": 1658
+    },
+    {
+      "epoch": 1.06144,
+      "grad_norm": 0.37919721007347107,
+      "learning_rate": 0.00013781443298969072,
+      "loss": 0.5841,
+      "step": 1659
+    },
+    {
+      "epoch": 1.06208,
+      "grad_norm": 0.3860274851322174,
+      "learning_rate": 0.00013777319587628868,
+      "loss": 0.7334,
+      "step": 1660
+    },
+    {
+      "epoch": 1.06272,
+      "grad_norm": 0.3730994164943695,
+      "learning_rate": 0.00013773195876288661,
+      "loss": 0.5825,
+      "step": 1661
+    },
+    {
+      "epoch": 1.06336,
+      "grad_norm": 0.37137842178344727,
+      "learning_rate": 0.00013769072164948455,
+      "loss": 0.6255,
+      "step": 1662
+    },
+    {
+      "epoch": 1.064,
+      "grad_norm": 0.35934144258499146,
+      "learning_rate": 0.00013764948453608248,
+      "loss": 0.5399,
+      "step": 1663
+    },
+    {
+      "epoch": 1.06464,
+      "grad_norm": 0.3864918649196625,
+      "learning_rate": 0.0001376082474226804,
+      "loss": 0.6441,
+      "step": 1664
+    },
+    {
+      "epoch": 1.06528,
+      "grad_norm": 0.3714539110660553,
+      "learning_rate": 0.00013756701030927837,
+      "loss": 0.5749,
+      "step": 1665
+    },
+    {
+      "epoch": 1.06592,
+      "grad_norm": 0.34073054790496826,
+      "learning_rate": 0.0001375257731958763,
+      "loss": 0.5158,
+      "step": 1666
+    },
+    {
+      "epoch": 1.06656,
+      "grad_norm": 0.3501395881175995,
+      "learning_rate": 0.00013748453608247423,
+      "loss": 0.5898,
+      "step": 1667
+    },
+    {
+      "epoch": 1.0672,
+      "grad_norm": 0.37236419320106506,
+      "learning_rate": 0.0001374432989690722,
+      "loss": 0.5146,
+      "step": 1668
+    },
+    {
+      "epoch": 1.06784,
+      "grad_norm": 0.338746041059494,
+      "learning_rate": 0.00013740206185567012,
+      "loss": 0.5545,
+      "step": 1669
+    },
+    {
+      "epoch": 1.06848,
+      "grad_norm": 0.41783398389816284,
+      "learning_rate": 0.00013736082474226805,
+      "loss": 0.6795,
+      "step": 1670
+    },
+    {
+      "epoch": 1.06912,
+      "grad_norm": 0.3753952085971832,
+      "learning_rate": 0.00013731958762886598,
+      "loss": 0.6388,
+      "step": 1671
+    },
+    {
+      "epoch": 1.06976,
+      "grad_norm": 0.42162153124809265,
+      "learning_rate": 0.0001372783505154639,
+      "loss": 0.5841,
+      "step": 1672
+    },
+    {
+      "epoch": 1.0704,
+      "grad_norm": 0.35999423265457153,
+      "learning_rate": 0.00013723711340206187,
+      "loss": 0.6306,
+      "step": 1673
+    },
+    {
+      "epoch": 1.07104,
+      "grad_norm": 0.35728198289871216,
+      "learning_rate": 0.0001371958762886598,
+      "loss": 0.5569,
+      "step": 1674
+    },
+    {
+      "epoch": 1.07168,
+      "grad_norm": 0.4031636416912079,
+      "learning_rate": 0.00013715463917525776,
+      "loss": 0.5657,
+      "step": 1675
+    },
+    {
+      "epoch": 1.07232,
+      "grad_norm": 0.3436456322669983,
+      "learning_rate": 0.00013711340206185566,
+      "loss": 0.479,
+      "step": 1676
+    },
+    {
+      "epoch": 1.07296,
+      "grad_norm": 0.38950830698013306,
+      "learning_rate": 0.0001370721649484536,
+      "loss": 0.5212,
+      "step": 1677
+    },
+    {
+      "epoch": 1.0735999999999999,
+      "grad_norm": 0.37595292925834656,
+      "learning_rate": 0.00013703092783505155,
+      "loss": 0.5956,
+      "step": 1678
+    },
+    {
+      "epoch": 1.07424,
+      "grad_norm": 0.36911144852638245,
+      "learning_rate": 0.00013698969072164949,
+      "loss": 0.5517,
+      "step": 1679
+    },
+    {
+      "epoch": 1.07488,
+      "grad_norm": 0.36743664741516113,
+      "learning_rate": 0.00013694845360824744,
+      "loss": 0.5383,
+      "step": 1680
+    },
+    {
+      "epoch": 1.07552,
+      "grad_norm": 0.3812563121318817,
+      "learning_rate": 0.00013690721649484538,
+      "loss": 0.5541,
+      "step": 1681
+    },
+    {
+      "epoch": 1.07616,
+      "grad_norm": 0.39858195185661316,
+      "learning_rate": 0.0001368659793814433,
+      "loss": 0.6808,
+      "step": 1682
+    },
+    {
+      "epoch": 1.0768,
+      "grad_norm": 0.35400688648223877,
+      "learning_rate": 0.00013682474226804124,
+      "loss": 0.5576,
+      "step": 1683
+    },
+    {
+      "epoch": 1.07744,
+      "grad_norm": 0.37154725193977356,
+      "learning_rate": 0.00013678350515463917,
+      "loss": 0.4932,
+      "step": 1684
+    },
+    {
+      "epoch": 1.07808,
+      "grad_norm": 0.3603758215904236,
+      "learning_rate": 0.0001367422680412371,
+      "loss": 0.4878,
+      "step": 1685
+    },
+    {
+      "epoch": 1.07872,
+      "grad_norm": 0.3607088327407837,
+      "learning_rate": 0.00013670103092783506,
+      "loss": 0.6534,
+      "step": 1686
+    },
+    {
+      "epoch": 1.07936,
+      "grad_norm": 0.3325243890285492,
+      "learning_rate": 0.000136659793814433,
+      "loss": 0.5901,
+      "step": 1687
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.3671567440032959,
+      "learning_rate": 0.00013661855670103095,
+      "loss": 0.5408,
+      "step": 1688
+    },
+    {
+      "epoch": 1.08064,
+      "grad_norm": 0.41899484395980835,
+      "learning_rate": 0.00013657731958762888,
+      "loss": 0.6145,
+      "step": 1689
+    },
+    {
+      "epoch": 1.08128,
+      "grad_norm": 0.3702346086502075,
+      "learning_rate": 0.0001365360824742268,
+      "loss": 0.5571,
+      "step": 1690
+    },
+    {
+      "epoch": 1.08192,
+      "grad_norm": 0.376015841960907,
+      "learning_rate": 0.00013649484536082474,
+      "loss": 0.5516,
+      "step": 1691
+    },
+    {
+      "epoch": 1.08256,
+      "grad_norm": 0.331441193819046,
+      "learning_rate": 0.00013645360824742267,
+      "loss": 0.5411,
+      "step": 1692
+    },
+    {
+      "epoch": 1.0832,
+      "grad_norm": 0.41230928897857666,
+      "learning_rate": 0.00013641237113402063,
+      "loss": 0.6085,
+      "step": 1693
+    },
+    {
+      "epoch": 1.08384,
+      "grad_norm": 0.3398403823375702,
+      "learning_rate": 0.00013637113402061856,
+      "loss": 0.5479,
+      "step": 1694
+    },
+    {
+      "epoch": 1.08448,
+      "grad_norm": 0.36121487617492676,
+      "learning_rate": 0.0001363298969072165,
+      "loss": 0.5682,
+      "step": 1695
+    },
+    {
+      "epoch": 1.08512,
+      "grad_norm": 0.3683512508869171,
+      "learning_rate": 0.00013628865979381445,
+      "loss": 0.5344,
+      "step": 1696
+    },
+    {
+      "epoch": 1.08576,
+      "grad_norm": 0.37766095995903015,
+      "learning_rate": 0.00013624742268041238,
+      "loss": 0.5834,
+      "step": 1697
+    },
+    {
+      "epoch": 1.0864,
+      "grad_norm": 0.3533225655555725,
+      "learning_rate": 0.00013620618556701032,
+      "loss": 0.5221,
+      "step": 1698
+    },
+    {
+      "epoch": 1.08704,
+      "grad_norm": 0.39451518654823303,
+      "learning_rate": 0.00013616494845360825,
+      "loss": 0.6362,
+      "step": 1699
+    },
+    {
+      "epoch": 1.08768,
+      "grad_norm": 0.3887990713119507,
+      "learning_rate": 0.00013612371134020618,
+      "loss": 0.5954,
+      "step": 1700
+    },
+    {
+      "epoch": 1.08832,
+      "grad_norm": 0.37543973326683044,
+      "learning_rate": 0.00013608247422680414,
+      "loss": 0.5687,
+      "step": 1701
+    },
+    {
+      "epoch": 1.08896,
+      "grad_norm": 0.36732420325279236,
+      "learning_rate": 0.00013604123711340207,
+      "loss": 0.6015,
+      "step": 1702
+    },
+    {
+      "epoch": 1.0896,
+      "grad_norm": 0.36806875467300415,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 0.519,
+      "step": 1703
+    },
+    {
+      "epoch": 1.09024,
+      "grad_norm": 0.39172106981277466,
+      "learning_rate": 0.00013595876288659796,
+      "loss": 0.6334,
+      "step": 1704
+    },
+    {
+      "epoch": 1.09088,
+      "grad_norm": 0.36365771293640137,
+      "learning_rate": 0.0001359175257731959,
+      "loss": 0.4939,
+      "step": 1705
+    },
+    {
+      "epoch": 1.09152,
+      "grad_norm": 0.3407532274723053,
+      "learning_rate": 0.00013587628865979382,
+      "loss": 0.5404,
+      "step": 1706
+    },
+    {
+      "epoch": 1.09216,
+      "grad_norm": 0.35514283180236816,
+      "learning_rate": 0.00013583505154639175,
+      "loss": 0.6861,
+      "step": 1707
+    },
+    {
+      "epoch": 1.0928,
+      "grad_norm": 0.39585545659065247,
+      "learning_rate": 0.0001357938144329897,
+      "loss": 0.5692,
+      "step": 1708
+    },
+    {
+      "epoch": 1.09344,
+      "grad_norm": 0.4041411876678467,
+      "learning_rate": 0.00013575257731958764,
+      "loss": 0.5807,
+      "step": 1709
+    },
+    {
+      "epoch": 1.09408,
+      "grad_norm": 0.38059863448143005,
+      "learning_rate": 0.00013571134020618557,
+      "loss": 0.4947,
+      "step": 1710
+    },
+    {
+      "epoch": 1.09472,
+      "grad_norm": 0.3838833272457123,
+      "learning_rate": 0.00013567010309278353,
+      "loss": 0.5708,
+      "step": 1711
+    },
+    {
+      "epoch": 1.09536,
+      "grad_norm": 0.35747432708740234,
+      "learning_rate": 0.00013562886597938143,
+      "loss": 0.5755,
+      "step": 1712
+    },
+    {
+      "epoch": 1.096,
+      "grad_norm": 0.3820178806781769,
+      "learning_rate": 0.00013558762886597937,
+      "loss": 0.6761,
+      "step": 1713
+    },
+    {
+      "epoch": 1.09664,
+      "grad_norm": 0.3965759873390198,
+      "learning_rate": 0.00013554639175257732,
+      "loss": 0.5473,
+      "step": 1714
+    },
+    {
+      "epoch": 1.09728,
+      "grad_norm": 0.36719343066215515,
+      "learning_rate": 0.00013550515463917526,
+      "loss": 0.5434,
+      "step": 1715
+    },
+    {
+      "epoch": 1.09792,
+      "grad_norm": 0.39314016699790955,
+      "learning_rate": 0.00013546391752577321,
+      "loss": 0.5541,
+      "step": 1716
+    },
+    {
+      "epoch": 1.09856,
+      "grad_norm": 0.3576416075229645,
+      "learning_rate": 0.00013542268041237115,
+      "loss": 0.5772,
+      "step": 1717
+    },
+    {
+      "epoch": 1.0992,
+      "grad_norm": 0.35051867365837097,
+      "learning_rate": 0.00013538144329896908,
+      "loss": 0.5975,
+      "step": 1718
+    },
+    {
+      "epoch": 1.09984,
+      "grad_norm": 0.41169795393943787,
+      "learning_rate": 0.000135340206185567,
+      "loss": 0.6602,
+      "step": 1719
+    },
+    {
+      "epoch": 1.10048,
+      "grad_norm": 0.36348679661750793,
+      "learning_rate": 0.00013529896907216494,
+      "loss": 0.5553,
+      "step": 1720
+    },
+    {
+      "epoch": 1.10112,
+      "grad_norm": 0.3854933977127075,
+      "learning_rate": 0.0001352577319587629,
+      "loss": 0.5673,
+      "step": 1721
+    },
+    {
+      "epoch": 1.10176,
+      "grad_norm": 0.3731314539909363,
+      "learning_rate": 0.00013521649484536083,
+      "loss": 0.6773,
+      "step": 1722
+    },
+    {
+      "epoch": 1.1024,
+      "grad_norm": 0.3611276149749756,
+      "learning_rate": 0.00013517525773195876,
+      "loss": 0.4994,
+      "step": 1723
+    },
+    {
+      "epoch": 1.10304,
+      "grad_norm": 0.3814999461174011,
+      "learning_rate": 0.00013513402061855672,
+      "loss": 0.5284,
+      "step": 1724
+    },
+    {
+      "epoch": 1.10368,
+      "grad_norm": 0.3731253147125244,
+      "learning_rate": 0.00013509278350515465,
+      "loss": 0.5762,
+      "step": 1725
+    },
+    {
+      "epoch": 1.10432,
+      "grad_norm": 0.34234726428985596,
+      "learning_rate": 0.00013505154639175258,
+      "loss": 0.491,
+      "step": 1726
+    },
+    {
+      "epoch": 1.10496,
+      "grad_norm": 0.36094292998313904,
+      "learning_rate": 0.0001350103092783505,
+      "loss": 0.5707,
+      "step": 1727
+    },
+    {
+      "epoch": 1.1056,
+      "grad_norm": 0.3746544122695923,
+      "learning_rate": 0.00013496907216494844,
+      "loss": 0.5159,
+      "step": 1728
+    },
+    {
+      "epoch": 1.1062400000000001,
+      "grad_norm": 0.38964226841926575,
+      "learning_rate": 0.0001349278350515464,
+      "loss": 0.6289,
+      "step": 1729
+    },
+    {
+      "epoch": 1.10688,
+      "grad_norm": 0.40024447441101074,
+      "learning_rate": 0.00013488659793814433,
+      "loss": 0.5785,
+      "step": 1730
+    },
+    {
+      "epoch": 1.10752,
+      "grad_norm": 0.4115532338619232,
+      "learning_rate": 0.0001348453608247423,
+      "loss": 0.6364,
+      "step": 1731
+    },
+    {
+      "epoch": 1.10816,
+      "grad_norm": 0.38356631994247437,
+      "learning_rate": 0.00013480412371134022,
+      "loss": 0.652,
+      "step": 1732
+    },
+    {
+      "epoch": 1.1088,
+      "grad_norm": 0.3938641846179962,
+      "learning_rate": 0.00013476288659793815,
+      "loss": 0.6295,
+      "step": 1733
+    },
+    {
+      "epoch": 1.10944,
+      "grad_norm": 0.3447168469429016,
+      "learning_rate": 0.00013472164948453609,
+      "loss": 0.5983,
+      "step": 1734
+    },
+    {
+      "epoch": 1.11008,
+      "grad_norm": 0.37293338775634766,
+      "learning_rate": 0.00013468041237113402,
+      "loss": 0.6428,
+      "step": 1735
+    },
+    {
+      "epoch": 1.11072,
+      "grad_norm": 0.38352444767951965,
+      "learning_rate": 0.00013463917525773197,
+      "loss": 0.5998,
+      "step": 1736
+    },
+    {
+      "epoch": 1.11136,
+      "grad_norm": 0.3988989293575287,
+      "learning_rate": 0.0001345979381443299,
+      "loss": 0.5353,
+      "step": 1737
+    },
+    {
+      "epoch": 1.112,
+      "grad_norm": 0.37284255027770996,
+      "learning_rate": 0.00013455670103092784,
+      "loss": 0.5591,
+      "step": 1738
+    },
+    {
+      "epoch": 1.11264,
+      "grad_norm": 0.3382832109928131,
+      "learning_rate": 0.0001345154639175258,
+      "loss": 0.5718,
+      "step": 1739
+    },
+    {
+      "epoch": 1.11328,
+      "grad_norm": 0.40069517493247986,
+      "learning_rate": 0.00013447422680412373,
+      "loss": 0.5905,
+      "step": 1740
+    },
+    {
+      "epoch": 1.11392,
+      "grad_norm": 0.41355252265930176,
+      "learning_rate": 0.00013443298969072166,
+      "loss": 0.5448,
+      "step": 1741
+    },
+    {
+      "epoch": 1.11456,
+      "grad_norm": 0.42176541686058044,
+      "learning_rate": 0.0001343917525773196,
+      "loss": 0.622,
+      "step": 1742
+    },
+    {
+      "epoch": 1.1152,
+      "grad_norm": 0.383028119802475,
+      "learning_rate": 0.00013435051546391752,
+      "loss": 0.6191,
+      "step": 1743
+    },
+    {
+      "epoch": 1.11584,
+      "grad_norm": 0.4090883433818817,
+      "learning_rate": 0.00013430927835051548,
+      "loss": 0.5833,
+      "step": 1744
+    },
+    {
+      "epoch": 1.11648,
+      "grad_norm": 0.392283171415329,
+      "learning_rate": 0.0001342680412371134,
+      "loss": 0.6282,
+      "step": 1745
+    },
+    {
+      "epoch": 1.11712,
+      "grad_norm": 0.38783347606658936,
+      "learning_rate": 0.00013422680412371134,
+      "loss": 0.6605,
+      "step": 1746
+    },
+    {
+      "epoch": 1.11776,
+      "grad_norm": 0.4046933054924011,
+      "learning_rate": 0.0001341855670103093,
+      "loss": 0.6888,
+      "step": 1747
+    },
+    {
+      "epoch": 1.1184,
+      "grad_norm": 0.393470823764801,
+      "learning_rate": 0.0001341443298969072,
+      "loss": 0.6713,
+      "step": 1748
+    },
+    {
+      "epoch": 1.11904,
+      "grad_norm": 0.37555503845214844,
+      "learning_rate": 0.00013410309278350516,
+      "loss": 0.5369,
+      "step": 1749
+    },
+    {
+      "epoch": 1.11968,
+      "grad_norm": 0.3503190279006958,
+      "learning_rate": 0.0001340618556701031,
+      "loss": 0.5974,
+      "step": 1750
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 5000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 250,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9555587900773253e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}