diff --git "a/adapter/checkpoint-1220/trainer_state.json" "b/adapter/checkpoint-1220/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/adapter/checkpoint-1220/trainer_state.json"
@@ -0,0 +1,8561 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9975440032746623,
+  "eval_steps": 500,
+  "global_step": 1220,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.3300960063934326,
+      "learning_rate": 2.9999999999999997e-05,
+      "loss": 0.9966,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.4113194942474365,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 1.1253,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.2486647665500641,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 1.0721,
+      "step": 3
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.2249160259962082,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 0.9033,
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.3706735074520111,
+      "learning_rate": 0.00015,
+      "loss": 1.0498,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.28104931116104126,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.9108,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.27497801184654236,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.9038,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.30283215641975403,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.8605,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.33457252383232117,
+      "learning_rate": 0.00027,
+      "loss": 0.9049,
+      "step": 9
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.37725692987442017,
+      "learning_rate": 0.0003,
+      "loss": 0.772,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.27986466884613037,
+      "learning_rate": 0.00029975206611570246,
+      "loss": 0.7666,
+      "step": 11
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.30687034130096436,
+      "learning_rate": 0.00029950413223140494,
+      "loss": 0.8312,
+      "step": 12
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.3321741819381714,
+      "learning_rate": 0.0002992561983471074,
+      "loss": 0.8308,
+      "step": 13
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.29080134630203247,
+      "learning_rate": 0.0002990082644628099,
+      "loss": 0.7597,
+      "step": 14
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.33823856711387634,
+      "learning_rate": 0.0002987603305785124,
+      "loss": 0.8693,
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.3461182117462158,
+      "learning_rate": 0.0002985123966942149,
+      "loss": 1.0571,
+      "step": 16
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.22306275367736816,
+      "learning_rate": 0.0002982644628099173,
+      "loss": 0.7706,
+      "step": 17
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 154.4940643310547,
+      "learning_rate": 0.0002980165289256198,
+      "loss": 2.6519,
+      "step": 18
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.22956405580043793,
+      "learning_rate": 0.00029776859504132227,
+      "loss": 0.6897,
+      "step": 19
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.25711989402770996,
+      "learning_rate": 0.00029752066115702476,
+      "loss": 0.7338,
+      "step": 20
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.2565441131591797,
+      "learning_rate": 0.00029727272727272724,
+      "loss": 0.8211,
+      "step": 21
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.2437434047460556,
+      "learning_rate": 0.0002970247933884297,
+      "loss": 0.8027,
+      "step": 22
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.21284469962120056,
+      "learning_rate": 0.0002967768595041322,
+      "loss": 0.7944,
+      "step": 23
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.23338356614112854,
+      "learning_rate": 0.0002965289256198347,
+      "loss": 0.7696,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.25512659549713135,
+      "learning_rate": 0.0002962809917355372,
+      "loss": 0.7693,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.19500921666622162,
+      "learning_rate": 0.0002960330578512396,
+      "loss": 0.7599,
+      "step": 26
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.2554054260253906,
+      "learning_rate": 0.00029578512396694214,
+      "loss": 0.966,
+      "step": 27
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.17682747542858124,
+      "learning_rate": 0.0002955371900826446,
+      "loss": 0.676,
+      "step": 28
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.20516635477542877,
+      "learning_rate": 0.0002952892561983471,
+      "loss": 0.8144,
+      "step": 29
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.3275119662284851,
+      "learning_rate": 0.0002950413223140496,
+      "loss": 0.7704,
+      "step": 30
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.22231778502464294,
+      "learning_rate": 0.000294793388429752,
+      "loss": 0.7614,
+      "step": 31
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.17065812647342682,
+      "learning_rate": 0.0002945454545454545,
+      "loss": 0.5634,
+      "step": 32
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1771956831216812,
+      "learning_rate": 0.000294297520661157,
+      "loss": 0.7607,
+      "step": 33
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.26693442463874817,
+      "learning_rate": 0.00029404958677685947,
+      "loss": 0.8171,
+      "step": 34
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.409070611000061,
+      "learning_rate": 0.00029380165289256196,
+      "loss": 0.7791,
+      "step": 35
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.20727217197418213,
+      "learning_rate": 0.00029355371900826444,
+      "loss": 0.7357,
+      "step": 36
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2145707905292511,
+      "learning_rate": 0.0002933057851239669,
+      "loss": 0.8458,
+      "step": 37
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2068527340888977,
+      "learning_rate": 0.0002930578512396694,
+      "loss": 0.78,
+      "step": 38
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.22432388365268707,
+      "learning_rate": 0.00029280991735537184,
+      "loss": 0.8523,
+      "step": 39
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.19982610642910004,
+      "learning_rate": 0.0002925619834710743,
+      "loss": 0.7372,
+      "step": 40
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 6.248472213745117,
+      "learning_rate": 0.00029231404958677686,
+      "loss": 0.7399,
+      "step": 41
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.2269737422466278,
+      "learning_rate": 0.00029206611570247934,
+      "loss": 0.7842,
+      "step": 42
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.23117898404598236,
+      "learning_rate": 0.0002918181818181818,
+      "loss": 0.7111,
+      "step": 43
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.22466522455215454,
+      "learning_rate": 0.00029157024793388425,
+      "loss": 0.8979,
+      "step": 44
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.20770332217216492,
+      "learning_rate": 0.00029132231404958674,
+      "loss": 0.774,
+      "step": 45
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2376495748758316,
+      "learning_rate": 0.0002910743801652892,
+      "loss": 0.7216,
+      "step": 46
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2470778226852417,
+      "learning_rate": 0.0002908264462809917,
+      "loss": 0.7369,
+      "step": 47
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.23465900123119354,
+      "learning_rate": 0.0002905785123966942,
+      "loss": 0.7528,
+      "step": 48
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5718627572059631,
+      "learning_rate": 0.00029033057851239667,
+      "loss": 0.7535,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.21493370831012726,
+      "learning_rate": 0.00029008264462809916,
+      "loss": 0.8593,
+      "step": 50
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.21197210252285004,
+      "learning_rate": 0.00028983471074380164,
+      "loss": 0.8013,
+      "step": 51
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.20836398005485535,
+      "learning_rate": 0.0002895867768595041,
+      "loss": 0.7905,
+      "step": 52
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.2096678912639618,
+      "learning_rate": 0.00028933884297520655,
+      "loss": 0.6754,
+      "step": 53
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.25898435711860657,
+      "learning_rate": 0.00028909090909090904,
+      "loss": 0.7725,
+      "step": 54
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.23370735347270966,
+      "learning_rate": 0.0002888429752066116,
+      "loss": 0.7007,
+      "step": 55
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.23006942868232727,
+      "learning_rate": 0.00028859504132231406,
+      "loss": 0.7534,
+      "step": 56
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.20855402946472168,
+      "learning_rate": 0.0002883471074380165,
+      "loss": 0.9491,
+      "step": 57
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.24340493977069855,
+      "learning_rate": 0.00028809917355371897,
+      "loss": 0.8089,
+      "step": 58
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.20169466733932495,
+      "learning_rate": 0.00028785123966942145,
+      "loss": 0.64,
+      "step": 59
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.23272906243801117,
+      "learning_rate": 0.00028760330578512394,
+      "loss": 0.8456,
+      "step": 60
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1767100691795349,
+      "learning_rate": 0.0002873553719008264,
+      "loss": 0.6686,
+      "step": 61
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.24511106312274933,
+      "learning_rate": 0.0002871074380165289,
+      "loss": 0.6998,
+      "step": 62
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.22284479439258575,
+      "learning_rate": 0.0002868595041322314,
+      "loss": 0.6699,
+      "step": 63
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.21842750906944275,
+      "learning_rate": 0.00028661157024793387,
+      "loss": 0.7413,
+      "step": 64
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.2669163644313812,
+      "learning_rate": 0.00028636363636363636,
+      "loss": 0.931,
+      "step": 65
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1864808052778244,
+      "learning_rate": 0.0002861157024793388,
+      "loss": 0.5652,
+      "step": 66
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.18369853496551514,
+      "learning_rate": 0.00028586776859504127,
+      "loss": 0.6847,
+      "step": 67
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.22353056073188782,
+      "learning_rate": 0.00028561983471074375,
+      "loss": 0.598,
+      "step": 68
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.20269523561000824,
+      "learning_rate": 0.0002853719008264463,
+      "loss": 0.8688,
+      "step": 69
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.2291198968887329,
+      "learning_rate": 0.0002851239669421488,
+      "loss": 0.7535,
+      "step": 70
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.22033120691776276,
+      "learning_rate": 0.0002848760330578512,
+      "loss": 0.8377,
+      "step": 71
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.2687983214855194,
+      "learning_rate": 0.0002846280991735537,
+      "loss": 0.6926,
+      "step": 72
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1933681070804596,
+      "learning_rate": 0.00028438016528925617,
+      "loss": 0.6276,
+      "step": 73
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.2820705473423004,
+      "learning_rate": 0.00028413223140495865,
+      "loss": 0.848,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.19532324373722076,
+      "learning_rate": 0.00028388429752066114,
+      "loss": 0.6198,
+      "step": 75
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.25057846307754517,
+      "learning_rate": 0.0002836363636363636,
+      "loss": 0.6838,
+      "step": 76
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2168462574481964,
+      "learning_rate": 0.0002833884297520661,
+      "loss": 0.7885,
+      "step": 77
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2106674313545227,
+      "learning_rate": 0.0002831404958677686,
+      "loss": 0.6757,
+      "step": 78
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.24460363388061523,
+      "learning_rate": 0.000282892561983471,
+      "loss": 0.7414,
+      "step": 79
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.3706071078777313,
+      "learning_rate": 0.0002826446280991735,
+      "loss": 0.621,
+      "step": 80
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2251998782157898,
+      "learning_rate": 0.000282396694214876,
+      "loss": 0.7453,
+      "step": 81
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.24521738290786743,
+      "learning_rate": 0.00028214876033057847,
+      "loss": 0.6985,
+      "step": 82
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2262742966413498,
+      "learning_rate": 0.000281900826446281,
+      "loss": 0.6316,
+      "step": 83
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.19723354279994965,
+      "learning_rate": 0.00028165289256198344,
+      "loss": 0.4798,
+      "step": 84
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.20684833824634552,
+      "learning_rate": 0.0002814049586776859,
+      "loss": 0.7993,
+      "step": 85
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.19534814357757568,
+      "learning_rate": 0.0002811570247933884,
+      "loss": 0.7735,
+      "step": 86
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2585545480251312,
+      "learning_rate": 0.0002809090909090909,
+      "loss": 0.8126,
+      "step": 87
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2510583996772766,
+      "learning_rate": 0.00028066115702479337,
+      "loss": 0.6973,
+      "step": 88
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1884051263332367,
+      "learning_rate": 0.00028041322314049585,
+      "loss": 0.701,
+      "step": 89
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.2526257038116455,
+      "learning_rate": 0.00028016528925619834,
+      "loss": 0.7132,
+      "step": 90
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.200734481215477,
+      "learning_rate": 0.0002799173553719008,
+      "loss": 0.7024,
+      "step": 91
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.2404022514820099,
+      "learning_rate": 0.0002796694214876033,
+      "loss": 0.704,
+      "step": 92
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.23063871264457703,
+      "learning_rate": 0.00027942148760330573,
+      "loss": 0.6312,
+      "step": 93
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1759747564792633,
+      "learning_rate": 0.0002791735537190082,
+      "loss": 0.6577,
+      "step": 94
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2009582370519638,
+      "learning_rate": 0.0002789256198347107,
+      "loss": 0.8036,
+      "step": 95
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2200164943933487,
+      "learning_rate": 0.0002786776859504132,
+      "loss": 0.7101,
+      "step": 96
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.19693537056446075,
+      "learning_rate": 0.00027842975206611567,
+      "loss": 0.6221,
+      "step": 97
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.23269779980182648,
+      "learning_rate": 0.00027818181818181815,
+      "loss": 0.8264,
+      "step": 98
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2440226823091507,
+      "learning_rate": 0.00027793388429752064,
+      "loss": 0.8051,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2307034134864807,
+      "learning_rate": 0.0002776859504132231,
+      "loss": 0.631,
+      "step": 100
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2530567944049835,
+      "learning_rate": 0.0002774380165289256,
+      "loss": 0.8616,
+      "step": 101
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2808806300163269,
+      "learning_rate": 0.0002771900826446281,
+      "loss": 0.8333,
+      "step": 102
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.20667941868305206,
+      "learning_rate": 0.00027694214876033057,
+      "loss": 0.7212,
+      "step": 103
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.17540781199932098,
+      "learning_rate": 0.00027669421487603305,
+      "loss": 0.5964,
+      "step": 104
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2526637613773346,
+      "learning_rate": 0.00027644628099173554,
+      "loss": 0.6868,
+      "step": 105
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2137339860200882,
+      "learning_rate": 0.00027619834710743797,
+      "loss": 0.6155,
+      "step": 106
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.21061092615127563,
+      "learning_rate": 0.00027595041322314045,
+      "loss": 0.813,
+      "step": 107
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.21619191765785217,
+      "learning_rate": 0.00027570247933884293,
+      "loss": 0.8046,
+      "step": 108
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.2212170660495758,
+      "learning_rate": 0.0002754545454545454,
+      "loss": 0.6706,
+      "step": 109
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.23427413403987885,
+      "learning_rate": 0.0002752066115702479,
+      "loss": 0.7152,
+      "step": 110
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.20566123723983765,
+      "learning_rate": 0.0002749586776859504,
+      "loss": 0.6568,
+      "step": 111
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.22977930307388306,
+      "learning_rate": 0.00027471074380165287,
+      "loss": 0.7832,
+      "step": 112
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.28307485580444336,
+      "learning_rate": 0.00027446280991735535,
+      "loss": 0.7446,
+      "step": 113
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.19567596912384033,
+      "learning_rate": 0.00027421487603305784,
+      "loss": 0.6394,
+      "step": 114
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.24577689170837402,
+      "learning_rate": 0.0002739669421487603,
+      "loss": 0.6389,
+      "step": 115
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.2180463820695877,
+      "learning_rate": 0.0002737190082644628,
+      "loss": 0.7814,
+      "step": 116
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.19546380639076233,
+      "learning_rate": 0.0002734710743801653,
+      "loss": 0.8312,
+      "step": 117
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.22698360681533813,
+      "learning_rate": 0.00027322314049586777,
+      "loss": 0.7443,
+      "step": 118
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.22987066209316254,
+      "learning_rate": 0.0002729752066115702,
+      "loss": 0.7839,
+      "step": 119
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.20548178255558014,
+      "learning_rate": 0.0002727272727272727,
+      "loss": 0.7805,
+      "step": 120
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2477702796459198,
+      "learning_rate": 0.00027247933884297517,
+      "loss": 0.5694,
+      "step": 121
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.20593340694904327,
+      "learning_rate": 0.00027223140495867765,
+      "loss": 0.6479,
+      "step": 122
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.23635917901992798,
+      "learning_rate": 0.00027198347107438013,
+      "loss": 0.8107,
+      "step": 123
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.25808119773864746,
+      "learning_rate": 0.0002717355371900826,
+      "loss": 0.7876,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.22156469523906708,
+      "learning_rate": 0.0002714876033057851,
+      "loss": 0.7261,
+      "step": 125
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.19892215728759766,
+      "learning_rate": 0.0002712396694214876,
+      "loss": 0.6874,
+      "step": 126
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.24936752021312714,
+      "learning_rate": 0.00027099173553719007,
+      "loss": 0.6155,
+      "step": 127
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.23287539184093475,
+      "learning_rate": 0.0002707438016528925,
+      "loss": 0.602,
+      "step": 128
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.2086639404296875,
+      "learning_rate": 0.00027049586776859504,
+      "loss": 0.7198,
+      "step": 129
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.24974922835826874,
+      "learning_rate": 0.0002702479338842975,
+      "loss": 0.6873,
+      "step": 130
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.2066827118396759,
+      "learning_rate": 0.00027,
+      "loss": 0.5821,
+      "step": 131
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.28004395961761475,
+      "learning_rate": 0.0002697520661157025,
+      "loss": 0.7864,
+      "step": 132
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.22391608357429504,
+      "learning_rate": 0.0002695041322314049,
+      "loss": 0.6773,
+      "step": 133
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.2821199297904968,
+      "learning_rate": 0.0002692561983471074,
+      "loss": 0.6806,
+      "step": 134
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.21736428141593933,
+      "learning_rate": 0.0002690082644628099,
+      "loss": 0.6662,
+      "step": 135
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.23889939486980438,
+      "learning_rate": 0.00026876033057851237,
+      "loss": 0.6356,
+      "step": 136
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.21096719801425934,
+      "learning_rate": 0.00026851239669421485,
+      "loss": 0.6762,
+      "step": 137
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.22622421383857727,
+      "learning_rate": 0.00026826446280991733,
+      "loss": 0.8085,
+      "step": 138
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.19824957847595215,
+      "learning_rate": 0.0002680165289256198,
+      "loss": 0.6031,
+      "step": 139
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.24482691287994385,
+      "learning_rate": 0.0002677685950413223,
+      "loss": 0.6649,
+      "step": 140
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.21291929483413696,
+      "learning_rate": 0.0002675206611570248,
+      "loss": 0.6671,
+      "step": 141
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.2202674299478531,
+      "learning_rate": 0.0002672727272727272,
+      "loss": 0.6469,
+      "step": 142
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.23572632670402527,
+      "learning_rate": 0.0002670247933884297,
+      "loss": 0.7377,
+      "step": 143
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2051907777786255,
+      "learning_rate": 0.00026677685950413224,
+      "loss": 0.6217,
+      "step": 144
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.23270072042942047,
+      "learning_rate": 0.0002665289256198347,
+      "loss": 0.7933,
+      "step": 145
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20652809739112854,
+      "learning_rate": 0.00026628099173553715,
+      "loss": 0.6007,
+      "step": 146
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.23084674775600433,
+      "learning_rate": 0.00026603305785123963,
+      "loss": 0.701,
+      "step": 147
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.25663891434669495,
+      "learning_rate": 0.0002657851239669421,
+      "loss": 0.7271,
+      "step": 148
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.25880497694015503,
+      "learning_rate": 0.0002655371900826446,
+      "loss": 0.6562,
+      "step": 149
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.19349205493927002,
+      "learning_rate": 0.0002652892561983471,
+      "loss": 0.5016,
+      "step": 150
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.2401740401983261,
+      "learning_rate": 0.00026504132231404957,
+      "loss": 0.6978,
+      "step": 151
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.19495394825935364,
+      "learning_rate": 0.00026479338842975205,
+      "loss": 0.5562,
+      "step": 152
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.21485286951065063,
+      "learning_rate": 0.00026454545454545453,
+      "loss": 0.7847,
+      "step": 153
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.241348534822464,
+      "learning_rate": 0.000264297520661157,
+      "loss": 0.7513,
+      "step": 154
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.3316986858844757,
+      "learning_rate": 0.00026404958677685945,
+      "loss": 0.664,
+      "step": 155
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2419958859682083,
+      "learning_rate": 0.00026380165289256193,
+      "loss": 0.7322,
+      "step": 156
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2868640124797821,
+      "learning_rate": 0.0002635537190082644,
+      "loss": 0.7004,
+      "step": 157
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.24806949496269226,
+      "learning_rate": 0.00026330578512396695,
+      "loss": 0.6497,
+      "step": 158
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.23873400688171387,
+      "learning_rate": 0.00026305785123966944,
+      "loss": 0.7543,
+      "step": 159
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2480355203151703,
+      "learning_rate": 0.00026280991735537187,
+      "loss": 0.6048,
+      "step": 160
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2619112730026245,
+      "learning_rate": 0.00026256198347107435,
+      "loss": 0.762,
+      "step": 161
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.22763262689113617,
+      "learning_rate": 0.00026231404958677683,
+      "loss": 0.6557,
+      "step": 162
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.3291528522968292,
+      "learning_rate": 0.0002620661157024793,
+      "loss": 0.7059,
+      "step": 163
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.2959338426589966,
+      "learning_rate": 0.0002618181818181818,
+      "loss": 0.6622,
+      "step": 164
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.23001112043857574,
+      "learning_rate": 0.0002615702479338843,
+      "loss": 0.6465,
+      "step": 165
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.1998877376317978,
+      "learning_rate": 0.00026132231404958677,
+      "loss": 0.666,
+      "step": 166
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.23009613156318665,
+      "learning_rate": 0.00026107438016528925,
+      "loss": 0.8793,
+      "step": 167
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.24525685608386993,
+      "learning_rate": 0.0002608264462809917,
+      "loss": 0.8009,
+      "step": 168
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.21605077385902405,
+      "learning_rate": 0.00026057851239669416,
+      "loss": 0.5459,
+      "step": 169
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.2576725482940674,
+      "learning_rate": 0.00026033057851239665,
+      "loss": 0.6818,
+      "step": 170
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.23385170102119446,
+      "learning_rate": 0.00026008264462809913,
+      "loss": 0.7559,
+      "step": 171
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1973017454147339,
+      "learning_rate": 0.00025983471074380167,
+      "loss": 0.6798,
+      "step": 172
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.22262559831142426,
+      "learning_rate": 0.0002595867768595041,
+      "loss": 0.5566,
+      "step": 173
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.23010462522506714,
+      "learning_rate": 0.0002593388429752066,
+      "loss": 0.7101,
+      "step": 174
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.21676452457904816,
+      "learning_rate": 0.00025909090909090907,
+      "loss": 0.7038,
+      "step": 175
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.22475261986255646,
+      "learning_rate": 0.00025884297520661155,
+      "loss": 0.7812,
+      "step": 176
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.28893202543258667,
+      "learning_rate": 0.00025859504132231403,
+      "loss": 0.5925,
+      "step": 177
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.22777552902698517,
+      "learning_rate": 0.0002583471074380165,
+      "loss": 0.7319,
+      "step": 178
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.2287953644990921,
+      "learning_rate": 0.000258099173553719,
+      "loss": 0.7775,
+      "step": 179
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.2049843668937683,
+      "learning_rate": 0.0002578512396694215,
+      "loss": 0.7448,
+      "step": 180
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.22585280239582062,
+      "learning_rate": 0.00025760330578512397,
+      "loss": 0.59,
+      "step": 181
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.23159150779247284,
+      "learning_rate": 0.0002573553719008264,
+      "loss": 0.737,
+      "step": 182
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.3393082320690155,
+      "learning_rate": 0.0002571074380165289,
+      "loss": 0.6948,
+      "step": 183
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2345617413520813,
+      "learning_rate": 0.00025685950413223136,
+      "loss": 0.6351,
+      "step": 184
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.23474591970443726,
+      "learning_rate": 0.00025661157024793385,
+      "loss": 0.6643,
+      "step": 185
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2473030984401703,
+      "learning_rate": 0.00025636363636363633,
+      "loss": 0.7663,
+      "step": 186
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2971685230731964,
+      "learning_rate": 0.0002561157024793388,
+      "loss": 0.7449,
+      "step": 187
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2745087742805481,
+      "learning_rate": 0.0002558677685950413,
+      "loss": 0.6125,
+      "step": 188
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.23520545661449432,
+      "learning_rate": 0.0002556198347107438,
+      "loss": 0.573,
+      "step": 189
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2955464720726013,
+      "learning_rate": 0.00025537190082644627,
+      "loss": 0.5315,
+      "step": 190
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.23987281322479248,
+      "learning_rate": 0.00025512396694214875,
+      "loss": 0.5636,
+      "step": 191
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.24263744056224823,
+      "learning_rate": 0.00025487603305785123,
+      "loss": 0.6047,
+      "step": 192
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.26061922311782837,
+      "learning_rate": 0.0002546280991735537,
+      "loss": 0.7812,
+      "step": 193
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2458687126636505,
+      "learning_rate": 0.0002543801652892562,
+      "loss": 0.58,
+      "step": 194
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.24598994851112366,
+      "learning_rate": 0.00025413223140495863,
+      "loss": 0.7432,
+      "step": 195
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.248992919921875,
+      "learning_rate": 0.0002538842975206611,
+      "loss": 0.6953,
+      "step": 196
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2518531382083893,
+      "learning_rate": 0.0002536363636363636,
+      "loss": 0.6707,
+      "step": 197
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.23844210803508759,
+      "learning_rate": 0.0002533884297520661,
+      "loss": 0.6285,
+      "step": 198
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.21948237717151642,
+      "learning_rate": 0.00025314049586776856,
+      "loss": 0.6859,
+      "step": 199
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.2003835141658783,
+      "learning_rate": 0.00025289256198347105,
+      "loss": 0.6305,
+      "step": 200
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.23421582579612732,
+      "learning_rate": 0.00025264462809917353,
+      "loss": 0.7164,
+      "step": 201
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.22344104945659637,
+      "learning_rate": 0.000252396694214876,
+      "loss": 0.6498,
+      "step": 202
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.17792212963104248,
+      "learning_rate": 0.0002521487603305785,
+      "loss": 0.614,
+      "step": 203
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.217886820435524,
+      "learning_rate": 0.000251900826446281,
+      "loss": 0.7033,
+      "step": 204
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.209726020693779,
+      "learning_rate": 0.00025165289256198347,
+      "loss": 0.5913,
+      "step": 205
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2401910424232483,
+      "learning_rate": 0.00025140495867768595,
+      "loss": 0.6405,
+      "step": 206
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.21315626800060272,
+      "learning_rate": 0.00025115702479338843,
+      "loss": 0.7369,
+      "step": 207
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20102320611476898,
+      "learning_rate": 0.00025090909090909086,
+      "loss": 0.6245,
+      "step": 208
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20447981357574463,
+      "learning_rate": 0.00025066115702479335,
+      "loss": 0.5423,
+      "step": 209
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.24979281425476074,
+      "learning_rate": 0.00025041322314049583,
+      "loss": 0.8078,
+      "step": 210
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.20141547918319702,
+      "learning_rate": 0.0002501652892561983,
+      "loss": 0.7386,
+      "step": 211
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2538990378379822,
+      "learning_rate": 0.0002499173553719008,
+      "loss": 0.7219,
+      "step": 212
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2613961100578308,
+      "learning_rate": 0.0002496694214876033,
+      "loss": 0.7903,
+      "step": 213
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.24777857959270477,
+      "learning_rate": 0.00024942148760330576,
+      "loss": 0.664,
+      "step": 214
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.21958425641059875,
+      "learning_rate": 0.00024917355371900825,
+      "loss": 0.6755,
+      "step": 215
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2218528538942337,
+      "learning_rate": 0.00024892561983471073,
+      "loss": 0.5568,
+      "step": 216
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.23632755875587463,
+      "learning_rate": 0.00024867768595041316,
+      "loss": 0.6858,
+      "step": 217
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2641279697418213,
+      "learning_rate": 0.0002484297520661157,
+      "loss": 0.7783,
+      "step": 218
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3147680163383484,
+      "learning_rate": 0.0002481818181818182,
+      "loss": 0.662,
+      "step": 219
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.27947697043418884,
+      "learning_rate": 0.00024793388429752067,
+      "loss": 0.6477,
+      "step": 220
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2297278195619583,
+      "learning_rate": 0.00024768595041322315,
+      "loss": 0.5895,
+      "step": 221
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.23085851967334747,
+      "learning_rate": 0.0002474380165289256,
+      "loss": 0.5806,
+      "step": 222
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.19654251635074615,
+      "learning_rate": 0.00024719008264462806,
+      "loss": 0.5942,
+      "step": 223
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.2467166632413864,
+      "learning_rate": 0.00024694214876033055,
+      "loss": 0.5059,
+      "step": 224
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.22614917159080505,
+      "learning_rate": 0.00024669421487603303,
+      "loss": 0.643,
+      "step": 225
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.2622920274734497,
+      "learning_rate": 0.0002464462809917355,
+      "loss": 0.6257,
+      "step": 226
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.21843163669109344,
+      "learning_rate": 0.000246198347107438,
+      "loss": 0.6057,
+      "step": 227
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.2294640988111496,
+      "learning_rate": 0.0002459504132231405,
+      "loss": 0.6876,
+      "step": 228
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.1791463941335678,
+      "learning_rate": 0.00024570247933884296,
+      "loss": 0.5348,
+      "step": 229
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.17243699729442596,
+      "learning_rate": 0.00024545454545454545,
+      "loss": 0.5966,
+      "step": 230
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.22769273817539215,
+      "learning_rate": 0.0002452066115702479,
+      "loss": 0.7912,
+      "step": 231
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.2325255423784256,
+      "learning_rate": 0.0002449586776859504,
+      "loss": 0.7441,
+      "step": 232
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.24277740716934204,
+      "learning_rate": 0.0002447107438016529,
+      "loss": 0.6653,
+      "step": 233
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.21596141159534454,
+      "learning_rate": 0.0002444628099173554,
+      "loss": 0.6668,
+      "step": 234
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.20814135670661926,
+      "learning_rate": 0.0002442148760330578,
+      "loss": 0.6306,
+      "step": 235
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.25570017099380493,
+      "learning_rate": 0.0002439669421487603,
+      "loss": 0.6524,
+      "step": 236
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.2502390146255493,
+      "learning_rate": 0.00024371900826446278,
+      "loss": 0.6048,
+      "step": 237
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.23688243329524994,
+      "learning_rate": 0.0002434710743801653,
+      "loss": 0.568,
+      "step": 238
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.21041709184646606,
+      "learning_rate": 0.00024322314049586777,
+      "loss": 0.6908,
+      "step": 239
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.21656759083271027,
+      "learning_rate": 0.00024297520661157023,
+      "loss": 0.4993,
+      "step": 240
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.25133028626441956,
+      "learning_rate": 0.0002427272727272727,
+      "loss": 0.718,
+      "step": 241
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.22228790819644928,
+      "learning_rate": 0.0002424793388429752,
+      "loss": 0.6146,
+      "step": 242
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.26273205876350403,
+      "learning_rate": 0.00024223140495867768,
+      "loss": 0.7459,
+      "step": 243
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2156606763601303,
+      "learning_rate": 0.00024198347107438014,
+      "loss": 0.6692,
+      "step": 244
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2075020670890808,
+      "learning_rate": 0.00024173553719008262,
+      "loss": 0.6427,
+      "step": 245
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.25821176171302795,
+      "learning_rate": 0.0002414876033057851,
+      "loss": 0.7964,
+      "step": 246
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.23016126453876495,
+      "learning_rate": 0.0002412396694214876,
+      "loss": 0.536,
+      "step": 247
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.23115016520023346,
+      "learning_rate": 0.00024099173553719004,
+      "loss": 0.6053,
+      "step": 248
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.18249157071113586,
+      "learning_rate": 0.00024074380165289253,
+      "loss": 0.6574,
+      "step": 249
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.28391778469085693,
+      "learning_rate": 0.000240495867768595,
+      "loss": 0.7152,
+      "step": 250
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2581539452075958,
+      "learning_rate": 0.0002402479338842975,
+      "loss": 0.8476,
+      "step": 251
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2304867058992386,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.5781,
+      "step": 252
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.239717036485672,
+      "learning_rate": 0.00023975206611570244,
+      "loss": 0.6543,
+      "step": 253
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.22493794560432434,
+      "learning_rate": 0.00023950413223140495,
+      "loss": 0.7048,
+      "step": 254
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.22085991501808167,
+      "learning_rate": 0.00023925619834710743,
+      "loss": 0.5572,
+      "step": 255
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.35917988419532776,
+      "learning_rate": 0.0002390082644628099,
+      "loss": 0.8485,
+      "step": 256
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.28269943594932556,
+      "learning_rate": 0.00023876033057851237,
+      "loss": 0.5732,
+      "step": 257
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.26313093304634094,
+      "learning_rate": 0.00023851239669421485,
+      "loss": 0.8212,
+      "step": 258
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.30286532640457153,
+      "learning_rate": 0.00023826446280991734,
+      "loss": 0.5878,
+      "step": 259
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.22270837426185608,
+      "learning_rate": 0.00023801652892561982,
+      "loss": 0.6933,
+      "step": 260
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.29011014103889465,
+      "learning_rate": 0.0002377685950413223,
+      "loss": 0.6188,
+      "step": 261
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.2390982061624527,
+      "learning_rate": 0.00023752066115702476,
+      "loss": 0.6426,
+      "step": 262
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.3416346609592438,
+      "learning_rate": 0.00023727272727272724,
+      "loss": 0.8845,
+      "step": 263
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.25051388144493103,
+      "learning_rate": 0.00023702479338842973,
+      "loss": 0.7286,
+      "step": 264
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.2497546523809433,
+      "learning_rate": 0.0002367768595041322,
+      "loss": 0.6027,
+      "step": 265
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.23835037648677826,
+      "learning_rate": 0.00023652892561983467,
+      "loss": 0.7052,
+      "step": 266
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22467398643493652,
+      "learning_rate": 0.00023628099173553715,
+      "loss": 0.5806,
+      "step": 267
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2663390338420868,
+      "learning_rate": 0.00023603305785123964,
+      "loss": 0.6943,
+      "step": 268
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22997191548347473,
+      "learning_rate": 0.00023578512396694215,
+      "loss": 0.6411,
+      "step": 269
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.23266558349132538,
+      "learning_rate": 0.00023553719008264463,
+      "loss": 0.6068,
+      "step": 270
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2304474264383316,
+      "learning_rate": 0.00023528925619834709,
+      "loss": 0.6427,
+      "step": 271
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.28231826424598694,
+      "learning_rate": 0.00023504132231404957,
+      "loss": 0.8011,
+      "step": 272
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.28013259172439575,
+      "learning_rate": 0.00023479338842975205,
+      "loss": 0.5988,
+      "step": 273
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.22702372074127197,
+      "learning_rate": 0.00023454545454545454,
+      "loss": 0.6737,
+      "step": 274
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.27958643436431885,
+      "learning_rate": 0.000234297520661157,
+      "loss": 0.6621,
+      "step": 275
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.23902451992034912,
+      "learning_rate": 0.00023404958677685948,
+      "loss": 0.6525,
+      "step": 276
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.2778523564338684,
+      "learning_rate": 0.00023380165289256196,
+      "loss": 0.6697,
+      "step": 277
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2382276952266693,
+      "learning_rate": 0.00023355371900826444,
+      "loss": 0.6281,
+      "step": 278
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.24487091600894928,
+      "learning_rate": 0.00023330578512396693,
+      "loss": 0.6842,
+      "step": 279
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2063397765159607,
+      "learning_rate": 0.00023305785123966938,
+      "loss": 0.6554,
+      "step": 280
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.21523278951644897,
+      "learning_rate": 0.00023280991735537187,
+      "loss": 0.632,
+      "step": 281
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2420080006122589,
+      "learning_rate": 0.00023256198347107435,
+      "loss": 0.6001,
+      "step": 282
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2390110194683075,
+      "learning_rate": 0.00023231404958677686,
+      "loss": 0.5648,
+      "step": 283
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.24080687761306763,
+      "learning_rate": 0.0002320661157024793,
+      "loss": 0.86,
+      "step": 284
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.29456445574760437,
+      "learning_rate": 0.0002318181818181818,
+      "loss": 0.7418,
+      "step": 285
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.23326683044433594,
+      "learning_rate": 0.00023157024793388429,
+      "loss": 0.6967,
+      "step": 286
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.20866093039512634,
+      "learning_rate": 0.00023132231404958677,
+      "loss": 0.5205,
+      "step": 287
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.3158474266529083,
+      "learning_rate": 0.00023107438016528925,
+      "loss": 0.7879,
+      "step": 288
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.2730140686035156,
+      "learning_rate": 0.0002308264462809917,
+      "loss": 0.7292,
+      "step": 289
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.25384965538978577,
+      "learning_rate": 0.0002305785123966942,
+      "loss": 0.7258,
+      "step": 290
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.20765069127082825,
+      "learning_rate": 0.00023033057851239668,
+      "loss": 0.7108,
+      "step": 291
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.25662195682525635,
+      "learning_rate": 0.00023008264462809916,
+      "loss": 0.7473,
+      "step": 292
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.300243616104126,
+      "learning_rate": 0.00022983471074380162,
+      "loss": 0.6902,
+      "step": 293
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.23513919115066528,
+      "learning_rate": 0.0002295867768595041,
+      "loss": 0.5888,
+      "step": 294
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2077571451663971,
+      "learning_rate": 0.00022933884297520658,
+      "loss": 0.6256,
+      "step": 295
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.266201376914978,
+      "learning_rate": 0.00022909090909090907,
+      "loss": 0.6913,
+      "step": 296
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.2239614725112915,
+      "learning_rate": 0.00022884297520661152,
+      "loss": 0.7369,
+      "step": 297
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.21509824693202972,
+      "learning_rate": 0.000228595041322314,
+      "loss": 0.4445,
+      "step": 298
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.21956239640712738,
+      "learning_rate": 0.00022834710743801652,
+      "loss": 0.6732,
+      "step": 299
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.18832357227802277,
+      "learning_rate": 0.000228099173553719,
+      "loss": 0.6808,
+      "step": 300
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.21115505695343018,
+      "learning_rate": 0.0002278512396694215,
+      "loss": 0.5323,
+      "step": 301
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.23715418577194214,
+      "learning_rate": 0.00022760330578512394,
+      "loss": 0.8333,
+      "step": 302
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.29385048151016235,
+      "learning_rate": 0.00022735537190082643,
+      "loss": 0.6,
+      "step": 303
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.26947689056396484,
+      "learning_rate": 0.0002271074380165289,
+      "loss": 0.8788,
+      "step": 304
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2778269946575165,
+      "learning_rate": 0.0002268595041322314,
+      "loss": 0.7073,
+      "step": 305
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.20938479900360107,
+      "learning_rate": 0.00022661157024793385,
+      "loss": 0.6422,
+      "step": 306
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2777106761932373,
+      "learning_rate": 0.00022636363636363633,
+      "loss": 0.7495,
+      "step": 307
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.20872819423675537,
+      "learning_rate": 0.00022611570247933882,
+      "loss": 0.6492,
+      "step": 308
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.2752722501754761,
+      "learning_rate": 0.0002258677685950413,
+      "loss": 0.6014,
+      "step": 309
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24615786969661713,
+      "learning_rate": 0.00022561983471074378,
+      "loss": 0.6287,
+      "step": 310
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24146385490894318,
+      "learning_rate": 0.00022537190082644624,
+      "loss": 0.6151,
+      "step": 311
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24762235581874847,
+      "learning_rate": 0.00022512396694214872,
+      "loss": 0.6377,
+      "step": 312
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24630331993103027,
+      "learning_rate": 0.00022487603305785124,
+      "loss": 0.7255,
+      "step": 313
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.2922554612159729,
+      "learning_rate": 0.00022462809917355372,
+      "loss": 0.6645,
+      "step": 314
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.21686063706874847,
+      "learning_rate": 0.00022438016528925618,
+      "loss": 0.5606,
+      "step": 315
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.2216208428144455,
+      "learning_rate": 0.00022413223140495866,
+      "loss": 0.5126,
+      "step": 316
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.25635436177253723,
+      "learning_rate": 0.00022388429752066114,
+      "loss": 0.7387,
+      "step": 317
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.2786000669002533,
+      "learning_rate": 0.00022363636363636363,
+      "loss": 0.5941,
+      "step": 318
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.26092806458473206,
+      "learning_rate": 0.0002233884297520661,
+      "loss": 0.7851,
+      "step": 319
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.23881889879703522,
+      "learning_rate": 0.00022314049586776857,
+      "loss": 0.598,
+      "step": 320
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.23304526507854462,
+      "learning_rate": 0.00022289256198347105,
+      "loss": 0.7165,
+      "step": 321
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.2340225875377655,
+      "learning_rate": 0.00022264462809917353,
+      "loss": 0.6608,
+      "step": 322
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.31176140904426575,
+      "learning_rate": 0.00022239669421487602,
+      "loss": 0.6711,
+      "step": 323
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.23832640051841736,
+      "learning_rate": 0.00022214876033057847,
+      "loss": 0.732,
+      "step": 324
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.28845977783203125,
+      "learning_rate": 0.00022190082644628096,
+      "loss": 0.7968,
+      "step": 325
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.1978536993265152,
+      "learning_rate": 0.00022165289256198344,
+      "loss": 0.6592,
+      "step": 326
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.26940053701400757,
+      "learning_rate": 0.00022140495867768595,
+      "loss": 0.7953,
+      "step": 327
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.20393389463424683,
+      "learning_rate": 0.00022115702479338844,
+      "loss": 0.4871,
+      "step": 328
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.27152347564697266,
+      "learning_rate": 0.0002209090909090909,
+      "loss": 0.5583,
+      "step": 329
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.2883144021034241,
+      "learning_rate": 0.00022066115702479338,
+      "loss": 0.6156,
+      "step": 330
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.1987351030111313,
+      "learning_rate": 0.00022041322314049586,
+      "loss": 0.5196,
+      "step": 331
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.2651583254337311,
+      "learning_rate": 0.00022016528925619834,
+      "loss": 0.6099,
+      "step": 332
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2574511468410492,
+      "learning_rate": 0.0002199173553719008,
+      "loss": 0.6925,
+      "step": 333
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.27730292081832886,
+      "learning_rate": 0.00021966942148760328,
+      "loss": 0.6752,
+      "step": 334
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2001207172870636,
+      "learning_rate": 0.00021942148760330577,
+      "loss": 0.75,
+      "step": 335
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.24222363531589508,
+      "learning_rate": 0.00021917355371900825,
+      "loss": 0.6364,
+      "step": 336
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.26326724886894226,
+      "learning_rate": 0.0002189256198347107,
+      "loss": 0.673,
+      "step": 337
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2272881418466568,
+      "learning_rate": 0.0002186776859504132,
+      "loss": 0.561,
+      "step": 338
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.24880024790763855,
+      "learning_rate": 0.00021842975206611567,
+      "loss": 0.5552,
+      "step": 339
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2593706548213959,
+      "learning_rate": 0.00021818181818181816,
+      "loss": 0.5417,
+      "step": 340
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19063642621040344,
+      "learning_rate": 0.00021793388429752067,
+      "loss": 0.5694,
+      "step": 341
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2146475464105606,
+      "learning_rate": 0.0002176859504132231,
+      "loss": 0.4314,
+      "step": 342
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.25150927901268005,
+      "learning_rate": 0.0002174380165289256,
+      "loss": 0.631,
+      "step": 343
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2753889858722687,
+      "learning_rate": 0.0002171900826446281,
+      "loss": 0.6859,
+      "step": 344
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.20773079991340637,
+      "learning_rate": 0.00021694214876033058,
+      "loss": 0.7515,
+      "step": 345
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.2547062635421753,
+      "learning_rate": 0.00021669421487603303,
+      "loss": 0.7582,
+      "step": 346
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.24687208235263824,
+      "learning_rate": 0.00021644628099173552,
+      "loss": 0.5865,
+      "step": 347
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.24116279184818268,
+      "learning_rate": 0.000216198347107438,
+      "loss": 0.4841,
+      "step": 348
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.2270282804965973,
+      "learning_rate": 0.00021595041322314048,
+      "loss": 0.5933,
+      "step": 349
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.21436922252178192,
+      "learning_rate": 0.00021570247933884297,
+      "loss": 0.6959,
+      "step": 350
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.25802701711654663,
+      "learning_rate": 0.00021545454545454542,
+      "loss": 0.729,
+      "step": 351
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23808260262012482,
+      "learning_rate": 0.0002152066115702479,
+      "loss": 0.6346,
+      "step": 352
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23161651194095612,
+      "learning_rate": 0.0002149586776859504,
+      "loss": 0.6459,
+      "step": 353
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.2442287802696228,
+      "learning_rate": 0.00021471074380165287,
+      "loss": 0.6803,
+      "step": 354
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.19150683283805847,
+      "learning_rate": 0.00021446280991735533,
+      "loss": 0.4375,
+      "step": 355
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23142127692699432,
+      "learning_rate": 0.00021421487603305781,
+      "loss": 0.5505,
+      "step": 356
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.22447548806667328,
+      "learning_rate": 0.00021396694214876033,
+      "loss": 0.6368,
+      "step": 357
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.25168758630752563,
+      "learning_rate": 0.0002137190082644628,
+      "loss": 0.6322,
+      "step": 358
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.25538235902786255,
+      "learning_rate": 0.0002134710743801653,
+      "loss": 0.5317,
+      "step": 359
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.2565425634384155,
+      "learning_rate": 0.00021322314049586775,
+      "loss": 0.6261,
+      "step": 360
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.25399863719940186,
+      "learning_rate": 0.00021297520661157023,
+      "loss": 0.596,
+      "step": 361
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.27143988013267517,
+      "learning_rate": 0.00021272727272727272,
+      "loss": 0.6691,
+      "step": 362
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.2387736439704895,
+      "learning_rate": 0.0002124793388429752,
+      "loss": 0.5288,
+      "step": 363
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2549780607223511,
+      "learning_rate": 0.00021223140495867766,
+      "loss": 0.7455,
+      "step": 364
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2740858793258667,
+      "learning_rate": 0.00021198347107438014,
+      "loss": 0.4921,
+      "step": 365
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.25273847579956055,
+      "learning_rate": 0.00021173553719008262,
+      "loss": 0.7965,
+      "step": 366
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.25858959555625916,
+      "learning_rate": 0.0002114876033057851,
+      "loss": 0.7303,
+      "step": 367
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2599296271800995,
+      "learning_rate": 0.0002112396694214876,
+      "loss": 0.6342,
+      "step": 368
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.21084599196910858,
+      "learning_rate": 0.00021099173553719005,
+      "loss": 0.633,
+      "step": 369
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.24272632598876953,
+      "learning_rate": 0.00021074380165289253,
+      "loss": 0.6213,
+      "step": 370
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.26323699951171875,
+      "learning_rate": 0.00021049586776859501,
+      "loss": 0.563,
+      "step": 371
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.20646587014198303,
+      "learning_rate": 0.00021024793388429753,
+      "loss": 0.6248,
+      "step": 372
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21778297424316406,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.7186,
+      "step": 373
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21315112709999084,
+      "learning_rate": 0.00020975206611570247,
+      "loss": 0.5961,
+      "step": 374
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.20787106454372406,
+      "learning_rate": 0.00020950413223140495,
+      "loss": 0.5917,
+      "step": 375
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.23541009426116943,
+      "learning_rate": 0.00020925619834710743,
+      "loss": 0.7803,
+      "step": 376
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.22649626433849335,
+      "learning_rate": 0.00020900826446280992,
+      "loss": 0.5895,
+      "step": 377
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.23644742369651794,
+      "learning_rate": 0.00020876033057851237,
+      "loss": 0.6656,
+      "step": 378
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.22934262454509735,
+      "learning_rate": 0.00020851239669421486,
+      "loss": 0.5933,
+      "step": 379
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.289989709854126,
+      "learning_rate": 0.00020826446280991734,
+      "loss": 0.6852,
+      "step": 380
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.24489325284957886,
+      "learning_rate": 0.00020801652892561982,
+      "loss": 0.5546,
+      "step": 381
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.27165278792381287,
+      "learning_rate": 0.00020776859504132228,
+      "loss": 0.6845,
+      "step": 382
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.19467370212078094,
+      "learning_rate": 0.00020752066115702476,
+      "loss": 0.5587,
+      "step": 383
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.27320200204849243,
+      "learning_rate": 0.00020727272727272725,
+      "loss": 0.7144,
+      "step": 384
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.28100526332855225,
+      "learning_rate": 0.00020702479338842973,
+      "loss": 0.6914,
+      "step": 385
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.3059975504875183,
+      "learning_rate": 0.0002067768595041322,
+      "loss": 0.6075,
+      "step": 386
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.24904222786426544,
+      "learning_rate": 0.00020652892561983467,
+      "loss": 0.5543,
+      "step": 387
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.24768255650997162,
+      "learning_rate": 0.00020628099173553718,
+      "loss": 0.607,
+      "step": 388
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.25083738565444946,
+      "learning_rate": 0.00020603305785123967,
+      "loss": 0.7961,
+      "step": 389
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.26338303089141846,
+      "learning_rate": 0.00020578512396694215,
+      "loss": 0.6467,
+      "step": 390
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.25761598348617554,
+      "learning_rate": 0.0002055371900826446,
+      "loss": 0.5891,
+      "step": 391
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2616937756538391,
+      "learning_rate": 0.0002052892561983471,
+      "loss": 0.5706,
+      "step": 392
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.18980839848518372,
+      "learning_rate": 0.00020504132231404957,
+      "loss": 0.4479,
+      "step": 393
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.250431627035141,
+      "learning_rate": 0.00020479338842975206,
+      "loss": 0.6006,
+      "step": 394
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.2146655172109604,
+      "learning_rate": 0.0002045454545454545,
+      "loss": 0.7113,
+      "step": 395
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.2195209115743637,
+      "learning_rate": 0.000204297520661157,
+      "loss": 0.5354,
+      "step": 396
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.24879257380962372,
+      "learning_rate": 0.00020404958677685948,
+      "loss": 0.5478,
+      "step": 397
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.27159082889556885,
+      "learning_rate": 0.00020380165289256196,
+      "loss": 0.7681,
+      "step": 398
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.20614947378635406,
+      "learning_rate": 0.00020355371900826445,
+      "loss": 0.6357,
+      "step": 399
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.25690051913261414,
+      "learning_rate": 0.0002033057851239669,
+      "loss": 0.5731,
+      "step": 400
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.24473583698272705,
+      "learning_rate": 0.0002030578512396694,
+      "loss": 0.6784,
+      "step": 401
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.32395297288894653,
+      "learning_rate": 0.0002028099173553719,
+      "loss": 0.7118,
+      "step": 402
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.2975274324417114,
+      "learning_rate": 0.00020256198347107438,
+      "loss": 0.6504,
+      "step": 403
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.2652553915977478,
+      "learning_rate": 0.00020231404958677684,
+      "loss": 0.6986,
+      "step": 404
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.29475778341293335,
+      "learning_rate": 0.00020206611570247932,
+      "loss": 0.6525,
+      "step": 405
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.24549973011016846,
+      "learning_rate": 0.0002018181818181818,
+      "loss": 0.5408,
+      "step": 406
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2181435376405716,
+      "learning_rate": 0.0002015702479338843,
+      "loss": 0.6146,
+      "step": 407
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2682584226131439,
+      "learning_rate": 0.00020132231404958677,
+      "loss": 0.6368,
+      "step": 408
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2641114592552185,
+      "learning_rate": 0.00020107438016528923,
+      "loss": 0.51,
+      "step": 409
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.27871838212013245,
+      "learning_rate": 0.0002008264462809917,
+      "loss": 0.7269,
+      "step": 410
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.23890569806098938,
+      "learning_rate": 0.0002005785123966942,
+      "loss": 0.6444,
+      "step": 411
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2451583445072174,
+      "learning_rate": 0.00020033057851239668,
+      "loss": 0.5806,
+      "step": 412
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2743864953517914,
+      "learning_rate": 0.00020008264462809914,
+      "loss": 0.6305,
+      "step": 413
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2626914978027344,
+      "learning_rate": 0.00019983471074380162,
+      "loss": 0.5765,
+      "step": 414
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2874875068664551,
+      "learning_rate": 0.0001995867768595041,
+      "loss": 0.5928,
+      "step": 415
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.30499163269996643,
+      "learning_rate": 0.00019933884297520661,
+      "loss": 0.6271,
+      "step": 416
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.30474454164505005,
+      "learning_rate": 0.0001990909090909091,
+      "loss": 0.6755,
+      "step": 417
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1819755882024765,
+      "learning_rate": 0.00019884297520661155,
+      "loss": 0.394,
+      "step": 418
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.25470343232154846,
+      "learning_rate": 0.00019859504132231404,
+      "loss": 0.7121,
+      "step": 419
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.26749151945114136,
+      "learning_rate": 0.00019834710743801652,
+      "loss": 0.6487,
+      "step": 420
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.20643912255764008,
+      "learning_rate": 0.000198099173553719,
+      "loss": 0.4585,
+      "step": 421
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.2576930522918701,
+      "learning_rate": 0.00019785123966942146,
+      "loss": 0.5235,
+      "step": 422
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.2899012863636017,
+      "learning_rate": 0.00019760330578512395,
+      "loss": 0.6292,
+      "step": 423
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.2541065216064453,
+      "learning_rate": 0.00019735537190082643,
+      "loss": 0.648,
+      "step": 424
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.24382047355175018,
+      "learning_rate": 0.0001971074380165289,
+      "loss": 0.5939,
+      "step": 425
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.22931940853595734,
+      "learning_rate": 0.00019685950413223137,
+      "loss": 0.6812,
+      "step": 426
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.2592567205429077,
+      "learning_rate": 0.00019661157024793385,
+      "loss": 0.69,
+      "step": 427
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.2516980767250061,
+      "learning_rate": 0.00019636363636363634,
+      "loss": 0.5707,
+      "step": 428
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.23515059053897858,
+      "learning_rate": 0.00019611570247933882,
+      "loss": 0.6739,
+      "step": 429
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.24742184579372406,
+      "learning_rate": 0.00019586776859504133,
+      "loss": 0.6761,
+      "step": 430
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.26232922077178955,
+      "learning_rate": 0.00019561983471074376,
+      "loss": 0.7071,
+      "step": 431
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.2853042781352997,
+      "learning_rate": 0.00019537190082644627,
+      "loss": 0.7667,
+      "step": 432
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.251169353723526,
+      "learning_rate": 0.00019512396694214875,
+      "loss": 0.6518,
+      "step": 433
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.2321665734052658,
+      "learning_rate": 0.00019487603305785124,
+      "loss": 0.4377,
+      "step": 434
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.25216928124427795,
+      "learning_rate": 0.0001946280991735537,
+      "loss": 0.7173,
+      "step": 435
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.19498330354690552,
+      "learning_rate": 0.00019438016528925618,
+      "loss": 0.5584,
+      "step": 436
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.32786309719085693,
+      "learning_rate": 0.00019413223140495866,
+      "loss": 0.6583,
+      "step": 437
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.25834760069847107,
+      "learning_rate": 0.00019388429752066115,
+      "loss": 0.4957,
+      "step": 438
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3462083041667938,
+      "learning_rate": 0.00019363636363636363,
+      "loss": 0.5205,
+      "step": 439
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.27106693387031555,
+      "learning_rate": 0.00019338842975206609,
+      "loss": 0.6803,
+      "step": 440
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.28165388107299805,
+      "learning_rate": 0.00019314049586776857,
+      "loss": 0.7049,
+      "step": 441
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.20732273161411285,
+      "learning_rate": 0.00019289256198347105,
+      "loss": 0.6407,
+      "step": 442
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2609116733074188,
+      "learning_rate": 0.00019264462809917354,
+      "loss": 0.5377,
+      "step": 443
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2561998963356018,
+      "learning_rate": 0.000192396694214876,
+      "loss": 0.6212,
+      "step": 444
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.27699044346809387,
+      "learning_rate": 0.00019214876033057848,
+      "loss": 0.5482,
+      "step": 445
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2426328808069229,
+      "learning_rate": 0.000191900826446281,
+      "loss": 0.6444,
+      "step": 446
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.26187026500701904,
+      "learning_rate": 0.00019165289256198347,
+      "loss": 0.5443,
+      "step": 447
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2719630002975464,
+      "learning_rate": 0.00019140495867768595,
+      "loss": 0.6886,
+      "step": 448
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.18477971851825714,
+      "learning_rate": 0.0001911570247933884,
+      "loss": 0.5292,
+      "step": 449
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2144313007593155,
+      "learning_rate": 0.0001909090909090909,
+      "loss": 0.4613,
+      "step": 450
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2580784857273102,
+      "learning_rate": 0.00019066115702479338,
+      "loss": 0.5606,
+      "step": 451
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.3073588013648987,
+      "learning_rate": 0.00019041322314049586,
+      "loss": 0.6123,
+      "step": 452
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.21787844598293304,
+      "learning_rate": 0.00019016528925619832,
+      "loss": 0.5939,
+      "step": 453
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.255750447511673,
+      "learning_rate": 0.0001899173553719008,
+      "loss": 0.5739,
+      "step": 454
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.24147820472717285,
+      "learning_rate": 0.00018966942148760329,
+      "loss": 0.6026,
+      "step": 455
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.26172590255737305,
+      "learning_rate": 0.00018942148760330577,
+      "loss": 0.5166,
+      "step": 456
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.2710455358028412,
+      "learning_rate": 0.00018917355371900825,
+      "loss": 0.6429,
+      "step": 457
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.1971074640750885,
+      "learning_rate": 0.0001889256198347107,
+      "loss": 0.4799,
+      "step": 458
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.23394368588924408,
+      "learning_rate": 0.0001886776859504132,
+      "loss": 0.5491,
+      "step": 459
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.22820048034191132,
+      "learning_rate": 0.0001884297520661157,
+      "loss": 0.5343,
+      "step": 460
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.23169974982738495,
+      "learning_rate": 0.0001881818181818182,
+      "loss": 0.5852,
+      "step": 461
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.24015003442764282,
+      "learning_rate": 0.00018793388429752064,
+      "loss": 0.6209,
+      "step": 462
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.2230776697397232,
+      "learning_rate": 0.00018768595041322313,
+      "loss": 0.6296,
+      "step": 463
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.2518354654312134,
+      "learning_rate": 0.0001874380165289256,
+      "loss": 0.6167,
+      "step": 464
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.338256299495697,
+      "learning_rate": 0.0001871900826446281,
+      "loss": 0.6512,
+      "step": 465
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.23796728253364563,
+      "learning_rate": 0.00018694214876033055,
+      "loss": 0.8155,
+      "step": 466
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.31516361236572266,
+      "learning_rate": 0.00018669421487603303,
+      "loss": 0.8023,
+      "step": 467
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2371574491262436,
+      "learning_rate": 0.00018644628099173552,
+      "loss": 0.5613,
+      "step": 468
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2822033762931824,
+      "learning_rate": 0.000186198347107438,
+      "loss": 0.5549,
+      "step": 469
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.25953295826911926,
+      "learning_rate": 0.00018595041322314049,
+      "loss": 0.6199,
+      "step": 470
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2478639930486679,
+      "learning_rate": 0.00018570247933884294,
+      "loss": 0.5806,
+      "step": 471
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2439350187778473,
+      "learning_rate": 0.00018545454545454543,
+      "loss": 0.6222,
+      "step": 472
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.24993474781513214,
+      "learning_rate": 0.0001852066115702479,
+      "loss": 0.6048,
+      "step": 473
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.24781496822834015,
+      "learning_rate": 0.00018495867768595042,
+      "loss": 0.5941,
+      "step": 474
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.1847202032804489,
+      "learning_rate": 0.00018471074380165285,
+      "loss": 0.609,
+      "step": 475
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.21596528589725494,
+      "learning_rate": 0.00018446280991735536,
+      "loss": 0.4457,
+      "step": 476
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.240879625082016,
+      "learning_rate": 0.00018421487603305784,
+      "loss": 0.6118,
+      "step": 477
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.2898111641407013,
+      "learning_rate": 0.00018396694214876033,
+      "loss": 0.7725,
+      "step": 478
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.27428382635116577,
+      "learning_rate": 0.0001837190082644628,
+      "loss": 0.5366,
+      "step": 479
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.23467296361923218,
+      "learning_rate": 0.00018347107438016527,
+      "loss": 0.6018,
+      "step": 480
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.2190561592578888,
+      "learning_rate": 0.00018322314049586775,
+      "loss": 0.5249,
+      "step": 481
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.2240625023841858,
+      "learning_rate": 0.00018297520661157024,
+      "loss": 0.6891,
+      "step": 482
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.24726848304271698,
+      "learning_rate": 0.00018272727272727272,
+      "loss": 0.5545,
+      "step": 483
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.3318251371383667,
+      "learning_rate": 0.00018247933884297518,
+      "loss": 0.4809,
+      "step": 484
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.2396695613861084,
+      "learning_rate": 0.00018223140495867766,
+      "loss": 0.4942,
+      "step": 485
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.25009942054748535,
+      "learning_rate": 0.00018198347107438014,
+      "loss": 0.7381,
+      "step": 486
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22655311226844788,
+      "learning_rate": 0.00018173553719008263,
+      "loss": 0.4729,
+      "step": 487
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.23187695443630219,
+      "learning_rate": 0.0001814876033057851,
+      "loss": 0.5719,
+      "step": 488
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2703653573989868,
+      "learning_rate": 0.00018123966942148757,
+      "loss": 0.6031,
+      "step": 489
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2207796424627304,
+      "learning_rate": 0.00018099173553719008,
+      "loss": 0.5361,
+      "step": 490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.24914169311523438,
+      "learning_rate": 0.00018074380165289256,
+      "loss": 0.6547,
+      "step": 491
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.2714746594429016,
+      "learning_rate": 0.00018049586776859504,
+      "loss": 0.5702,
+      "step": 492
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.3201580047607422,
+      "learning_rate": 0.0001802479338842975,
+      "loss": 0.6119,
+      "step": 493
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.2548397183418274,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.5251,
+      "step": 494
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.28669115900993347,
+      "learning_rate": 0.00017975206611570247,
+      "loss": 0.5773,
+      "step": 495
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.26253971457481384,
+      "learning_rate": 0.00017950413223140495,
+      "loss": 0.6504,
+      "step": 496
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.22113384306430817,
+      "learning_rate": 0.00017925619834710744,
+      "loss": 0.4741,
+      "step": 497
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.261636346578598,
+      "learning_rate": 0.0001790082644628099,
+      "loss": 0.6241,
+      "step": 498
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.1780402809381485,
+      "learning_rate": 0.00017876033057851238,
+      "loss": 0.5207,
+      "step": 499
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.26149195432662964,
+      "learning_rate": 0.00017851239669421486,
+      "loss": 0.5872,
+      "step": 500
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.26113009452819824,
+      "learning_rate": 0.00017826446280991734,
+      "loss": 0.6163,
+      "step": 501
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.21397502720355988,
+      "learning_rate": 0.0001780165289256198,
+      "loss": 0.479,
+      "step": 502
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.21250088512897491,
+      "learning_rate": 0.00017776859504132228,
+      "loss": 0.6978,
+      "step": 503
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.2556426525115967,
+      "learning_rate": 0.00017752066115702477,
+      "loss": 0.6128,
+      "step": 504
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.24139715731143951,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 0.5066,
+      "step": 505
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.23671215772628784,
+      "learning_rate": 0.00017702479338842976,
+      "loss": 0.5183,
+      "step": 506
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.23494285345077515,
+      "learning_rate": 0.00017677685950413222,
+      "loss": 0.5181,
+      "step": 507
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.2547609806060791,
+      "learning_rate": 0.0001765289256198347,
+      "loss": 0.5406,
+      "step": 508
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.3042651414871216,
+      "learning_rate": 0.00017628099173553718,
+      "loss": 0.5551,
+      "step": 509
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.22910748422145844,
+      "learning_rate": 0.00017603305785123967,
+      "loss": 0.6373,
+      "step": 510
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.19777967035770416,
+      "learning_rate": 0.00017578512396694212,
+      "loss": 0.5471,
+      "step": 511
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.31034502387046814,
+      "learning_rate": 0.0001755371900826446,
+      "loss": 0.7017,
+      "step": 512
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3504410684108734,
+      "learning_rate": 0.0001752892561983471,
+      "loss": 0.7208,
+      "step": 513
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.24271292984485626,
+      "learning_rate": 0.00017504132231404958,
+      "loss": 0.5563,
+      "step": 514
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.27147865295410156,
+      "learning_rate": 0.00017479338842975203,
+      "loss": 0.5869,
+      "step": 515
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.2976628839969635,
+      "learning_rate": 0.00017454545454545452,
+      "loss": 0.5471,
+      "step": 516
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.28489646315574646,
+      "learning_rate": 0.000174297520661157,
+      "loss": 0.6053,
+      "step": 517
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.30020108819007874,
+      "learning_rate": 0.00017404958677685948,
+      "loss": 0.6178,
+      "step": 518
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.23986253142356873,
+      "learning_rate": 0.000173801652892562,
+      "loss": 0.5896,
+      "step": 519
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.2667832374572754,
+      "learning_rate": 0.00017355371900826442,
+      "loss": 0.5375,
+      "step": 520
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.22176356613636017,
+      "learning_rate": 0.00017330578512396693,
+      "loss": 0.5723,
+      "step": 521
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.263257771730423,
+      "learning_rate": 0.00017305785123966942,
+      "loss": 0.7317,
+      "step": 522
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.24838753044605255,
+      "learning_rate": 0.0001728099173553719,
+      "loss": 0.5849,
+      "step": 523
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.24839664995670319,
+      "learning_rate": 0.00017256198347107436,
+      "loss": 0.6678,
+      "step": 524
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2849573493003845,
+      "learning_rate": 0.00017231404958677684,
+      "loss": 0.7144,
+      "step": 525
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.26900768280029297,
+      "learning_rate": 0.00017206611570247932,
+      "loss": 0.5156,
+      "step": 526
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2212425172328949,
+      "learning_rate": 0.0001718181818181818,
+      "loss": 0.4551,
+      "step": 527
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2066129595041275,
+      "learning_rate": 0.0001715702479338843,
+      "loss": 0.4193,
+      "step": 528
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.2838365137577057,
+      "learning_rate": 0.00017132231404958675,
+      "loss": 0.6078,
+      "step": 529
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.239679753780365,
+      "learning_rate": 0.00017107438016528923,
+      "loss": 0.616,
+      "step": 530
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.23269398510456085,
+      "learning_rate": 0.00017082644628099172,
+      "loss": 0.542,
+      "step": 531
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.23838558793067932,
+      "learning_rate": 0.0001705785123966942,
+      "loss": 0.5147,
+      "step": 532
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.2819415330886841,
+      "learning_rate": 0.00017033057851239666,
+      "loss": 0.6437,
+      "step": 533
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.243398055434227,
+      "learning_rate": 0.00017008264462809914,
+      "loss": 0.6611,
+      "step": 534
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.22569122910499573,
+      "learning_rate": 0.00016983471074380165,
+      "loss": 0.3979,
+      "step": 535
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.33265820145606995,
+      "learning_rate": 0.00016958677685950413,
+      "loss": 0.6005,
+      "step": 536
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.26828673481941223,
+      "learning_rate": 0.00016933884297520662,
+      "loss": 0.608,
+      "step": 537
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.24439513683319092,
+      "learning_rate": 0.00016909090909090907,
+      "loss": 0.5572,
+      "step": 538
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.22491876780986786,
+      "learning_rate": 0.00016884297520661156,
+      "loss": 0.7226,
+      "step": 539
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.24468480050563812,
+      "learning_rate": 0.00016859504132231404,
+      "loss": 0.4582,
+      "step": 540
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.23392945528030396,
+      "learning_rate": 0.00016834710743801652,
+      "loss": 0.6477,
+      "step": 541
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.27548858523368835,
+      "learning_rate": 0.00016809917355371898,
+      "loss": 0.5846,
+      "step": 542
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.2861180603504181,
+      "learning_rate": 0.00016785123966942146,
+      "loss": 0.6412,
+      "step": 543
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.24700766801834106,
+      "learning_rate": 0.00016760330578512395,
+      "loss": 0.6947,
+      "step": 544
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.2600953280925751,
+      "learning_rate": 0.00016735537190082643,
+      "loss": 0.6165,
+      "step": 545
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.26876646280288696,
+      "learning_rate": 0.00016710743801652892,
+      "loss": 0.6855,
+      "step": 546
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.26161080598831177,
+      "learning_rate": 0.00016685950413223137,
+      "loss": 0.5066,
+      "step": 547
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.25190046429634094,
+      "learning_rate": 0.00016661157024793386,
+      "loss": 0.5902,
+      "step": 548
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.25269225239753723,
+      "learning_rate": 0.00016636363636363637,
+      "loss": 0.7017,
+      "step": 549
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.28042706847190857,
+      "learning_rate": 0.00016611570247933885,
+      "loss": 0.6264,
+      "step": 550
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2767360508441925,
+      "learning_rate": 0.0001658677685950413,
+      "loss": 0.7562,
+      "step": 551
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2771216034889221,
+      "learning_rate": 0.0001656198347107438,
+      "loss": 0.5333,
+      "step": 552
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.189210906624794,
+      "learning_rate": 0.00016537190082644627,
+      "loss": 0.5378,
+      "step": 553
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.22517065703868866,
+      "learning_rate": 0.00016512396694214876,
+      "loss": 0.5292,
+      "step": 554
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.2390165776014328,
+      "learning_rate": 0.00016487603305785121,
+      "loss": 0.4407,
+      "step": 555
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.21548262238502502,
+      "learning_rate": 0.0001646280991735537,
+      "loss": 0.4504,
+      "step": 556
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.20831167697906494,
+      "learning_rate": 0.00016438016528925618,
+      "loss": 0.6848,
+      "step": 557
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.271257609128952,
+      "learning_rate": 0.00016413223140495866,
+      "loss": 0.535,
+      "step": 558
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.32008254528045654,
+      "learning_rate": 0.00016388429752066115,
+      "loss": 0.5107,
+      "step": 559
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.34058302640914917,
+      "learning_rate": 0.0001636363636363636,
+      "loss": 0.5708,
+      "step": 560
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.28070059418678284,
+      "learning_rate": 0.0001633884297520661,
+      "loss": 0.5086,
+      "step": 561
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.25487688183784485,
+      "learning_rate": 0.00016314049586776857,
+      "loss": 0.5184,
+      "step": 562
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3240332007408142,
+      "learning_rate": 0.00016289256198347108,
+      "loss": 0.6774,
+      "step": 563
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.30744409561157227,
+      "learning_rate": 0.0001626446280991735,
+      "loss": 0.5314,
+      "step": 564
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.25220754742622375,
+      "learning_rate": 0.00016239669421487602,
+      "loss": 0.6308,
+      "step": 565
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.29116958379745483,
+      "learning_rate": 0.0001621487603305785,
+      "loss": 0.5685,
+      "step": 566
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.23250073194503784,
+      "learning_rate": 0.000161900826446281,
+      "loss": 0.4318,
+      "step": 567
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.2808091640472412,
+      "learning_rate": 0.00016165289256198347,
+      "loss": 0.6313,
+      "step": 568
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.2711193561553955,
+      "learning_rate": 0.00016140495867768593,
+      "loss": 0.4651,
+      "step": 569
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.29540935158729553,
+      "learning_rate": 0.00016115702479338841,
+      "loss": 0.6663,
+      "step": 570
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.23418714106082916,
+      "learning_rate": 0.0001609090909090909,
+      "loss": 0.448,
+      "step": 571
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.21675793826580048,
+      "learning_rate": 0.00016066115702479338,
+      "loss": 0.5034,
+      "step": 572
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.22451865673065186,
+      "learning_rate": 0.00016041322314049584,
+      "loss": 0.4476,
+      "step": 573
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.26300856471061707,
+      "learning_rate": 0.00016016528925619832,
+      "loss": 0.6646,
+      "step": 574
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3377116918563843,
+      "learning_rate": 0.0001599173553719008,
+      "loss": 0.6029,
+      "step": 575
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.23391880095005035,
+      "learning_rate": 0.0001596694214876033,
+      "loss": 0.6277,
+      "step": 576
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.19620922207832336,
+      "learning_rate": 0.0001594214876033058,
+      "loss": 0.4638,
+      "step": 577
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.22981096804141998,
+      "learning_rate": 0.00015917355371900823,
+      "loss": 0.5826,
+      "step": 578
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.34321555495262146,
+      "learning_rate": 0.00015892561983471074,
+      "loss": 0.5618,
+      "step": 579
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.28461968898773193,
+      "learning_rate": 0.00015867768595041322,
+      "loss": 0.5129,
+      "step": 580
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.24368269741535187,
+      "learning_rate": 0.0001584297520661157,
+      "loss": 0.5866,
+      "step": 581
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.282255083322525,
+      "learning_rate": 0.00015818181818181816,
+      "loss": 0.6274,
+      "step": 582
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.26298072934150696,
+      "learning_rate": 0.00015793388429752065,
+      "loss": 0.5187,
+      "step": 583
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2671455144882202,
+      "learning_rate": 0.00015768595041322313,
+      "loss": 0.6878,
+      "step": 584
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2681390643119812,
+      "learning_rate": 0.00015743801652892561,
+      "loss": 0.5469,
+      "step": 585
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.38484248518943787,
+      "learning_rate": 0.0001571900826446281,
+      "loss": 0.6364,
+      "step": 586
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.23353587090969086,
+      "learning_rate": 0.00015694214876033055,
+      "loss": 0.4844,
+      "step": 587
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.29452502727508545,
+      "learning_rate": 0.00015669421487603304,
+      "loss": 0.5059,
+      "step": 588
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2460879236459732,
+      "learning_rate": 0.00015644628099173552,
+      "loss": 0.6495,
+      "step": 589
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.30693721771240234,
+      "learning_rate": 0.000156198347107438,
+      "loss": 0.5165,
+      "step": 590
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2171495109796524,
+      "learning_rate": 0.00015595041322314046,
+      "loss": 0.6172,
+      "step": 591
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.24301984906196594,
+      "learning_rate": 0.00015570247933884294,
+      "loss": 0.6786,
+      "step": 592
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2288222461938858,
+      "learning_rate": 0.00015545454545454546,
+      "loss": 0.5669,
+      "step": 593
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2407921552658081,
+      "learning_rate": 0.00015520661157024794,
+      "loss": 0.5968,
+      "step": 594
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2591527998447418,
+      "learning_rate": 0.0001549586776859504,
+      "loss": 0.544,
+      "step": 595
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.25770679116249084,
+      "learning_rate": 0.00015471074380165288,
+      "loss": 0.7177,
+      "step": 596
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.2528848648071289,
+      "learning_rate": 0.00015446280991735536,
+      "loss": 0.4703,
+      "step": 597
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.24993537366390228,
+      "learning_rate": 0.00015421487603305785,
+      "loss": 0.6003,
+      "step": 598
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.25807908177375793,
+      "learning_rate": 0.00015396694214876033,
+      "loss": 0.465,
+      "step": 599
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.3142452836036682,
+      "learning_rate": 0.0001537190082644628,
+      "loss": 0.6122,
+      "step": 600
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.27111849188804626,
+      "learning_rate": 0.00015347107438016527,
+      "loss": 0.5962,
+      "step": 601
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.28503674268722534,
+      "learning_rate": 0.00015322314049586775,
+      "loss": 0.6667,
+      "step": 602
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.27074381709098816,
+      "learning_rate": 0.00015297520661157024,
+      "loss": 0.6115,
+      "step": 603
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.25918465852737427,
+      "learning_rate": 0.0001527272727272727,
+      "loss": 0.4483,
+      "step": 604
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.24476633965969086,
+      "learning_rate": 0.00015247933884297518,
+      "loss": 0.6501,
+      "step": 605
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.21205200254917145,
+      "learning_rate": 0.00015223140495867766,
+      "loss": 0.3914,
+      "step": 606
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.25496751070022583,
+      "learning_rate": 0.00015198347107438017,
+      "loss": 0.5335,
+      "step": 607
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.27991780638694763,
+      "learning_rate": 0.00015173553719008266,
+      "loss": 0.6083,
+      "step": 608
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.23995639383792877,
+      "learning_rate": 0.0001514876033057851,
+      "loss": 0.55,
+      "step": 609
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.2349666953086853,
+      "learning_rate": 0.0001512396694214876,
+      "loss": 0.7054,
+      "step": 610
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.27498871088027954,
+      "learning_rate": 0.00015099173553719008,
+      "loss": 0.55,
+      "step": 611
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.21346105635166168,
+      "learning_rate": 0.00015074380165289256,
+      "loss": 0.3467,
+      "step": 612
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.2638354003429413,
+      "learning_rate": 0.00015049586776859502,
+      "loss": 0.5624,
+      "step": 613
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.2751975953578949,
+      "learning_rate": 0.0001502479338842975,
+      "loss": 0.3814,
+      "step": 614
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.225106880068779,
+      "learning_rate": 0.00015,
+      "loss": 0.479,
+      "step": 615
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.22013232111930847,
+      "learning_rate": 0.00014975206611570247,
+      "loss": 0.5672,
+      "step": 616
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.21252033114433289,
+      "learning_rate": 0.00014950413223140495,
+      "loss": 0.546,
+      "step": 617
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.2847185432910919,
+      "learning_rate": 0.00014925619834710744,
+      "loss": 0.4434,
+      "step": 618
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.25599631667137146,
+      "learning_rate": 0.0001490082644628099,
+      "loss": 0.4713,
+      "step": 619
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.2719402611255646,
+      "learning_rate": 0.00014876033057851238,
+      "loss": 0.4475,
+      "step": 620
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.26454958319664,
+      "learning_rate": 0.00014851239669421486,
+      "loss": 0.4515,
+      "step": 621
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.39801672101020813,
+      "learning_rate": 0.00014826446280991735,
+      "loss": 0.4647,
+      "step": 622
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.3378361463546753,
+      "learning_rate": 0.0001480165289256198,
+      "loss": 0.4414,
+      "step": 623
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.3039036989212036,
+      "learning_rate": 0.0001477685950413223,
+      "loss": 0.5634,
+      "step": 624
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.3506157398223877,
+      "learning_rate": 0.0001475206611570248,
+      "loss": 0.5001,
+      "step": 625
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.2508845925331116,
+      "learning_rate": 0.00014727272727272725,
+      "loss": 0.3379,
+      "step": 626
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.26913216710090637,
+      "learning_rate": 0.00014702479338842974,
+      "loss": 0.4575,
+      "step": 627
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.329659640789032,
+      "learning_rate": 0.00014677685950413222,
+      "loss": 0.437,
+      "step": 628
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.2972075343132019,
+      "learning_rate": 0.0001465289256198347,
+      "loss": 0.5048,
+      "step": 629
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.3184354603290558,
+      "learning_rate": 0.00014628099173553716,
+      "loss": 0.4374,
+      "step": 630
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.3377355635166168,
+      "learning_rate": 0.00014603305785123967,
+      "loss": 0.4946,
+      "step": 631
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.29106801748275757,
+      "learning_rate": 0.00014578512396694213,
+      "loss": 0.5414,
+      "step": 632
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.22808948159217834,
+      "learning_rate": 0.0001455371900826446,
+      "loss": 0.3739,
+      "step": 633
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.27818021178245544,
+      "learning_rate": 0.0001452892561983471,
+      "loss": 0.4172,
+      "step": 634
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.25634923577308655,
+      "learning_rate": 0.00014504132231404958,
+      "loss": 0.4293,
+      "step": 635
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.30696937441825867,
+      "learning_rate": 0.00014479338842975206,
+      "loss": 0.4454,
+      "step": 636
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.26105087995529175,
+      "learning_rate": 0.00014454545454545452,
+      "loss": 0.2978,
+      "step": 637
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.3100634515285492,
+      "learning_rate": 0.00014429752066115703,
+      "loss": 0.4499,
+      "step": 638
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.27640992403030396,
+      "learning_rate": 0.00014404958677685949,
+      "loss": 0.3837,
+      "step": 639
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.24559038877487183,
+      "learning_rate": 0.00014380165289256197,
+      "loss": 0.3347,
+      "step": 640
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.2920415699481964,
+      "learning_rate": 0.00014355371900826445,
+      "loss": 0.4333,
+      "step": 641
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.3147384226322174,
+      "learning_rate": 0.00014330578512396694,
+      "loss": 0.4385,
+      "step": 642
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.35469138622283936,
+      "learning_rate": 0.0001430578512396694,
+      "loss": 0.5442,
+      "step": 643
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.2619563043117523,
+      "learning_rate": 0.00014280991735537188,
+      "loss": 0.3837,
+      "step": 644
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.32273221015930176,
+      "learning_rate": 0.0001425619834710744,
+      "loss": 0.4946,
+      "step": 645
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.2692110538482666,
+      "learning_rate": 0.00014231404958677684,
+      "loss": 0.4683,
+      "step": 646
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.35255464911460876,
+      "learning_rate": 0.00014206611570247933,
+      "loss": 0.5456,
+      "step": 647
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.29768630862236023,
+      "learning_rate": 0.0001418181818181818,
+      "loss": 0.3394,
+      "step": 648
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.30738797783851624,
+      "learning_rate": 0.0001415702479338843,
+      "loss": 0.3583,
+      "step": 649
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.33226314187049866,
+      "learning_rate": 0.00014132231404958675,
+      "loss": 0.4477,
+      "step": 650
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.2842199504375458,
+      "learning_rate": 0.00014107438016528923,
+      "loss": 0.4454,
+      "step": 651
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.28207266330718994,
+      "learning_rate": 0.00014082644628099172,
+      "loss": 0.3665,
+      "step": 652
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.2228500097990036,
+      "learning_rate": 0.0001405785123966942,
+      "loss": 0.3446,
+      "step": 653
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.2969403564929962,
+      "learning_rate": 0.00014033057851239669,
+      "loss": 0.377,
+      "step": 654
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.28087565302848816,
+      "learning_rate": 0.00014008264462809917,
+      "loss": 0.3683,
+      "step": 655
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.27268192172050476,
+      "learning_rate": 0.00013983471074380165,
+      "loss": 0.427,
+      "step": 656
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.339070588350296,
+      "learning_rate": 0.0001395867768595041,
+      "loss": 0.4887,
+      "step": 657
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.3170423209667206,
+      "learning_rate": 0.0001393388429752066,
+      "loss": 0.5097,
+      "step": 658
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.3114936947822571,
+      "learning_rate": 0.00013909090909090908,
+      "loss": 0.4587,
+      "step": 659
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.28112486004829407,
+      "learning_rate": 0.00013884297520661156,
+      "loss": 0.4781,
+      "step": 660
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.28116974234580994,
+      "learning_rate": 0.00013859504132231404,
+      "loss": 0.3546,
+      "step": 661
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.25061559677124023,
+      "learning_rate": 0.00013834710743801653,
+      "loss": 0.4512,
+      "step": 662
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.29854199290275574,
+      "learning_rate": 0.00013809917355371898,
+      "loss": 0.6068,
+      "step": 663
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.2901363670825958,
+      "learning_rate": 0.00013785123966942147,
+      "loss": 0.3667,
+      "step": 664
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.29766595363616943,
+      "learning_rate": 0.00013760330578512395,
+      "loss": 0.5194,
+      "step": 665
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.2765616476535797,
+      "learning_rate": 0.00013735537190082643,
+      "loss": 0.5079,
+      "step": 666
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.27531540393829346,
+      "learning_rate": 0.00013710743801652892,
+      "loss": 0.4423,
+      "step": 667
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.3063349425792694,
+      "learning_rate": 0.0001368595041322314,
+      "loss": 0.4666,
+      "step": 668
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.24519848823547363,
+      "learning_rate": 0.00013661157024793389,
+      "loss": 0.2995,
+      "step": 669
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.4366275370121002,
+      "learning_rate": 0.00013636363636363634,
+      "loss": 0.4961,
+      "step": 670
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.28639987111091614,
+      "learning_rate": 0.00013611570247933883,
+      "loss": 0.5015,
+      "step": 671
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.2763878107070923,
+      "learning_rate": 0.0001358677685950413,
+      "loss": 0.4883,
+      "step": 672
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.5589582324028015,
+      "learning_rate": 0.0001356198347107438,
+      "loss": 0.5072,
+      "step": 673
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.238887220621109,
+      "learning_rate": 0.00013537190082644625,
+      "loss": 0.411,
+      "step": 674
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.2899521589279175,
+      "learning_rate": 0.00013512396694214876,
+      "loss": 0.3478,
+      "step": 675
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.30960512161254883,
+      "learning_rate": 0.00013487603305785124,
+      "loss": 0.5058,
+      "step": 676
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.33305928111076355,
+      "learning_rate": 0.0001346280991735537,
+      "loss": 0.4528,
+      "step": 677
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.33324292302131653,
+      "learning_rate": 0.00013438016528925618,
+      "loss": 0.3523,
+      "step": 678
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.25855520367622375,
+      "learning_rate": 0.00013413223140495867,
+      "loss": 0.4257,
+      "step": 679
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.36000239849090576,
+      "learning_rate": 0.00013388429752066115,
+      "loss": 0.4963,
+      "step": 680
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.30540961027145386,
+      "learning_rate": 0.0001336363636363636,
+      "loss": 0.4706,
+      "step": 681
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.2791118025779724,
+      "learning_rate": 0.00013338842975206612,
+      "loss": 0.4543,
+      "step": 682
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.37401753664016724,
+      "learning_rate": 0.00013314049586776857,
+      "loss": 0.5614,
+      "step": 683
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.2772528827190399,
+      "learning_rate": 0.00013289256198347106,
+      "loss": 0.3881,
+      "step": 684
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.29219475388526917,
+      "learning_rate": 0.00013264462809917354,
+      "loss": 0.5418,
+      "step": 685
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.3255159258842468,
+      "learning_rate": 0.00013239669421487603,
+      "loss": 0.4669,
+      "step": 686
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.2640572488307953,
+      "learning_rate": 0.0001321487603305785,
+      "loss": 0.4156,
+      "step": 687
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.2618845999240875,
+      "learning_rate": 0.00013190082644628097,
+      "loss": 0.3537,
+      "step": 688
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.27396076917648315,
+      "learning_rate": 0.00013165289256198348,
+      "loss": 0.4391,
+      "step": 689
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.5098498463630676,
+      "learning_rate": 0.00013140495867768593,
+      "loss": 0.3863,
+      "step": 690
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.31764644384384155,
+      "learning_rate": 0.00013115702479338842,
+      "loss": 0.3874,
+      "step": 691
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.28738152980804443,
+      "learning_rate": 0.0001309090909090909,
+      "loss": 0.3209,
+      "step": 692
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.32756757736206055,
+      "learning_rate": 0.00013066115702479338,
+      "loss": 0.4614,
+      "step": 693
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.27650028467178345,
+      "learning_rate": 0.00013041322314049584,
+      "loss": 0.4717,
+      "step": 694
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.33100056648254395,
+      "learning_rate": 0.00013016528925619832,
+      "loss": 0.4317,
+      "step": 695
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.3200342357158661,
+      "learning_rate": 0.00012991735537190083,
+      "loss": 0.4494,
+      "step": 696
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.29615214467048645,
+      "learning_rate": 0.0001296694214876033,
+      "loss": 0.3786,
+      "step": 697
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.278094619512558,
+      "learning_rate": 0.00012942148760330577,
+      "loss": 0.4484,
+      "step": 698
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.32800769805908203,
+      "learning_rate": 0.00012917355371900826,
+      "loss": 0.4635,
+      "step": 699
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.3319619596004486,
+      "learning_rate": 0.00012892561983471074,
+      "loss": 0.5001,
+      "step": 700
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.2818608283996582,
+      "learning_rate": 0.0001286776859504132,
+      "loss": 0.3536,
+      "step": 701
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.28644126653671265,
+      "learning_rate": 0.00012842975206611568,
+      "loss": 0.4168,
+      "step": 702
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.2802482545375824,
+      "learning_rate": 0.00012818181818181817,
+      "loss": 0.3918,
+      "step": 703
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.21232947707176208,
+      "learning_rate": 0.00012793388429752065,
+      "loss": 0.3218,
+      "step": 704
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.36512815952301025,
+      "learning_rate": 0.00012768595041322313,
+      "loss": 0.4566,
+      "step": 705
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.26876160502433777,
+      "learning_rate": 0.00012743801652892562,
+      "loss": 0.4394,
+      "step": 706
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.3757662773132324,
+      "learning_rate": 0.0001271900826446281,
+      "loss": 0.574,
+      "step": 707
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.3161550760269165,
+      "learning_rate": 0.00012694214876033056,
+      "loss": 0.4524,
+      "step": 708
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.31256961822509766,
+      "learning_rate": 0.00012669421487603304,
+      "loss": 0.4332,
+      "step": 709
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.3122079074382782,
+      "learning_rate": 0.00012644628099173552,
+      "loss": 0.5669,
+      "step": 710
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.33779048919677734,
+      "learning_rate": 0.000126198347107438,
+      "loss": 0.515,
+      "step": 711
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.38516169786453247,
+      "learning_rate": 0.0001259504132231405,
+      "loss": 0.5502,
+      "step": 712
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.2803480625152588,
+      "learning_rate": 0.00012570247933884297,
+      "loss": 0.404,
+      "step": 713
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.31674399971961975,
+      "learning_rate": 0.00012545454545454543,
+      "loss": 0.4403,
+      "step": 714
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.3029496669769287,
+      "learning_rate": 0.00012520661157024791,
+      "loss": 0.372,
+      "step": 715
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.22542959451675415,
+      "learning_rate": 0.0001249586776859504,
+      "loss": 0.355,
+      "step": 716
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.32029619812965393,
+      "learning_rate": 0.00012471074380165288,
+      "loss": 0.4845,
+      "step": 717
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.34882861375808716,
+      "learning_rate": 0.00012446280991735537,
+      "loss": 0.4184,
+      "step": 718
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.3319970667362213,
+      "learning_rate": 0.00012421487603305785,
+      "loss": 0.5733,
+      "step": 719
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.2770652770996094,
+      "learning_rate": 0.00012396694214876033,
+      "loss": 0.4296,
+      "step": 720
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.3109978437423706,
+      "learning_rate": 0.0001237190082644628,
+      "loss": 0.3757,
+      "step": 721
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.23606395721435547,
+      "learning_rate": 0.00012347107438016527,
+      "loss": 0.2713,
+      "step": 722
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.304574579000473,
+      "learning_rate": 0.00012322314049586776,
+      "loss": 0.4451,
+      "step": 723
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.31314462423324585,
+      "learning_rate": 0.00012297520661157024,
+      "loss": 0.493,
+      "step": 724
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.32014840841293335,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 0.3784,
+      "step": 725
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.29856279492378235,
+      "learning_rate": 0.0001224793388429752,
+      "loss": 0.581,
+      "step": 726
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.30951863527297974,
+      "learning_rate": 0.0001222314049586777,
+      "loss": 0.4851,
+      "step": 727
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.264663428068161,
+      "learning_rate": 0.00012198347107438015,
+      "loss": 0.431,
+      "step": 728
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.3092226982116699,
+      "learning_rate": 0.00012173553719008264,
+      "loss": 0.4553,
+      "step": 729
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.33568286895751953,
+      "learning_rate": 0.00012148760330578511,
+      "loss": 0.4894,
+      "step": 730
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.2966444492340088,
+      "learning_rate": 0.0001212396694214876,
+      "loss": 0.3855,
+      "step": 731
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.2829122841358185,
+      "learning_rate": 0.00012099173553719007,
+      "loss": 0.5328,
+      "step": 732
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.31785663962364197,
+      "learning_rate": 0.00012074380165289255,
+      "loss": 0.4142,
+      "step": 733
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.2983114719390869,
+      "learning_rate": 0.00012049586776859502,
+      "loss": 0.4168,
+      "step": 734
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.2514868378639221,
+      "learning_rate": 0.0001202479338842975,
+      "loss": 0.4728,
+      "step": 735
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.2959445118904114,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 0.458,
+      "step": 736
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.31830325722694397,
+      "learning_rate": 0.00011975206611570247,
+      "loss": 0.5035,
+      "step": 737
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.31181418895721436,
+      "learning_rate": 0.00011950413223140496,
+      "loss": 0.3776,
+      "step": 738
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.3027549684047699,
+      "learning_rate": 0.00011925619834710743,
+      "loss": 0.4483,
+      "step": 739
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.28026890754699707,
+      "learning_rate": 0.00011900826446280991,
+      "loss": 0.4236,
+      "step": 740
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.29137665033340454,
+      "learning_rate": 0.00011876033057851238,
+      "loss": 0.3615,
+      "step": 741
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.282008558511734,
+      "learning_rate": 0.00011851239669421486,
+      "loss": 0.4335,
+      "step": 742
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.297736793756485,
+      "learning_rate": 0.00011826446280991733,
+      "loss": 0.4945,
+      "step": 743
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.3276868164539337,
+      "learning_rate": 0.00011801652892561982,
+      "loss": 0.5379,
+      "step": 744
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.3510095179080963,
+      "learning_rate": 0.00011776859504132231,
+      "loss": 0.3589,
+      "step": 745
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.29952242970466614,
+      "learning_rate": 0.00011752066115702478,
+      "loss": 0.3805,
+      "step": 746
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.220473513007164,
+      "learning_rate": 0.00011727272727272727,
+      "loss": 0.3978,
+      "step": 747
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.30668944120407104,
+      "learning_rate": 0.00011702479338842974,
+      "loss": 0.3577,
+      "step": 748
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.3152049779891968,
+      "learning_rate": 0.00011677685950413222,
+      "loss": 0.5186,
+      "step": 749
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.17376375198364258,
+      "learning_rate": 0.00011652892561983469,
+      "loss": 0.32,
+      "step": 750
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.32847121357917786,
+      "learning_rate": 0.00011628099173553718,
+      "loss": 0.5403,
+      "step": 751
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.28821662068367004,
+      "learning_rate": 0.00011603305785123965,
+      "loss": 0.3516,
+      "step": 752
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.23324501514434814,
+      "learning_rate": 0.00011578512396694214,
+      "loss": 0.3398,
+      "step": 753
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.2897385060787201,
+      "learning_rate": 0.00011553719008264463,
+      "loss": 0.3775,
+      "step": 754
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.33701419830322266,
+      "learning_rate": 0.0001152892561983471,
+      "loss": 0.5225,
+      "step": 755
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.3228382468223572,
+      "learning_rate": 0.00011504132231404958,
+      "loss": 0.4384,
+      "step": 756
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.24733024835586548,
+      "learning_rate": 0.00011479338842975205,
+      "loss": 0.2883,
+      "step": 757
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2824367880821228,
+      "learning_rate": 0.00011454545454545453,
+      "loss": 0.3141,
+      "step": 758
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.27844521403312683,
+      "learning_rate": 0.000114297520661157,
+      "loss": 0.3327,
+      "step": 759
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.26114732027053833,
+      "learning_rate": 0.0001140495867768595,
+      "loss": 0.4071,
+      "step": 760
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.34284186363220215,
+      "learning_rate": 0.00011380165289256197,
+      "loss": 0.4619,
+      "step": 761
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.2463303506374359,
+      "learning_rate": 0.00011355371900826446,
+      "loss": 0.3038,
+      "step": 762
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.26452890038490295,
+      "learning_rate": 0.00011330578512396693,
+      "loss": 0.3603,
+      "step": 763
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.27888497710227966,
+      "learning_rate": 0.00011305785123966941,
+      "loss": 0.5109,
+      "step": 764
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.3039766252040863,
+      "learning_rate": 0.00011280991735537189,
+      "loss": 0.5377,
+      "step": 765
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.28995901346206665,
+      "learning_rate": 0.00011256198347107436,
+      "loss": 0.4797,
+      "step": 766
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.3420790135860443,
+      "learning_rate": 0.00011231404958677686,
+      "loss": 0.5209,
+      "step": 767
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.33119046688079834,
+      "learning_rate": 0.00011206611570247933,
+      "loss": 0.3709,
+      "step": 768
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.3408135175704956,
+      "learning_rate": 0.00011181818181818181,
+      "loss": 0.4389,
+      "step": 769
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.29120129346847534,
+      "learning_rate": 0.00011157024793388428,
+      "loss": 0.4327,
+      "step": 770
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.32718029618263245,
+      "learning_rate": 0.00011132231404958677,
+      "loss": 0.4859,
+      "step": 771
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.34422147274017334,
+      "learning_rate": 0.00011107438016528924,
+      "loss": 0.5184,
+      "step": 772
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.330323189496994,
+      "learning_rate": 0.00011082644628099172,
+      "loss": 0.4322,
+      "step": 773
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.3218427002429962,
+      "learning_rate": 0.00011057851239669422,
+      "loss": 0.4129,
+      "step": 774
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.2976725995540619,
+      "learning_rate": 0.00011033057851239669,
+      "loss": 0.5039,
+      "step": 775
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.32841789722442627,
+      "learning_rate": 0.00011008264462809917,
+      "loss": 0.4718,
+      "step": 776
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.32977914810180664,
+      "learning_rate": 0.00010983471074380164,
+      "loss": 0.4248,
+      "step": 777
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.2632751166820526,
+      "learning_rate": 0.00010958677685950413,
+      "loss": 0.3458,
+      "step": 778
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.33028510212898254,
+      "learning_rate": 0.0001093388429752066,
+      "loss": 0.4884,
+      "step": 779
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.30288752913475037,
+      "learning_rate": 0.00010909090909090908,
+      "loss": 0.3776,
+      "step": 780
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.32292476296424866,
+      "learning_rate": 0.00010884297520661155,
+      "loss": 0.392,
+      "step": 781
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.31956765055656433,
+      "learning_rate": 0.00010859504132231405,
+      "loss": 0.3308,
+      "step": 782
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.280553936958313,
+      "learning_rate": 0.00010834710743801652,
+      "loss": 0.5806,
+      "step": 783
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.35859328508377075,
+      "learning_rate": 0.000108099173553719,
+      "loss": 0.5059,
+      "step": 784
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.2944432497024536,
+      "learning_rate": 0.00010785123966942148,
+      "loss": 0.5132,
+      "step": 785
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.27504968643188477,
+      "learning_rate": 0.00010760330578512395,
+      "loss": 0.3741,
+      "step": 786
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.29401764273643494,
+      "learning_rate": 0.00010735537190082644,
+      "loss": 0.4992,
+      "step": 787
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.30569151043891907,
+      "learning_rate": 0.00010710743801652891,
+      "loss": 0.5029,
+      "step": 788
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.28654801845550537,
+      "learning_rate": 0.0001068595041322314,
+      "loss": 0.4618,
+      "step": 789
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.26424363255500793,
+      "learning_rate": 0.00010661157024793387,
+      "loss": 0.3929,
+      "step": 790
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.28117212653160095,
+      "learning_rate": 0.00010636363636363636,
+      "loss": 0.5116,
+      "step": 791
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.28402891755104065,
+      "learning_rate": 0.00010611570247933883,
+      "loss": 0.3758,
+      "step": 792
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.32903602719306946,
+      "learning_rate": 0.00010586776859504131,
+      "loss": 0.3594,
+      "step": 793
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.4285104274749756,
+      "learning_rate": 0.0001056198347107438,
+      "loss": 0.3007,
+      "step": 794
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.27649369835853577,
+      "learning_rate": 0.00010537190082644627,
+      "loss": 0.342,
+      "step": 795
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.3094039261341095,
+      "learning_rate": 0.00010512396694214876,
+      "loss": 0.4452,
+      "step": 796
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.32547199726104736,
+      "learning_rate": 0.00010487603305785123,
+      "loss": 0.4274,
+      "step": 797
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.30244141817092896,
+      "learning_rate": 0.00010462809917355372,
+      "loss": 0.393,
+      "step": 798
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.3018583655357361,
+      "learning_rate": 0.00010438016528925619,
+      "loss": 0.4012,
+      "step": 799
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.36397960782051086,
+      "learning_rate": 0.00010413223140495867,
+      "loss": 0.5231,
+      "step": 800
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.3178517520427704,
+      "learning_rate": 0.00010388429752066114,
+      "loss": 0.4036,
+      "step": 801
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.34640219807624817,
+      "learning_rate": 0.00010363636363636362,
+      "loss": 0.4717,
+      "step": 802
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.302775114774704,
+      "learning_rate": 0.0001033884297520661,
+      "loss": 0.4207,
+      "step": 803
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.30845245718955994,
+      "learning_rate": 0.00010314049586776859,
+      "loss": 0.3976,
+      "step": 804
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.2689266502857208,
+      "learning_rate": 0.00010289256198347107,
+      "loss": 0.3777,
+      "step": 805
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.33539149165153503,
+      "learning_rate": 0.00010264462809917354,
+      "loss": 0.3896,
+      "step": 806
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.2548604905605316,
+      "learning_rate": 0.00010239669421487603,
+      "loss": 0.4026,
+      "step": 807
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.5050720572471619,
+      "learning_rate": 0.0001021487603305785,
+      "loss": 0.4008,
+      "step": 808
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.2518717646598816,
+      "learning_rate": 0.00010190082644628098,
+      "loss": 0.348,
+      "step": 809
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.39397895336151123,
+      "learning_rate": 0.00010165289256198345,
+      "loss": 0.5369,
+      "step": 810
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.3471471965312958,
+      "learning_rate": 0.00010140495867768595,
+      "loss": 0.5272,
+      "step": 811
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.4147883355617523,
+      "learning_rate": 0.00010115702479338842,
+      "loss": 0.427,
+      "step": 812
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.2932160794734955,
+      "learning_rate": 0.0001009090909090909,
+      "loss": 0.3274,
+      "step": 813
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.28647059202194214,
+      "learning_rate": 0.00010066115702479339,
+      "loss": 0.3346,
+      "step": 814
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.28154057264328003,
+      "learning_rate": 0.00010041322314049586,
+      "loss": 0.3785,
+      "step": 815
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.25706711411476135,
+      "learning_rate": 0.00010016528925619834,
+      "loss": 0.3261,
+      "step": 816
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.3318668603897095,
+      "learning_rate": 9.991735537190081e-05,
+      "loss": 0.4362,
+      "step": 817
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.33185282349586487,
+      "learning_rate": 9.966942148760331e-05,
+      "loss": 0.5219,
+      "step": 818
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.2683846056461334,
+      "learning_rate": 9.942148760330578e-05,
+      "loss": 0.3657,
+      "step": 819
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.2643420100212097,
+      "learning_rate": 9.917355371900826e-05,
+      "loss": 0.4697,
+      "step": 820
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.32440856099128723,
+      "learning_rate": 9.892561983471073e-05,
+      "loss": 0.5572,
+      "step": 821
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.22183597087860107,
+      "learning_rate": 9.867768595041321e-05,
+      "loss": 0.3379,
+      "step": 822
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.26266101002693176,
+      "learning_rate": 9.842975206611568e-05,
+      "loss": 0.439,
+      "step": 823
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.2978360950946808,
+      "learning_rate": 9.818181818181817e-05,
+      "loss": 0.4654,
+      "step": 824
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.2713984251022339,
+      "learning_rate": 9.793388429752067e-05,
+      "loss": 0.2983,
+      "step": 825
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.2561984956264496,
+      "learning_rate": 9.768595041322314e-05,
+      "loss": 0.3381,
+      "step": 826
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.2766323983669281,
+      "learning_rate": 9.743801652892562e-05,
+      "loss": 0.4167,
+      "step": 827
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.33810022473335266,
+      "learning_rate": 9.719008264462809e-05,
+      "loss": 0.3793,
+      "step": 828
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.3332251310348511,
+      "learning_rate": 9.694214876033057e-05,
+      "loss": 0.5517,
+      "step": 829
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.2713959515094757,
+      "learning_rate": 9.669421487603304e-05,
+      "loss": 0.3583,
+      "step": 830
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.2778157889842987,
+      "learning_rate": 9.644628099173553e-05,
+      "loss": 0.3089,
+      "step": 831
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.33538392186164856,
+      "learning_rate": 9.6198347107438e-05,
+      "loss": 0.3776,
+      "step": 832
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.32728123664855957,
+      "learning_rate": 9.59504132231405e-05,
+      "loss": 0.434,
+      "step": 833
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.30630162358283997,
+      "learning_rate": 9.570247933884298e-05,
+      "loss": 0.3913,
+      "step": 834
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.2960034906864166,
+      "learning_rate": 9.545454545454545e-05,
+      "loss": 0.4368,
+      "step": 835
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.35711923241615295,
+      "learning_rate": 9.520661157024793e-05,
+      "loss": 0.399,
+      "step": 836
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.30195897817611694,
+      "learning_rate": 9.49586776859504e-05,
+      "loss": 0.4421,
+      "step": 837
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.3220643401145935,
+      "learning_rate": 9.471074380165288e-05,
+      "loss": 0.3441,
+      "step": 838
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.3709239661693573,
+      "learning_rate": 9.446280991735535e-05,
+      "loss": 0.4095,
+      "step": 839
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.40360063314437866,
+      "learning_rate": 9.421487603305785e-05,
+      "loss": 0.5692,
+      "step": 840
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.32428041100502014,
+      "learning_rate": 9.396694214876032e-05,
+      "loss": 0.4306,
+      "step": 841
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.2750518321990967,
+      "learning_rate": 9.37190082644628e-05,
+      "loss": 0.3905,
+      "step": 842
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.331478476524353,
+      "learning_rate": 9.347107438016528e-05,
+      "loss": 0.6008,
+      "step": 843
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.3165242671966553,
+      "learning_rate": 9.322314049586776e-05,
+      "loss": 0.4624,
+      "step": 844
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.26457470655441284,
+      "learning_rate": 9.297520661157024e-05,
+      "loss": 0.4462,
+      "step": 845
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.3557126522064209,
+      "learning_rate": 9.272727272727271e-05,
+      "loss": 0.5737,
+      "step": 846
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.3306926488876343,
+      "learning_rate": 9.247933884297521e-05,
+      "loss": 0.4597,
+      "step": 847
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.24906127154827118,
+      "learning_rate": 9.223140495867768e-05,
+      "loss": 0.378,
+      "step": 848
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.29440054297447205,
+      "learning_rate": 9.198347107438016e-05,
+      "loss": 0.4562,
+      "step": 849
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.34878161549568176,
+      "learning_rate": 9.173553719008263e-05,
+      "loss": 0.4546,
+      "step": 850
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.3725307583808899,
+      "learning_rate": 9.148760330578512e-05,
+      "loss": 0.4119,
+      "step": 851
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.30648747086524963,
+      "learning_rate": 9.123966942148759e-05,
+      "loss": 0.4428,
+      "step": 852
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.2755535840988159,
+      "learning_rate": 9.099173553719007e-05,
+      "loss": 0.3592,
+      "step": 853
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.2802577614784241,
+      "learning_rate": 9.074380165289255e-05,
+      "loss": 0.472,
+      "step": 854
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.28871360421180725,
+      "learning_rate": 9.049586776859504e-05,
+      "loss": 0.4532,
+      "step": 855
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.37071362137794495,
+      "learning_rate": 9.024793388429752e-05,
+      "loss": 0.3426,
+      "step": 856
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.30081430077552795,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.4069,
+      "step": 857
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.3186596930027008,
+      "learning_rate": 8.975206611570248e-05,
+      "loss": 0.4997,
+      "step": 858
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.286479115486145,
+      "learning_rate": 8.950413223140495e-05,
+      "loss": 0.3902,
+      "step": 859
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.3457258939743042,
+      "learning_rate": 8.925619834710743e-05,
+      "loss": 0.4339,
+      "step": 860
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.30513113737106323,
+      "learning_rate": 8.90082644628099e-05,
+      "loss": 0.3414,
+      "step": 861
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.30697953701019287,
+      "learning_rate": 8.876033057851238e-05,
+      "loss": 0.4657,
+      "step": 862
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.3395203649997711,
+      "learning_rate": 8.851239669421488e-05,
+      "loss": 0.3945,
+      "step": 863
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.43322789669036865,
+      "learning_rate": 8.826446280991735e-05,
+      "loss": 0.5337,
+      "step": 864
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.3421814739704132,
+      "learning_rate": 8.801652892561983e-05,
+      "loss": 0.4481,
+      "step": 865
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.24497461318969727,
+      "learning_rate": 8.77685950413223e-05,
+      "loss": 0.4199,
+      "step": 866
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.3835270404815674,
+      "learning_rate": 8.752066115702479e-05,
+      "loss": 0.5534,
+      "step": 867
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.3144569396972656,
+      "learning_rate": 8.727272727272726e-05,
+      "loss": 0.4563,
+      "step": 868
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.2757865786552429,
+      "learning_rate": 8.702479338842974e-05,
+      "loss": 0.4241,
+      "step": 869
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.28413090109825134,
+      "learning_rate": 8.677685950413221e-05,
+      "loss": 0.3484,
+      "step": 870
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.27918362617492676,
+      "learning_rate": 8.652892561983471e-05,
+      "loss": 0.4133,
+      "step": 871
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.3901917040348053,
+      "learning_rate": 8.628099173553718e-05,
+      "loss": 0.4755,
+      "step": 872
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.34810692071914673,
+      "learning_rate": 8.603305785123966e-05,
+      "loss": 0.4516,
+      "step": 873
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.3317393958568573,
+      "learning_rate": 8.578512396694215e-05,
+      "loss": 0.4995,
+      "step": 874
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.26235052943229675,
+      "learning_rate": 8.553719008264462e-05,
+      "loss": 0.3348,
+      "step": 875
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.2735447585582733,
+      "learning_rate": 8.52892561983471e-05,
+      "loss": 0.2932,
+      "step": 876
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.30968329310417175,
+      "learning_rate": 8.504132231404957e-05,
+      "loss": 0.3783,
+      "step": 877
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.30193984508514404,
+      "learning_rate": 8.479338842975207e-05,
+      "loss": 0.4357,
+      "step": 878
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.3407258987426758,
+      "learning_rate": 8.454545454545454e-05,
+      "loss": 0.4821,
+      "step": 879
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.28090009093284607,
+      "learning_rate": 8.429752066115702e-05,
+      "loss": 0.4158,
+      "step": 880
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.2898884415626526,
+      "learning_rate": 8.404958677685949e-05,
+      "loss": 0.3091,
+      "step": 881
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.31658637523651123,
+      "learning_rate": 8.380165289256197e-05,
+      "loss": 0.3773,
+      "step": 882
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.2722189724445343,
+      "learning_rate": 8.355371900826446e-05,
+      "loss": 0.4483,
+      "step": 883
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.23621954023838043,
+      "learning_rate": 8.330578512396693e-05,
+      "loss": 0.3112,
+      "step": 884
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.3659461438655853,
+      "learning_rate": 8.305785123966942e-05,
+      "loss": 0.4507,
+      "step": 885
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.3253099322319031,
+      "learning_rate": 8.28099173553719e-05,
+      "loss": 0.4854,
+      "step": 886
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.3201637864112854,
+      "learning_rate": 8.256198347107438e-05,
+      "loss": 0.5687,
+      "step": 887
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.4112270772457123,
+      "learning_rate": 8.231404958677685e-05,
+      "loss": 0.3742,
+      "step": 888
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.3146194517612457,
+      "learning_rate": 8.206611570247933e-05,
+      "loss": 0.4869,
+      "step": 889
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.34321263432502747,
+      "learning_rate": 8.18181818181818e-05,
+      "loss": 0.5154,
+      "step": 890
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.2986968159675598,
+      "learning_rate": 8.157024793388429e-05,
+      "loss": 0.647,
+      "step": 891
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.3427133858203888,
+      "learning_rate": 8.132231404958676e-05,
+      "loss": 0.3912,
+      "step": 892
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.3434309661388397,
+      "learning_rate": 8.107438016528925e-05,
+      "loss": 0.51,
+      "step": 893
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.32024991512298584,
+      "learning_rate": 8.082644628099174e-05,
+      "loss": 0.387,
+      "step": 894
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.2961815595626831,
+      "learning_rate": 8.057851239669421e-05,
+      "loss": 0.3909,
+      "step": 895
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.3219030201435089,
+      "learning_rate": 8.033057851239669e-05,
+      "loss": 0.3911,
+      "step": 896
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.2776000201702118,
+      "learning_rate": 8.008264462809916e-05,
+      "loss": 0.3625,
+      "step": 897
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.31484290957450867,
+      "learning_rate": 7.983471074380164e-05,
+      "loss": 0.6162,
+      "step": 898
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.2789134085178375,
+      "learning_rate": 7.958677685950411e-05,
+      "loss": 0.3199,
+      "step": 899
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.27821627259254456,
+      "learning_rate": 7.933884297520661e-05,
+      "loss": 0.4295,
+      "step": 900
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.3022254705429077,
+      "learning_rate": 7.909090909090908e-05,
+      "loss": 0.309,
+      "step": 901
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.24830293655395508,
+      "learning_rate": 7.884297520661157e-05,
+      "loss": 0.3833,
+      "step": 902
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.31184327602386475,
+      "learning_rate": 7.859504132231405e-05,
+      "loss": 0.3715,
+      "step": 903
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.2993053197860718,
+      "learning_rate": 7.834710743801652e-05,
+      "loss": 0.3825,
+      "step": 904
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.3385005295276642,
+      "learning_rate": 7.8099173553719e-05,
+      "loss": 0.4868,
+      "step": 905
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.26812323927879333,
+      "learning_rate": 7.785123966942147e-05,
+      "loss": 0.2925,
+      "step": 906
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.3275848925113678,
+      "learning_rate": 7.760330578512397e-05,
+      "loss": 0.3657,
+      "step": 907
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.2972089350223541,
+      "learning_rate": 7.735537190082644e-05,
+      "loss": 0.4396,
+      "step": 908
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.27619728446006775,
+      "learning_rate": 7.710743801652892e-05,
+      "loss": 0.3946,
+      "step": 909
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.30436667799949646,
+      "learning_rate": 7.68595041322314e-05,
+      "loss": 0.4177,
+      "step": 910
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.2652393877506256,
+      "learning_rate": 7.661157024793388e-05,
+      "loss": 0.3165,
+      "step": 911
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.28303712606430054,
+      "learning_rate": 7.636363636363635e-05,
+      "loss": 0.4829,
+      "step": 912
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.33964964747428894,
+      "learning_rate": 7.611570247933883e-05,
+      "loss": 0.5043,
+      "step": 913
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.2591302692890167,
+      "learning_rate": 7.586776859504133e-05,
+      "loss": 0.3814,
+      "step": 914
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.3488747179508209,
+      "learning_rate": 7.56198347107438e-05,
+      "loss": 0.5233,
+      "step": 915
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.29015597701072693,
+      "learning_rate": 7.537190082644628e-05,
+      "loss": 0.4672,
+      "step": 916
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.31618839502334595,
+      "learning_rate": 7.512396694214875e-05,
+      "loss": 0.4538,
+      "step": 917
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.35049545764923096,
+      "learning_rate": 7.487603305785124e-05,
+      "loss": 0.4089,
+      "step": 918
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.34093132615089417,
+      "learning_rate": 7.462809917355372e-05,
+      "loss": 0.4306,
+      "step": 919
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.30601584911346436,
+      "learning_rate": 7.438016528925619e-05,
+      "loss": 0.4396,
+      "step": 920
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.45013612508773804,
+      "learning_rate": 7.413223140495867e-05,
+      "loss": 0.4477,
+      "step": 921
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.30486834049224854,
+      "learning_rate": 7.388429752066116e-05,
+      "loss": 0.3777,
+      "step": 922
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.3926061689853668,
+      "learning_rate": 7.363636363636363e-05,
+      "loss": 0.3532,
+      "step": 923
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.3843371272087097,
+      "learning_rate": 7.338842975206611e-05,
+      "loss": 0.5182,
+      "step": 924
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.30922451615333557,
+      "learning_rate": 7.314049586776858e-05,
+      "loss": 0.4361,
+      "step": 925
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.3367323875427246,
+      "learning_rate": 7.289256198347106e-05,
+      "loss": 0.3809,
+      "step": 926
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.39369019865989685,
+      "learning_rate": 7.264462809917355e-05,
+      "loss": 0.3623,
+      "step": 927
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.3159162104129791,
+      "learning_rate": 7.239669421487603e-05,
+      "loss": 0.5059,
+      "step": 928
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.34716740250587463,
+      "learning_rate": 7.214876033057851e-05,
+      "loss": 0.4201,
+      "step": 929
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.20480923354625702,
+      "learning_rate": 7.190082644628098e-05,
+      "loss": 0.2699,
+      "step": 930
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.3518913686275482,
+      "learning_rate": 7.165289256198347e-05,
+      "loss": 0.5337,
+      "step": 931
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.28605952858924866,
+      "learning_rate": 7.140495867768594e-05,
+      "loss": 0.44,
+      "step": 932
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.28229033946990967,
+      "learning_rate": 7.115702479338842e-05,
+      "loss": 0.3534,
+      "step": 933
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.3456754684448242,
+      "learning_rate": 7.09090909090909e-05,
+      "loss": 0.3952,
+      "step": 934
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.27707159519195557,
+      "learning_rate": 7.066115702479338e-05,
+      "loss": 0.3667,
+      "step": 935
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.2811780273914337,
+      "learning_rate": 7.041322314049586e-05,
+      "loss": 0.3954,
+      "step": 936
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.3099793493747711,
+      "learning_rate": 7.016528925619834e-05,
+      "loss": 0.441,
+      "step": 937
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.4153590500354767,
+      "learning_rate": 6.991735537190083e-05,
+      "loss": 0.4462,
+      "step": 938
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.2945801615715027,
+      "learning_rate": 6.96694214876033e-05,
+      "loss": 0.4535,
+      "step": 939
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.2930592894554138,
+      "learning_rate": 6.942148760330578e-05,
+      "loss": 0.5566,
+      "step": 940
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.3034913241863251,
+      "learning_rate": 6.917355371900826e-05,
+      "loss": 0.4695,
+      "step": 941
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.3054913878440857,
+      "learning_rate": 6.892561983471073e-05,
+      "loss": 0.3921,
+      "step": 942
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.3297981917858124,
+      "learning_rate": 6.867768595041322e-05,
+      "loss": 0.5057,
+      "step": 943
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.23640452325344086,
+      "learning_rate": 6.84297520661157e-05,
+      "loss": 0.329,
+      "step": 944
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.2970188856124878,
+      "learning_rate": 6.818181818181817e-05,
+      "loss": 0.4376,
+      "step": 945
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.3243064880371094,
+      "learning_rate": 6.793388429752065e-05,
+      "loss": 0.4922,
+      "step": 946
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.4473859667778015,
+      "learning_rate": 6.768595041322312e-05,
+      "loss": 0.5245,
+      "step": 947
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.2901310622692108,
+      "learning_rate": 6.743801652892562e-05,
+      "loss": 0.4996,
+      "step": 948
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.3633457124233246,
+      "learning_rate": 6.719008264462809e-05,
+      "loss": 0.4669,
+      "step": 949
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.33570581674575806,
+      "learning_rate": 6.694214876033058e-05,
+      "loss": 0.404,
+      "step": 950
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.26466354727745056,
+      "learning_rate": 6.669421487603306e-05,
+      "loss": 0.2881,
+      "step": 951
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.29028353095054626,
+      "learning_rate": 6.644628099173553e-05,
+      "loss": 0.3607,
+      "step": 952
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.2878669798374176,
+      "learning_rate": 6.619834710743801e-05,
+      "loss": 0.4415,
+      "step": 953
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.33260804414749146,
+      "learning_rate": 6.595041322314048e-05,
+      "loss": 0.4424,
+      "step": 954
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.3135119378566742,
+      "learning_rate": 6.570247933884297e-05,
+      "loss": 0.4276,
+      "step": 955
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.2714795470237732,
+      "learning_rate": 6.545454545454545e-05,
+      "loss": 0.2789,
+      "step": 956
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.3564438819885254,
+      "learning_rate": 6.520661157024792e-05,
+      "loss": 0.4683,
+      "step": 957
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.3303399682044983,
+      "learning_rate": 6.495867768595042e-05,
+      "loss": 0.4657,
+      "step": 958
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.30086350440979004,
+      "learning_rate": 6.471074380165289e-05,
+      "loss": 0.3296,
+      "step": 959
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.34699100255966187,
+      "learning_rate": 6.446280991735537e-05,
+      "loss": 0.3543,
+      "step": 960
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.326579213142395,
+      "learning_rate": 6.421487603305784e-05,
+      "loss": 0.4001,
+      "step": 961
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.3462665379047394,
+      "learning_rate": 6.396694214876032e-05,
+      "loss": 0.3999,
+      "step": 962
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.3408821225166321,
+      "learning_rate": 6.371900826446281e-05,
+      "loss": 0.3614,
+      "step": 963
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.3061428666114807,
+      "learning_rate": 6.347107438016528e-05,
+      "loss": 0.4127,
+      "step": 964
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.30745938420295715,
+      "learning_rate": 6.322314049586776e-05,
+      "loss": 0.3965,
+      "step": 965
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.33782872557640076,
+      "learning_rate": 6.297520661157025e-05,
+      "loss": 0.5026,
+      "step": 966
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.3501698076725006,
+      "learning_rate": 6.272727272727272e-05,
+      "loss": 0.4731,
+      "step": 967
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.3578520119190216,
+      "learning_rate": 6.24793388429752e-05,
+      "loss": 0.4302,
+      "step": 968
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.30132660269737244,
+      "learning_rate": 6.223140495867768e-05,
+      "loss": 0.3784,
+      "step": 969
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.29198774695396423,
+      "learning_rate": 6.198347107438017e-05,
+      "loss": 0.396,
+      "step": 970
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.3028549551963806,
+      "learning_rate": 6.173553719008264e-05,
+      "loss": 0.3531,
+      "step": 971
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.3193860352039337,
+      "learning_rate": 6.148760330578512e-05,
+      "loss": 0.5261,
+      "step": 972
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.330228716135025,
+      "learning_rate": 6.12396694214876e-05,
+      "loss": 0.3853,
+      "step": 973
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.2856347858905792,
+      "learning_rate": 6.0991735537190074e-05,
+      "loss": 0.4543,
+      "step": 974
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.3663886487483978,
+      "learning_rate": 6.074380165289256e-05,
+      "loss": 0.3821,
+      "step": 975
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.3297857642173767,
+      "learning_rate": 6.0495867768595034e-05,
+      "loss": 0.4504,
+      "step": 976
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.29853883385658264,
+      "learning_rate": 6.024793388429751e-05,
+      "loss": 0.3528,
+      "step": 977
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.3246425986289978,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 0.3986,
+      "step": 978
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.3537238836288452,
+      "learning_rate": 5.975206611570248e-05,
+      "loss": 0.3776,
+      "step": 979
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.2915757894515991,
+      "learning_rate": 5.9504132231404955e-05,
+      "loss": 0.2895,
+      "step": 980
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.30707284808158875,
+      "learning_rate": 5.925619834710743e-05,
+      "loss": 0.3238,
+      "step": 981
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.301845520734787,
+      "learning_rate": 5.900826446280991e-05,
+      "loss": 0.4031,
+      "step": 982
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.24002347886562347,
+      "learning_rate": 5.876033057851239e-05,
+      "loss": 0.3477,
+      "step": 983
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.3008634150028229,
+      "learning_rate": 5.851239669421487e-05,
+      "loss": 0.4595,
+      "step": 984
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.32416027784347534,
+      "learning_rate": 5.8264462809917346e-05,
+      "loss": 0.403,
+      "step": 985
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.3158760368824005,
+      "learning_rate": 5.801652892561982e-05,
+      "loss": 0.305,
+      "step": 986
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.33743736147880554,
+      "learning_rate": 5.7768595041322313e-05,
+      "loss": 0.4867,
+      "step": 987
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.3402981460094452,
+      "learning_rate": 5.752066115702479e-05,
+      "loss": 0.3982,
+      "step": 988
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.3389660716056824,
+      "learning_rate": 5.727272727272727e-05,
+      "loss": 0.4311,
+      "step": 989
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.47749587893486023,
+      "learning_rate": 5.702479338842975e-05,
+      "loss": 0.3775,
+      "step": 990
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.27538084983825684,
+      "learning_rate": 5.677685950413223e-05,
+      "loss": 0.3568,
+      "step": 991
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.33023789525032043,
+      "learning_rate": 5.6528925619834704e-05,
+      "loss": 0.4225,
+      "step": 992
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.28135445713996887,
+      "learning_rate": 5.628099173553718e-05,
+      "loss": 0.3658,
+      "step": 993
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.3511416018009186,
+      "learning_rate": 5.6033057851239665e-05,
+      "loss": 0.3928,
+      "step": 994
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.2987925708293915,
+      "learning_rate": 5.578512396694214e-05,
+      "loss": 0.4015,
+      "step": 995
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.3340010344982147,
+      "learning_rate": 5.553719008264462e-05,
+      "loss": 0.4566,
+      "step": 996
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.23461014032363892,
+      "learning_rate": 5.528925619834711e-05,
+      "loss": 0.3556,
+      "step": 997
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.3425525724887848,
+      "learning_rate": 5.5041322314049586e-05,
+      "loss": 0.3736,
+      "step": 998
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.33320698142051697,
+      "learning_rate": 5.479338842975206e-05,
+      "loss": 0.3926,
+      "step": 999
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.26936790347099304,
+      "learning_rate": 5.454545454545454e-05,
+      "loss": 0.3587,
+      "step": 1000
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.322934091091156,
+      "learning_rate": 5.429752066115702e-05,
+      "loss": 0.3119,
+      "step": 1001
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.3295484483242035,
+      "learning_rate": 5.40495867768595e-05,
+      "loss": 0.3257,
+      "step": 1002
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.2893584370613098,
+      "learning_rate": 5.380165289256198e-05,
+      "loss": 0.3451,
+      "step": 1003
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.3215138912200928,
+      "learning_rate": 5.3553719008264454e-05,
+      "loss": 0.4104,
+      "step": 1004
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.19545914232730865,
+      "learning_rate": 5.330578512396694e-05,
+      "loss": 0.2245,
+      "step": 1005
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.2952648103237152,
+      "learning_rate": 5.3057851239669414e-05,
+      "loss": 0.3393,
+      "step": 1006
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.34105175733566284,
+      "learning_rate": 5.28099173553719e-05,
+      "loss": 0.519,
+      "step": 1007
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.3435216546058655,
+      "learning_rate": 5.256198347107438e-05,
+      "loss": 0.4968,
+      "step": 1008
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.29052355885505676,
+      "learning_rate": 5.231404958677686e-05,
+      "loss": 0.4419,
+      "step": 1009
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.3326230049133301,
+      "learning_rate": 5.2066115702479335e-05,
+      "loss": 0.4461,
+      "step": 1010
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.35595494508743286,
+      "learning_rate": 5.181818181818181e-05,
+      "loss": 0.4886,
+      "step": 1011
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.3467525541782379,
+      "learning_rate": 5.1570247933884295e-05,
+      "loss": 0.4671,
+      "step": 1012
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.29460448026657104,
+      "learning_rate": 5.132231404958677e-05,
+      "loss": 0.3872,
+      "step": 1013
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.273575097322464,
+      "learning_rate": 5.107438016528925e-05,
+      "loss": 0.3603,
+      "step": 1014
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.3603818416595459,
+      "learning_rate": 5.0826446280991726e-05,
+      "loss": 0.3539,
+      "step": 1015
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.31469517946243286,
+      "learning_rate": 5.057851239669421e-05,
+      "loss": 0.3988,
+      "step": 1016
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.3218969702720642,
+      "learning_rate": 5.033057851239669e-05,
+      "loss": 0.4366,
+      "step": 1017
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.34077420830726624,
+      "learning_rate": 5.008264462809917e-05,
+      "loss": 0.4248,
+      "step": 1018
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.322591096162796,
+      "learning_rate": 4.9834710743801654e-05,
+      "loss": 0.5081,
+      "step": 1019
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.35607361793518066,
+      "learning_rate": 4.958677685950413e-05,
+      "loss": 0.3596,
+      "step": 1020
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.2865798771381378,
+      "learning_rate": 4.933884297520661e-05,
+      "loss": 0.2703,
+      "step": 1021
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.30387502908706665,
+      "learning_rate": 4.9090909090909084e-05,
+      "loss": 0.3051,
+      "step": 1022
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.3474448323249817,
+      "learning_rate": 4.884297520661157e-05,
+      "loss": 0.2851,
+      "step": 1023
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.3696686625480652,
+      "learning_rate": 4.8595041322314045e-05,
+      "loss": 0.4403,
+      "step": 1024
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.33602291345596313,
+      "learning_rate": 4.834710743801652e-05,
+      "loss": 0.4134,
+      "step": 1025
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.27331918478012085,
+      "learning_rate": 4.8099173553719e-05,
+      "loss": 0.3303,
+      "step": 1026
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.3705825209617615,
+      "learning_rate": 4.785123966942149e-05,
+      "loss": 0.3411,
+      "step": 1027
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.4541082978248596,
+      "learning_rate": 4.7603305785123966e-05,
+      "loss": 0.4263,
+      "step": 1028
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.29885897040367126,
+      "learning_rate": 4.735537190082644e-05,
+      "loss": 0.5602,
+      "step": 1029
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.35169675946235657,
+      "learning_rate": 4.7107438016528926e-05,
+      "loss": 0.4409,
+      "step": 1030
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.41590291261672974,
+      "learning_rate": 4.68595041322314e-05,
+      "loss": 0.4355,
+      "step": 1031
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.33613288402557373,
+      "learning_rate": 4.661157024793388e-05,
+      "loss": 0.4399,
+      "step": 1032
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.3519938886165619,
+      "learning_rate": 4.6363636363636356e-05,
+      "loss": 0.4464,
+      "step": 1033
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.2981269359588623,
+      "learning_rate": 4.611570247933884e-05,
+      "loss": 0.3667,
+      "step": 1034
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.32030418515205383,
+      "learning_rate": 4.586776859504132e-05,
+      "loss": 0.3759,
+      "step": 1035
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.39815372228622437,
+      "learning_rate": 4.5619834710743794e-05,
+      "loss": 0.3259,
+      "step": 1036
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.33106112480163574,
+      "learning_rate": 4.537190082644628e-05,
+      "loss": 0.4985,
+      "step": 1037
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.3748137950897217,
+      "learning_rate": 4.512396694214876e-05,
+      "loss": 0.5177,
+      "step": 1038
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.31328514218330383,
+      "learning_rate": 4.487603305785124e-05,
+      "loss": 0.3406,
+      "step": 1039
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.35391247272491455,
+      "learning_rate": 4.4628099173553715e-05,
+      "loss": 0.4216,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.37352749705314636,
+      "learning_rate": 4.438016528925619e-05,
+      "loss": 0.4936,
+      "step": 1041
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.20523978769779205,
+      "learning_rate": 4.4132231404958675e-05,
+      "loss": 0.2241,
+      "step": 1042
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.26052072644233704,
+      "learning_rate": 4.388429752066115e-05,
+      "loss": 0.352,
+      "step": 1043
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.30189159512519836,
+      "learning_rate": 4.363636363636363e-05,
+      "loss": 0.3956,
+      "step": 1044
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.28206998109817505,
+      "learning_rate": 4.3388429752066106e-05,
+      "loss": 0.3073,
+      "step": 1045
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.3497346341609955,
+      "learning_rate": 4.314049586776859e-05,
+      "loss": 0.4544,
+      "step": 1046
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.31490492820739746,
+      "learning_rate": 4.289256198347107e-05,
+      "loss": 0.4809,
+      "step": 1047
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.26548659801483154,
+      "learning_rate": 4.264462809917355e-05,
+      "loss": 0.3189,
+      "step": 1048
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.40890252590179443,
+      "learning_rate": 4.239669421487603e-05,
+      "loss": 0.4825,
+      "step": 1049
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.392419695854187,
+      "learning_rate": 4.214876033057851e-05,
+      "loss": 0.3518,
+      "step": 1050
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.3267776370048523,
+      "learning_rate": 4.190082644628099e-05,
+      "loss": 0.5964,
+      "step": 1051
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.29872927069664,
+      "learning_rate": 4.1652892561983464e-05,
+      "loss": 0.3496,
+      "step": 1052
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.3140263259410858,
+      "learning_rate": 4.140495867768595e-05,
+      "loss": 0.3496,
+      "step": 1053
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.35923945903778076,
+      "learning_rate": 4.1157024793388424e-05,
+      "loss": 0.4328,
+      "step": 1054
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.24899311363697052,
+      "learning_rate": 4.09090909090909e-05,
+      "loss": 0.3662,
+      "step": 1055
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.300325870513916,
+      "learning_rate": 4.066115702479338e-05,
+      "loss": 0.3714,
+      "step": 1056
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.26927053928375244,
+      "learning_rate": 4.041322314049587e-05,
+      "loss": 0.3518,
+      "step": 1057
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.28170421719551086,
+      "learning_rate": 4.0165289256198345e-05,
+      "loss": 0.4214,
+      "step": 1058
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.3097275197505951,
+      "learning_rate": 3.991735537190082e-05,
+      "loss": 0.3387,
+      "step": 1059
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.36259180307388306,
+      "learning_rate": 3.9669421487603306e-05,
+      "loss": 0.4968,
+      "step": 1060
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.3555668592453003,
+      "learning_rate": 3.942148760330578e-05,
+      "loss": 0.4415,
+      "step": 1061
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.2894740104675293,
+      "learning_rate": 3.917355371900826e-05,
+      "loss": 0.3911,
+      "step": 1062
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.3361656665802002,
+      "learning_rate": 3.8925619834710736e-05,
+      "loss": 0.4286,
+      "step": 1063
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.33269697427749634,
+      "learning_rate": 3.867768595041322e-05,
+      "loss": 0.5162,
+      "step": 1064
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.3324260711669922,
+      "learning_rate": 3.84297520661157e-05,
+      "loss": 0.4073,
+      "step": 1065
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.3037840723991394,
+      "learning_rate": 3.8181818181818174e-05,
+      "loss": 0.4084,
+      "step": 1066
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.29843100905418396,
+      "learning_rate": 3.7933884297520664e-05,
+      "loss": 0.4028,
+      "step": 1067
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.24433061480522156,
+      "learning_rate": 3.768595041322314e-05,
+      "loss": 0.3769,
+      "step": 1068
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.31540754437446594,
+      "learning_rate": 3.743801652892562e-05,
+      "loss": 0.4006,
+      "step": 1069
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.3915780186653137,
+      "learning_rate": 3.7190082644628094e-05,
+      "loss": 0.3859,
+      "step": 1070
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.7843402028083801,
+      "learning_rate": 3.694214876033058e-05,
+      "loss": 0.4284,
+      "step": 1071
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.3000487685203552,
+      "learning_rate": 3.6694214876033055e-05,
+      "loss": 0.6066,
+      "step": 1072
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.2342897206544876,
+      "learning_rate": 3.644628099173553e-05,
+      "loss": 0.3012,
+      "step": 1073
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.3100823760032654,
+      "learning_rate": 3.6198347107438015e-05,
+      "loss": 0.4236,
+      "step": 1074
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.3442421853542328,
+      "learning_rate": 3.595041322314049e-05,
+      "loss": 0.4716,
+      "step": 1075
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.2785506546497345,
+      "learning_rate": 3.570247933884297e-05,
+      "loss": 0.307,
+      "step": 1076
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.333635151386261,
+      "learning_rate": 3.545454545454545e-05,
+      "loss": 0.4521,
+      "step": 1077
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.3365010619163513,
+      "learning_rate": 3.520661157024793e-05,
+      "loss": 0.4522,
+      "step": 1078
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.31510964035987854,
+      "learning_rate": 3.495867768595041e-05,
+      "loss": 0.4101,
+      "step": 1079
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.2939818501472473,
+      "learning_rate": 3.471074380165289e-05,
+      "loss": 0.378,
+      "step": 1080
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.33073171973228455,
+      "learning_rate": 3.446280991735537e-05,
+      "loss": 0.4319,
+      "step": 1081
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.306769460439682,
+      "learning_rate": 3.421487603305785e-05,
+      "loss": 0.4584,
+      "step": 1082
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.3151317536830902,
+      "learning_rate": 3.396694214876033e-05,
+      "loss": 0.3202,
+      "step": 1083
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.313348650932312,
+      "learning_rate": 3.371900826446281e-05,
+      "loss": 0.4051,
+      "step": 1084
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.3377431333065033,
+      "learning_rate": 3.347107438016529e-05,
+      "loss": 0.3842,
+      "step": 1085
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.31378257274627686,
+      "learning_rate": 3.3223140495867765e-05,
+      "loss": 0.377,
+      "step": 1086
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.31627315282821655,
+      "learning_rate": 3.297520661157024e-05,
+      "loss": 0.4278,
+      "step": 1087
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.2957272529602051,
+      "learning_rate": 3.2727272727272725e-05,
+      "loss": 0.3384,
+      "step": 1088
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.3261624872684479,
+      "learning_rate": 3.247933884297521e-05,
+      "loss": 0.4525,
+      "step": 1089
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.28680557012557983,
+      "learning_rate": 3.2231404958677685e-05,
+      "loss": 0.3627,
+      "step": 1090
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.29543063044548035,
+      "learning_rate": 3.198347107438016e-05,
+      "loss": 0.2922,
+      "step": 1091
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.3554795980453491,
+      "learning_rate": 3.173553719008264e-05,
+      "loss": 0.4692,
+      "step": 1092
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.28728747367858887,
+      "learning_rate": 3.148760330578512e-05,
+      "loss": 0.2595,
+      "step": 1093
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.3099517524242401,
+      "learning_rate": 3.12396694214876e-05,
+      "loss": 0.3912,
+      "step": 1094
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.3173176050186157,
+      "learning_rate": 3.099173553719008e-05,
+      "loss": 0.4186,
+      "step": 1095
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.3445116877555847,
+      "learning_rate": 3.074380165289256e-05,
+      "loss": 0.4966,
+      "step": 1096
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.32030245661735535,
+      "learning_rate": 3.0495867768595037e-05,
+      "loss": 0.4671,
+      "step": 1097
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.3321797847747803,
+      "learning_rate": 3.0247933884297517e-05,
+      "loss": 0.4265,
+      "step": 1098
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.36085036396980286,
+      "learning_rate": 2.9999999999999997e-05,
+      "loss": 0.4324,
+      "step": 1099
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.2999497950077057,
+      "learning_rate": 2.9752066115702478e-05,
+      "loss": 0.3173,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.31063607335090637,
+      "learning_rate": 2.9504132231404954e-05,
+      "loss": 0.4941,
+      "step": 1101
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.2864468991756439,
+      "learning_rate": 2.9256198347107435e-05,
+      "loss": 0.4309,
+      "step": 1102
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.2904879152774811,
+      "learning_rate": 2.900826446280991e-05,
+      "loss": 0.4782,
+      "step": 1103
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.31169822812080383,
+      "learning_rate": 2.8760330578512395e-05,
+      "loss": 0.4881,
+      "step": 1104
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.3462170660495758,
+      "learning_rate": 2.8512396694214875e-05,
+      "loss": 0.3551,
+      "step": 1105
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.3066549301147461,
+      "learning_rate": 2.8264462809917352e-05,
+      "loss": 0.4522,
+      "step": 1106
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.33785369992256165,
+      "learning_rate": 2.8016528925619832e-05,
+      "loss": 0.3763,
+      "step": 1107
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.2975507378578186,
+      "learning_rate": 2.776859504132231e-05,
+      "loss": 0.3193,
+      "step": 1108
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.31934845447540283,
+      "learning_rate": 2.7520661157024793e-05,
+      "loss": 0.2994,
+      "step": 1109
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.29450473189353943,
+      "learning_rate": 2.727272727272727e-05,
+      "loss": 0.4279,
+      "step": 1110
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.3054717779159546,
+      "learning_rate": 2.702479338842975e-05,
+      "loss": 0.4687,
+      "step": 1111
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.32938167452812195,
+      "learning_rate": 2.6776859504132227e-05,
+      "loss": 0.4815,
+      "step": 1112
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.2678495943546295,
+      "learning_rate": 2.6528925619834707e-05,
+      "loss": 0.3116,
+      "step": 1113
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.26357004046440125,
+      "learning_rate": 2.628099173553719e-05,
+      "loss": 0.3286,
+      "step": 1114
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.3359578251838684,
+      "learning_rate": 2.6033057851239667e-05,
+      "loss": 0.4137,
+      "step": 1115
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.3395717442035675,
+      "learning_rate": 2.5785123966942148e-05,
+      "loss": 0.3812,
+      "step": 1116
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.29891693592071533,
+      "learning_rate": 2.5537190082644625e-05,
+      "loss": 0.2989,
+      "step": 1117
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.402649462223053,
+      "learning_rate": 2.5289256198347105e-05,
+      "loss": 0.4333,
+      "step": 1118
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.3397662341594696,
+      "learning_rate": 2.5041322314049585e-05,
+      "loss": 0.4188,
+      "step": 1119
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.33743607997894287,
+      "learning_rate": 2.4793388429752065e-05,
+      "loss": 0.5309,
+      "step": 1120
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.3248274624347687,
+      "learning_rate": 2.4545454545454542e-05,
+      "loss": 0.3905,
+      "step": 1121
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.3567257821559906,
+      "learning_rate": 2.4297520661157022e-05,
+      "loss": 0.4107,
+      "step": 1122
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.4383893311023712,
+      "learning_rate": 2.40495867768595e-05,
+      "loss": 0.5024,
+      "step": 1123
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.2777807414531708,
+      "learning_rate": 2.3801652892561983e-05,
+      "loss": 0.3289,
+      "step": 1124
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.3409118950366974,
+      "learning_rate": 2.3553719008264463e-05,
+      "loss": 0.5199,
+      "step": 1125
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.3060845732688904,
+      "learning_rate": 2.330578512396694e-05,
+      "loss": 0.412,
+      "step": 1126
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.3366425335407257,
+      "learning_rate": 2.305785123966942e-05,
+      "loss": 0.484,
+      "step": 1127
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.36060798168182373,
+      "learning_rate": 2.2809917355371897e-05,
+      "loss": 0.5543,
+      "step": 1128
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.25729015469551086,
+      "learning_rate": 2.256198347107438e-05,
+      "loss": 0.2763,
+      "step": 1129
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.2890430688858032,
+      "learning_rate": 2.2314049586776857e-05,
+      "loss": 0.3762,
+      "step": 1130
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.31579041481018066,
+      "learning_rate": 2.2066115702479338e-05,
+      "loss": 0.396,
+      "step": 1131
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.3136342763900757,
+      "learning_rate": 2.1818181818181814e-05,
+      "loss": 0.4134,
+      "step": 1132
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.37239784002304077,
+      "learning_rate": 2.1570247933884295e-05,
+      "loss": 0.4666,
+      "step": 1133
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.2847795784473419,
+      "learning_rate": 2.1322314049586775e-05,
+      "loss": 0.3481,
+      "step": 1134
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.27870920300483704,
+      "learning_rate": 2.1074380165289255e-05,
+      "loss": 0.2669,
+      "step": 1135
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.2700231969356537,
+      "learning_rate": 2.0826446280991732e-05,
+      "loss": 0.2798,
+      "step": 1136
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.3257925510406494,
+      "learning_rate": 2.0578512396694212e-05,
+      "loss": 0.4931,
+      "step": 1137
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.2964242994785309,
+      "learning_rate": 2.033057851239669e-05,
+      "loss": 0.429,
+      "step": 1138
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.32561832666397095,
+      "learning_rate": 2.0082644628099173e-05,
+      "loss": 0.3467,
+      "step": 1139
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.27957382798194885,
+      "learning_rate": 1.9834710743801653e-05,
+      "loss": 0.2686,
+      "step": 1140
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.3476884663105011,
+      "learning_rate": 1.958677685950413e-05,
+      "loss": 0.4814,
+      "step": 1141
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.2950107753276825,
+      "learning_rate": 1.933884297520661e-05,
+      "loss": 0.3578,
+      "step": 1142
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.30689096450805664,
+      "learning_rate": 1.9090909090909087e-05,
+      "loss": 0.3725,
+      "step": 1143
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.430915504693985,
+      "learning_rate": 1.884297520661157e-05,
+      "loss": 0.4766,
+      "step": 1144
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.3086168169975281,
+      "learning_rate": 1.8595041322314047e-05,
+      "loss": 0.5506,
+      "step": 1145
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.3441203534603119,
+      "learning_rate": 1.8347107438016527e-05,
+      "loss": 0.4251,
+      "step": 1146
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.2828252613544464,
+      "learning_rate": 1.8099173553719008e-05,
+      "loss": 0.3,
+      "step": 1147
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.33563023805618286,
+      "learning_rate": 1.7851239669421485e-05,
+      "loss": 0.4082,
+      "step": 1148
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.33100175857543945,
+      "learning_rate": 1.7603305785123965e-05,
+      "loss": 0.5853,
+      "step": 1149
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.3554556369781494,
+      "learning_rate": 1.7355371900826445e-05,
+      "loss": 0.5677,
+      "step": 1150
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.32995131611824036,
+      "learning_rate": 1.7107438016528925e-05,
+      "loss": 0.3315,
+      "step": 1151
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.3160393238067627,
+      "learning_rate": 1.6859504132231405e-05,
+      "loss": 0.3632,
+      "step": 1152
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.3632807433605194,
+      "learning_rate": 1.6611570247933882e-05,
+      "loss": 0.4053,
+      "step": 1153
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.2931605279445648,
+      "learning_rate": 1.6363636363636363e-05,
+      "loss": 0.358,
+      "step": 1154
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.32687610387802124,
+      "learning_rate": 1.6115702479338843e-05,
+      "loss": 0.4584,
+      "step": 1155
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.3283078074455261,
+      "learning_rate": 1.586776859504132e-05,
+      "loss": 0.3818,
+      "step": 1156
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.31993189454078674,
+      "learning_rate": 1.56198347107438e-05,
+      "loss": 0.3714,
+      "step": 1157
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.2674204409122467,
+      "learning_rate": 1.537190082644628e-05,
+      "loss": 0.3943,
+      "step": 1158
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.3968242406845093,
+      "learning_rate": 1.5123966942148759e-05,
+      "loss": 0.4465,
+      "step": 1159
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.2870213985443115,
+      "learning_rate": 1.4876033057851239e-05,
+      "loss": 0.3616,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.29502633213996887,
+      "learning_rate": 1.4628099173553717e-05,
+      "loss": 0.4112,
+      "step": 1161
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.36414459347724915,
+      "learning_rate": 1.4380165289256198e-05,
+      "loss": 0.3928,
+      "step": 1162
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.274940550327301,
+      "learning_rate": 1.4132231404958676e-05,
+      "loss": 0.3971,
+      "step": 1163
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.3382115364074707,
+      "learning_rate": 1.3884297520661155e-05,
+      "loss": 0.339,
+      "step": 1164
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.32059189677238464,
+      "learning_rate": 1.3636363636363635e-05,
+      "loss": 0.4632,
+      "step": 1165
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.40788954496383667,
+      "learning_rate": 1.3388429752066113e-05,
+      "loss": 0.4729,
+      "step": 1166
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.4415609836578369,
+      "learning_rate": 1.3140495867768595e-05,
+      "loss": 0.4311,
+      "step": 1167
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.29439279437065125,
+      "learning_rate": 1.2892561983471074e-05,
+      "loss": 0.3428,
+      "step": 1168
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.38421952724456787,
+      "learning_rate": 1.2644628099173552e-05,
+      "loss": 0.5504,
+      "step": 1169
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.2757047116756439,
+      "learning_rate": 1.2396694214876033e-05,
+      "loss": 0.3488,
+      "step": 1170
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.27029332518577576,
+      "learning_rate": 1.2148760330578511e-05,
+      "loss": 0.3922,
+      "step": 1171
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.29828086495399475,
+      "learning_rate": 1.1900826446280991e-05,
+      "loss": 0.3484,
+      "step": 1172
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.3248095214366913,
+      "learning_rate": 1.165289256198347e-05,
+      "loss": 0.4166,
+      "step": 1173
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.3183375895023346,
+      "learning_rate": 1.1404958677685948e-05,
+      "loss": 0.4207,
+      "step": 1174
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.38209760189056396,
+      "learning_rate": 1.1157024793388429e-05,
+      "loss": 0.4136,
+      "step": 1175
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.31191781163215637,
+      "learning_rate": 1.0909090909090907e-05,
+      "loss": 0.3821,
+      "step": 1176
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.3147072494029999,
+      "learning_rate": 1.0661157024793387e-05,
+      "loss": 0.2973,
+      "step": 1177
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.346629798412323,
+      "learning_rate": 1.0413223140495866e-05,
+      "loss": 0.5924,
+      "step": 1178
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.30329591035842896,
+      "learning_rate": 1.0165289256198345e-05,
+      "loss": 0.4802,
+      "step": 1179
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.3608144521713257,
+      "learning_rate": 9.917355371900826e-06,
+      "loss": 0.4187,
+      "step": 1180
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.3330174684524536,
+      "learning_rate": 9.669421487603305e-06,
+      "loss": 0.4585,
+      "step": 1181
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.2880091071128845,
+      "learning_rate": 9.421487603305785e-06,
+      "loss": 0.3926,
+      "step": 1182
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.2711026668548584,
+      "learning_rate": 9.173553719008264e-06,
+      "loss": 0.3128,
+      "step": 1183
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.3472573161125183,
+      "learning_rate": 8.925619834710742e-06,
+      "loss": 0.3626,
+      "step": 1184
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.29903772473335266,
+      "learning_rate": 8.677685950413222e-06,
+      "loss": 0.3778,
+      "step": 1185
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.309654176235199,
+      "learning_rate": 8.429752066115703e-06,
+      "loss": 0.3965,
+      "step": 1186
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.3163444399833679,
+      "learning_rate": 8.181818181818181e-06,
+      "loss": 0.3207,
+      "step": 1187
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.3754628300666809,
+      "learning_rate": 7.93388429752066e-06,
+      "loss": 0.3954,
+      "step": 1188
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.2967177629470825,
+      "learning_rate": 7.68595041322314e-06,
+      "loss": 0.4092,
+      "step": 1189
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.37930914759635925,
+      "learning_rate": 7.438016528925619e-06,
+      "loss": 0.5038,
+      "step": 1190
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.31978312134742737,
+      "learning_rate": 7.190082644628099e-06,
+      "loss": 0.3039,
+      "step": 1191
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.34556475281715393,
+      "learning_rate": 6.942148760330577e-06,
+      "loss": 0.3749,
+      "step": 1192
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.33958449959754944,
+      "learning_rate": 6.694214876033057e-06,
+      "loss": 0.4974,
+      "step": 1193
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.34213709831237793,
+      "learning_rate": 6.446280991735537e-06,
+      "loss": 0.4874,
+      "step": 1194
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.3194979131221771,
+      "learning_rate": 6.198347107438016e-06,
+      "loss": 0.4415,
+      "step": 1195
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.3170003890991211,
+      "learning_rate": 5.950413223140496e-06,
+      "loss": 0.299,
+      "step": 1196
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.35796797275543213,
+      "learning_rate": 5.702479338842974e-06,
+      "loss": 0.4516,
+      "step": 1197
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.36410433053970337,
+      "learning_rate": 5.454545454545454e-06,
+      "loss": 0.3137,
+      "step": 1198
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.27563753724098206,
+      "learning_rate": 5.206611570247933e-06,
+      "loss": 0.3465,
+      "step": 1199
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.3430056869983673,
+      "learning_rate": 4.958677685950413e-06,
+      "loss": 0.5325,
+      "step": 1200
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.3032241463661194,
+      "learning_rate": 4.710743801652893e-06,
+      "loss": 0.3802,
+      "step": 1201
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.3008878231048584,
+      "learning_rate": 4.462809917355371e-06,
+      "loss": 0.3674,
+      "step": 1202
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.34465453028678894,
+      "learning_rate": 4.214876033057851e-06,
+      "loss": 0.3465,
+      "step": 1203
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.3217530846595764,
+      "learning_rate": 3.96694214876033e-06,
+      "loss": 0.395,
+      "step": 1204
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.3256390690803528,
+      "learning_rate": 3.7190082644628097e-06,
+      "loss": 0.2928,
+      "step": 1205
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.404376357793808,
+      "learning_rate": 3.4710743801652887e-06,
+      "loss": 0.5579,
+      "step": 1206
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.2786218822002411,
+      "learning_rate": 3.2231404958677685e-06,
+      "loss": 0.3842,
+      "step": 1207
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.339501291513443,
+      "learning_rate": 2.975206611570248e-06,
+      "loss": 0.4061,
+      "step": 1208
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.3386409878730774,
+      "learning_rate": 2.727272727272727e-06,
+      "loss": 0.3452,
+      "step": 1209
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.36449265480041504,
+      "learning_rate": 2.4793388429752066e-06,
+      "loss": 0.3769,
+      "step": 1210
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.3336932361125946,
+      "learning_rate": 2.2314049586776856e-06,
+      "loss": 0.4361,
+      "step": 1211
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.28075236082077026,
+      "learning_rate": 1.983471074380165e-06,
+      "loss": 0.3614,
+      "step": 1212
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.31337854266166687,
+      "learning_rate": 1.7355371900826443e-06,
+      "loss": 0.37,
+      "step": 1213
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.3034374415874481,
+      "learning_rate": 1.487603305785124e-06,
+      "loss": 0.274,
+      "step": 1214
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.3485061526298523,
+      "learning_rate": 1.2396694214876033e-06,
+      "loss": 0.425,
+      "step": 1215
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.24720066785812378,
+      "learning_rate": 9.917355371900825e-07,
+      "loss": 0.307,
+      "step": 1216
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.2727121412754059,
+      "learning_rate": 7.43801652892562e-07,
+      "loss": 0.2991,
+      "step": 1217
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.33211690187454224,
+      "learning_rate": 4.958677685950412e-07,
+      "loss": 0.5309,
+      "step": 1218
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.328895628452301,
+      "learning_rate": 2.479338842975206e-07,
+      "loss": 0.3547,
+      "step": 1219
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.2642543315887451,
+      "learning_rate": 0.0,
+      "loss": 0.3047,
+      "step": 1220
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1220,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 610,
+  "total_flos": 2.626577866972938e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}