diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7021 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3091190108191654,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0003091190108191654,
+      "grad_norm": 0.17105351388454437,
+      "learning_rate": 2e-05,
+      "loss": 1.589,
+      "step": 1
+    },
+    {
+      "epoch": 0.0006182380216383308,
+      "grad_norm": 0.13888764381408691,
+      "learning_rate": 4e-05,
+      "loss": 1.6333,
+      "step": 2
+    },
+    {
+      "epoch": 0.0009273570324574962,
+      "grad_norm": 0.13389942049980164,
+      "learning_rate": 6e-05,
+      "loss": 1.6075,
+      "step": 3
+    },
+    {
+      "epoch": 0.0012364760432766616,
+      "grad_norm": 0.1443634182214737,
+      "learning_rate": 8e-05,
+      "loss": 1.3981,
+      "step": 4
+    },
+    {
+      "epoch": 0.0015455950540958269,
+      "grad_norm": 0.2410346418619156,
+      "learning_rate": 0.0001,
+      "loss": 1.6522,
+      "step": 5
+    },
+    {
+      "epoch": 0.0018547140649149924,
+      "grad_norm": 0.23892079293727875,
+      "learning_rate": 0.00012,
+      "loss": 1.3345,
+      "step": 6
+    },
+    {
+      "epoch": 0.0021638330757341576,
+      "grad_norm": 0.32107171416282654,
+      "learning_rate": 0.00014,
+      "loss": 1.4086,
+      "step": 7
+    },
+    {
+      "epoch": 0.002472952086553323,
+      "grad_norm": 0.38699325919151306,
+      "learning_rate": 0.00016,
+      "loss": 1.2824,
+      "step": 8
+    },
+    {
+      "epoch": 0.0027820710973724882,
+      "grad_norm": 0.2972716987133026,
+      "learning_rate": 0.00018,
+      "loss": 1.3528,
+      "step": 9
+    },
+    {
+      "epoch": 0.0030911901081916537,
+      "grad_norm": 0.288402795791626,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 10
+    },
+    {
+      "epoch": 0.0034003091190108192,
+      "grad_norm": 0.4650692641735077,
+      "learning_rate": 0.00019993798449612405,
+      "loss": 1.0995,
+      "step": 11
+    },
+    {
+      "epoch": 0.0037094281298299847,
+      "grad_norm": 0.39375749230384827,
+      "learning_rate": 0.0001998759689922481,
+      "loss": 0.9978,
+      "step": 12
+    },
+    {
+      "epoch": 0.00401854714064915,
+      "grad_norm": 0.3362458348274231,
+      "learning_rate": 0.0001998139534883721,
+      "loss": 1.0522,
+      "step": 13
+    },
+    {
+      "epoch": 0.004327666151468315,
+      "grad_norm": 0.2642221450805664,
+      "learning_rate": 0.00019975193798449614,
+      "loss": 0.9661,
+      "step": 14
+    },
+    {
+      "epoch": 0.00463678516228748,
+      "grad_norm": 0.3542484939098358,
+      "learning_rate": 0.00019968992248062018,
+      "loss": 0.8814,
+      "step": 15
+    },
+    {
+      "epoch": 0.004945904173106646,
+      "grad_norm": 0.1401689201593399,
+      "learning_rate": 0.00019962790697674421,
+      "loss": 0.8629,
+      "step": 16
+    },
+    {
+      "epoch": 0.005255023183925811,
+      "grad_norm": 0.20015761256217957,
+      "learning_rate": 0.00019956589147286823,
+      "loss": 0.8454,
+      "step": 17
+    },
+    {
+      "epoch": 0.0055641421947449764,
+      "grad_norm": 0.1540534645318985,
+      "learning_rate": 0.00019950387596899224,
+      "loss": 0.8852,
+      "step": 18
+    },
+    {
+      "epoch": 0.005873261205564142,
+      "grad_norm": 0.15904690325260162,
+      "learning_rate": 0.00019944186046511628,
+      "loss": 0.7254,
+      "step": 19
+    },
+    {
+      "epoch": 0.0061823802163833074,
+      "grad_norm": 0.21907807886600494,
+      "learning_rate": 0.00019937984496124032,
+      "loss": 1.0708,
+      "step": 20
+    },
+    {
+      "epoch": 0.006491499227202473,
+      "grad_norm": 0.14591765403747559,
+      "learning_rate": 0.00019931782945736435,
+      "loss": 0.8555,
+      "step": 21
+    },
+    {
+      "epoch": 0.0068006182380216385,
+      "grad_norm": 0.16298744082450867,
+      "learning_rate": 0.00019925581395348837,
+      "loss": 0.8262,
+      "step": 22
+    },
+    {
+      "epoch": 0.0071097372488408035,
+      "grad_norm": 0.14358466863632202,
+      "learning_rate": 0.0001991937984496124,
+      "loss": 0.8744,
+      "step": 23
+    },
+    {
+      "epoch": 0.0074188562596599695,
+      "grad_norm": 0.149592325091362,
+      "learning_rate": 0.00019913178294573644,
+      "loss": 0.9376,
+      "step": 24
+    },
+    {
+      "epoch": 0.0077279752704791345,
+      "grad_norm": 0.13760673999786377,
+      "learning_rate": 0.00019906976744186048,
+      "loss": 0.8854,
+      "step": 25
+    },
+    {
+      "epoch": 0.0080370942812983,
+      "grad_norm": 0.19108699262142181,
+      "learning_rate": 0.00019900775193798452,
+      "loss": 0.885,
+      "step": 26
+    },
+    {
+      "epoch": 0.008346213292117466,
+      "grad_norm": 0.1892910748720169,
+      "learning_rate": 0.00019894573643410853,
+      "loss": 0.9911,
+      "step": 27
+    },
+    {
+      "epoch": 0.00865533230293663,
+      "grad_norm": 0.13371500372886658,
+      "learning_rate": 0.00019888372093023257,
+      "loss": 0.8475,
+      "step": 28
+    },
+    {
+      "epoch": 0.008964451313755796,
+      "grad_norm": 0.1226775124669075,
+      "learning_rate": 0.0001988217054263566,
+      "loss": 0.7733,
+      "step": 29
+    },
+    {
+      "epoch": 0.00927357032457496,
+      "grad_norm": 0.13002170622348785,
+      "learning_rate": 0.00019875968992248062,
+      "loss": 0.8008,
+      "step": 30
+    },
+    {
+      "epoch": 0.009582689335394128,
+      "grad_norm": 0.13575126230716705,
+      "learning_rate": 0.00019869767441860466,
+      "loss": 0.8841,
+      "step": 31
+    },
+    {
+      "epoch": 0.009891808346213293,
+      "grad_norm": 0.15838854014873505,
+      "learning_rate": 0.00019863565891472867,
+      "loss": 0.8856,
+      "step": 32
+    },
+    {
+      "epoch": 0.010200927357032458,
+      "grad_norm": 0.12744970619678497,
+      "learning_rate": 0.0001985736434108527,
+      "loss": 0.8327,
+      "step": 33
+    },
+    {
+      "epoch": 0.010510046367851623,
+      "grad_norm": 0.16277430951595306,
+      "learning_rate": 0.00019851162790697675,
+      "loss": 0.9559,
+      "step": 34
+    },
+    {
+      "epoch": 0.010819165378670788,
+      "grad_norm": 0.11398226767778397,
+      "learning_rate": 0.0001984496124031008,
+      "loss": 0.8987,
+      "step": 35
+    },
+    {
+      "epoch": 0.011128284389489953,
+      "grad_norm": 0.15259447693824768,
+      "learning_rate": 0.00019838759689922483,
+      "loss": 0.916,
+      "step": 36
+    },
+    {
+      "epoch": 0.01143740340030912,
+      "grad_norm": 0.16493409872055054,
+      "learning_rate": 0.00019832558139534884,
+      "loss": 0.7795,
+      "step": 37
+    },
+    {
+      "epoch": 0.011746522411128285,
+      "grad_norm": 0.14143070578575134,
+      "learning_rate": 0.00019826356589147288,
+      "loss": 0.814,
+      "step": 38
+    },
+    {
+      "epoch": 0.01205564142194745,
+      "grad_norm": 0.11884719133377075,
+      "learning_rate": 0.00019820155038759692,
+      "loss": 0.7559,
+      "step": 39
+    },
+    {
+      "epoch": 0.012364760432766615,
+      "grad_norm": 0.12909553945064545,
+      "learning_rate": 0.00019813953488372096,
+      "loss": 0.9106,
+      "step": 40
+    },
+    {
+      "epoch": 0.01267387944358578,
+      "grad_norm": 0.11181219667196274,
+      "learning_rate": 0.00019807751937984497,
+      "loss": 0.7127,
+      "step": 41
+    },
+    {
+      "epoch": 0.012982998454404947,
+      "grad_norm": 0.16188634932041168,
+      "learning_rate": 0.000198015503875969,
+      "loss": 0.856,
+      "step": 42
+    },
+    {
+      "epoch": 0.013292117465224112,
+      "grad_norm": 0.1277618706226349,
+      "learning_rate": 0.00019795348837209304,
+      "loss": 0.8901,
+      "step": 43
+    },
+    {
+      "epoch": 0.013601236476043277,
+      "grad_norm": 0.13743072748184204,
+      "learning_rate": 0.00019789147286821706,
+      "loss": 0.9266,
+      "step": 44
+    },
+    {
+      "epoch": 0.013910355486862442,
+      "grad_norm": 0.12885789573192596,
+      "learning_rate": 0.0001978294573643411,
+      "loss": 0.8982,
+      "step": 45
+    },
+    {
+      "epoch": 0.014219474497681607,
+      "grad_norm": 0.14296455681324005,
+      "learning_rate": 0.0001977674418604651,
+      "loss": 1.0045,
+      "step": 46
+    },
+    {
+      "epoch": 0.014528593508500772,
+      "grad_norm": 0.13536542654037476,
+      "learning_rate": 0.00019770542635658915,
+      "loss": 0.8648,
+      "step": 47
+    },
+    {
+      "epoch": 0.014837712519319939,
+      "grad_norm": 0.1285800188779831,
+      "learning_rate": 0.00019764341085271318,
+      "loss": 0.8113,
+      "step": 48
+    },
+    {
+      "epoch": 0.015146831530139104,
+      "grad_norm": 0.1538587212562561,
+      "learning_rate": 0.00019758139534883722,
+      "loss": 0.8465,
+      "step": 49
+    },
+    {
+      "epoch": 0.015455950540958269,
+      "grad_norm": 0.11420200765132904,
+      "learning_rate": 0.00019751937984496126,
+      "loss": 0.8647,
+      "step": 50
+    },
+    {
+      "epoch": 0.015765069551777436,
+      "grad_norm": 0.13382850587368011,
+      "learning_rate": 0.00019745736434108527,
+      "loss": 0.9545,
+      "step": 51
+    },
+    {
+      "epoch": 0.0160741885625966,
+      "grad_norm": 0.11594673991203308,
+      "learning_rate": 0.0001973953488372093,
+      "loss": 0.8599,
+      "step": 52
+    },
+    {
+      "epoch": 0.016383307573415766,
+      "grad_norm": 0.119788758456707,
+      "learning_rate": 0.00019733333333333335,
+      "loss": 0.8394,
+      "step": 53
+    },
+    {
+      "epoch": 0.01669242658423493,
+      "grad_norm": 0.1150812953710556,
+      "learning_rate": 0.0001972713178294574,
+      "loss": 0.8727,
+      "step": 54
+    },
+    {
+      "epoch": 0.017001545595054096,
+      "grad_norm": 0.1359858363866806,
+      "learning_rate": 0.0001972093023255814,
+      "loss": 0.76,
+      "step": 55
+    },
+    {
+      "epoch": 0.01731066460587326,
+      "grad_norm": 0.15345649421215057,
+      "learning_rate": 0.00019714728682170544,
+      "loss": 0.7292,
+      "step": 56
+    },
+    {
+      "epoch": 0.017619783616692426,
+      "grad_norm": 0.14331281185150146,
+      "learning_rate": 0.00019708527131782945,
+      "loss": 0.8034,
+      "step": 57
+    },
+    {
+      "epoch": 0.01792890262751159,
+      "grad_norm": 0.16820766031742096,
+      "learning_rate": 0.0001970232558139535,
+      "loss": 0.7534,
+      "step": 58
+    },
+    {
+      "epoch": 0.018238021638330756,
+      "grad_norm": 0.16281287372112274,
+      "learning_rate": 0.00019696124031007753,
+      "loss": 0.8409,
+      "step": 59
+    },
+    {
+      "epoch": 0.01854714064914992,
+      "grad_norm": 0.16938892006874084,
+      "learning_rate": 0.00019689922480620157,
+      "loss": 0.8786,
+      "step": 60
+    },
+    {
+      "epoch": 0.018856259659969087,
+      "grad_norm": 0.13455645740032196,
+      "learning_rate": 0.00019683720930232558,
+      "loss": 0.7462,
+      "step": 61
+    },
+    {
+      "epoch": 0.019165378670788255,
+      "grad_norm": 0.12973853945732117,
+      "learning_rate": 0.00019677519379844962,
+      "loss": 0.8304,
+      "step": 62
+    },
+    {
+      "epoch": 0.01947449768160742,
+      "grad_norm": 0.158578023314476,
+      "learning_rate": 0.00019671317829457366,
+      "loss": 0.8244,
+      "step": 63
+    },
+    {
+      "epoch": 0.019783616692426585,
+      "grad_norm": 0.125227153301239,
+      "learning_rate": 0.0001966511627906977,
+      "loss": 0.8637,
+      "step": 64
+    },
+    {
+      "epoch": 0.02009273570324575,
+      "grad_norm": 0.1529238075017929,
+      "learning_rate": 0.0001965891472868217,
+      "loss": 0.8271,
+      "step": 65
+    },
+    {
+      "epoch": 0.020401854714064915,
+      "grad_norm": 0.1335589438676834,
+      "learning_rate": 0.00019652713178294575,
+      "loss": 0.8129,
+      "step": 66
+    },
+    {
+      "epoch": 0.02071097372488408,
+      "grad_norm": 0.13944782316684723,
+      "learning_rate": 0.00019646511627906978,
+      "loss": 0.887,
+      "step": 67
+    },
+    {
+      "epoch": 0.021020092735703245,
+      "grad_norm": 0.12453600019216537,
+      "learning_rate": 0.00019640310077519382,
+      "loss": 0.8369,
+      "step": 68
+    },
+    {
+      "epoch": 0.02132921174652241,
+      "grad_norm": 0.11443863809108734,
+      "learning_rate": 0.00019634108527131786,
+      "loss": 0.8769,
+      "step": 69
+    },
+    {
+      "epoch": 0.021638330757341576,
+      "grad_norm": 0.1325102150440216,
+      "learning_rate": 0.00019627906976744185,
+      "loss": 0.7528,
+      "step": 70
+    },
+    {
+      "epoch": 0.02194744976816074,
+      "grad_norm": 0.13488665223121643,
+      "learning_rate": 0.00019621705426356589,
+      "loss": 0.876,
+      "step": 71
+    },
+    {
+      "epoch": 0.022256568778979906,
+      "grad_norm": 0.15174520015716553,
+      "learning_rate": 0.00019615503875968992,
+      "loss": 0.8219,
+      "step": 72
+    },
+    {
+      "epoch": 0.022565687789799074,
+      "grad_norm": 0.13083337247371674,
+      "learning_rate": 0.00019609302325581396,
+      "loss": 0.8696,
+      "step": 73
+    },
+    {
+      "epoch": 0.02287480680061824,
+      "grad_norm": 0.13707856833934784,
+      "learning_rate": 0.000196031007751938,
+      "loss": 0.9781,
+      "step": 74
+    },
+    {
+      "epoch": 0.023183925811437404,
+      "grad_norm": 0.14287059009075165,
+      "learning_rate": 0.000195968992248062,
+      "loss": 0.8975,
+      "step": 75
+    },
+    {
+      "epoch": 0.02349304482225657,
+      "grad_norm": 0.14259910583496094,
+      "learning_rate": 0.00019590697674418605,
+      "loss": 0.781,
+      "step": 76
+    },
+    {
+      "epoch": 0.023802163833075735,
+      "grad_norm": 0.17812331020832062,
+      "learning_rate": 0.0001958449612403101,
+      "loss": 0.8664,
+      "step": 77
+    },
+    {
+      "epoch": 0.0241112828438949,
+      "grad_norm": 0.10900291800498962,
+      "learning_rate": 0.00019578294573643413,
+      "loss": 0.8155,
+      "step": 78
+    },
+    {
+      "epoch": 0.024420401854714065,
+      "grad_norm": 0.1299259066581726,
+      "learning_rate": 0.00019572093023255814,
+      "loss": 0.8878,
+      "step": 79
+    },
+    {
+      "epoch": 0.02472952086553323,
+      "grad_norm": 0.1341174691915512,
+      "learning_rate": 0.00019565891472868218,
+      "loss": 0.855,
+      "step": 80
+    },
+    {
+      "epoch": 0.025038639876352395,
+      "grad_norm": 0.11747386306524277,
+      "learning_rate": 0.00019559689922480622,
+      "loss": 0.8901,
+      "step": 81
+    },
+    {
+      "epoch": 0.02534775888717156,
+      "grad_norm": 0.12569762766361237,
+      "learning_rate": 0.00019553488372093026,
+      "loss": 0.8644,
+      "step": 82
+    },
+    {
+      "epoch": 0.025656877897990725,
+      "grad_norm": 0.11595705896615982,
+      "learning_rate": 0.0001954728682170543,
+      "loss": 0.7919,
+      "step": 83
+    },
+    {
+      "epoch": 0.025965996908809894,
+      "grad_norm": 0.15013526380062103,
+      "learning_rate": 0.0001954108527131783,
+      "loss": 0.7987,
+      "step": 84
+    },
+    {
+      "epoch": 0.02627511591962906,
+      "grad_norm": 0.13101589679718018,
+      "learning_rate": 0.00019534883720930232,
+      "loss": 0.7824,
+      "step": 85
+    },
+    {
+      "epoch": 0.026584234930448224,
+      "grad_norm": 0.12921208143234253,
+      "learning_rate": 0.00019528682170542636,
+      "loss": 0.9077,
+      "step": 86
+    },
+    {
+      "epoch": 0.02689335394126739,
+      "grad_norm": 0.18682463467121124,
+      "learning_rate": 0.0001952248062015504,
+      "loss": 0.8614,
+      "step": 87
+    },
+    {
+      "epoch": 0.027202472952086554,
+      "grad_norm": 0.15416069328784943,
+      "learning_rate": 0.00019516279069767444,
+      "loss": 0.8038,
+      "step": 88
+    },
+    {
+      "epoch": 0.02751159196290572,
+      "grad_norm": 0.13872137665748596,
+      "learning_rate": 0.00019510077519379845,
+      "loss": 0.9001,
+      "step": 89
+    },
+    {
+      "epoch": 0.027820710973724884,
+      "grad_norm": 0.1256810575723648,
+      "learning_rate": 0.00019503875968992249,
+      "loss": 0.8332,
+      "step": 90
+    },
+    {
+      "epoch": 0.02812982998454405,
+      "grad_norm": 0.15000316500663757,
+      "learning_rate": 0.00019497674418604652,
+      "loss": 0.8345,
+      "step": 91
+    },
+    {
+      "epoch": 0.028438948995363214,
+      "grad_norm": 0.11536971479654312,
+      "learning_rate": 0.00019491472868217056,
+      "loss": 0.7702,
+      "step": 92
+    },
+    {
+      "epoch": 0.02874806800618238,
+      "grad_norm": 0.11627457290887833,
+      "learning_rate": 0.00019485271317829457,
+      "loss": 0.8944,
+      "step": 93
+    },
+    {
+      "epoch": 0.029057187017001544,
+      "grad_norm": 0.12913382053375244,
+      "learning_rate": 0.00019479069767441861,
+      "loss": 0.8054,
+      "step": 94
+    },
+    {
+      "epoch": 0.02936630602782071,
+      "grad_norm": 0.14983727037906647,
+      "learning_rate": 0.00019472868217054265,
+      "loss": 0.7897,
+      "step": 95
+    },
+    {
+      "epoch": 0.029675425038639878,
+      "grad_norm": 0.1396576315164566,
+      "learning_rate": 0.0001946666666666667,
+      "loss": 0.7595,
+      "step": 96
+    },
+    {
+      "epoch": 0.029984544049459043,
+      "grad_norm": 0.1093367412686348,
+      "learning_rate": 0.0001946046511627907,
+      "loss": 0.878,
+      "step": 97
+    },
+    {
+      "epoch": 0.030293663060278208,
+      "grad_norm": 0.14023703336715698,
+      "learning_rate": 0.00019454263565891474,
+      "loss": 0.8051,
+      "step": 98
+    },
+    {
+      "epoch": 0.030602782071097373,
+      "grad_norm": 0.11650537699460983,
+      "learning_rate": 0.00019448062015503875,
+      "loss": 0.7733,
+      "step": 99
+    },
+    {
+      "epoch": 0.030911901081916538,
+      "grad_norm": 0.13736988604068756,
+      "learning_rate": 0.0001944186046511628,
+      "loss": 0.8414,
+      "step": 100
+    },
+    {
+      "epoch": 0.031221020092735703,
+      "grad_norm": 0.13810019195079803,
+      "learning_rate": 0.00019435658914728683,
+      "loss": 0.9192,
+      "step": 101
+    },
+    {
+      "epoch": 0.03153013910355487,
+      "grad_norm": 0.16648177802562714,
+      "learning_rate": 0.00019429457364341087,
+      "loss": 0.8692,
+      "step": 102
+    },
+    {
+      "epoch": 0.03183925811437403,
+      "grad_norm": 0.16531941294670105,
+      "learning_rate": 0.00019423255813953488,
+      "loss": 0.8585,
+      "step": 103
+    },
+    {
+      "epoch": 0.0321483771251932,
+      "grad_norm": 0.12364251166582108,
+      "learning_rate": 0.00019417054263565892,
+      "loss": 0.7652,
+      "step": 104
+    },
+    {
+      "epoch": 0.03245749613601236,
+      "grad_norm": 0.139155313372612,
+      "learning_rate": 0.00019410852713178296,
+      "loss": 0.7448,
+      "step": 105
+    },
+    {
+      "epoch": 0.03276661514683153,
+      "grad_norm": 0.11827906966209412,
+      "learning_rate": 0.000194046511627907,
+      "loss": 0.9182,
+      "step": 106
+    },
+    {
+      "epoch": 0.033075734157650694,
+      "grad_norm": 0.1247883066534996,
+      "learning_rate": 0.00019398449612403104,
+      "loss": 0.8138,
+      "step": 107
+    },
+    {
+      "epoch": 0.03338485316846986,
+      "grad_norm": 0.12576410174369812,
+      "learning_rate": 0.00019392248062015505,
+      "loss": 0.803,
+      "step": 108
+    },
+    {
+      "epoch": 0.033693972179289024,
+      "grad_norm": 0.12698566913604736,
+      "learning_rate": 0.00019386046511627909,
+      "loss": 0.8579,
+      "step": 109
+    },
+    {
+      "epoch": 0.03400309119010819,
+      "grad_norm": 0.10796654969453812,
+      "learning_rate": 0.0001937984496124031,
+      "loss": 0.812,
+      "step": 110
+    },
+    {
+      "epoch": 0.034312210200927354,
+      "grad_norm": 0.12361832708120346,
+      "learning_rate": 0.00019373643410852714,
+      "loss": 0.8606,
+      "step": 111
+    },
+    {
+      "epoch": 0.03462132921174652,
+      "grad_norm": 0.12853065133094788,
+      "learning_rate": 0.00019367441860465118,
+      "loss": 0.9208,
+      "step": 112
+    },
+    {
+      "epoch": 0.03493044822256569,
+      "grad_norm": 0.119226835668087,
+      "learning_rate": 0.0001936124031007752,
+      "loss": 0.7306,
+      "step": 113
+    },
+    {
+      "epoch": 0.03523956723338485,
+      "grad_norm": 0.12476561963558197,
+      "learning_rate": 0.00019355038759689923,
+      "loss": 0.8046,
+      "step": 114
+    },
+    {
+      "epoch": 0.03554868624420402,
+      "grad_norm": 0.11674510687589645,
+      "learning_rate": 0.00019348837209302326,
+      "loss": 0.8214,
+      "step": 115
+    },
+    {
+      "epoch": 0.03585780525502318,
+      "grad_norm": 0.1358969360589981,
+      "learning_rate": 0.0001934263565891473,
+      "loss": 0.7432,
+      "step": 116
+    },
+    {
+      "epoch": 0.03616692426584235,
+      "grad_norm": 0.1318214237689972,
+      "learning_rate": 0.00019336434108527132,
+      "loss": 0.9497,
+      "step": 117
+    },
+    {
+      "epoch": 0.03647604327666151,
+      "grad_norm": 0.1427808552980423,
+      "learning_rate": 0.00019330232558139535,
+      "loss": 0.8012,
+      "step": 118
+    },
+    {
+      "epoch": 0.03678516228748068,
+      "grad_norm": 0.14754672348499298,
+      "learning_rate": 0.0001932403100775194,
+      "loss": 0.8112,
+      "step": 119
+    },
+    {
+      "epoch": 0.03709428129829984,
+      "grad_norm": 0.13510632514953613,
+      "learning_rate": 0.00019317829457364343,
+      "loss": 0.8607,
+      "step": 120
+    },
+    {
+      "epoch": 0.03740340030911901,
+      "grad_norm": 0.11763066798448563,
+      "learning_rate": 0.00019311627906976747,
+      "loss": 0.8349,
+      "step": 121
+    },
+    {
+      "epoch": 0.03771251931993817,
+      "grad_norm": 0.13032180070877075,
+      "learning_rate": 0.00019305426356589148,
+      "loss": 0.8847,
+      "step": 122
+    },
+    {
+      "epoch": 0.03802163833075734,
+      "grad_norm": 0.11119523644447327,
+      "learning_rate": 0.00019299224806201552,
+      "loss": 0.8302,
+      "step": 123
+    },
+    {
+      "epoch": 0.03833075734157651,
+      "grad_norm": 0.1144416555762291,
+      "learning_rate": 0.00019293023255813953,
+      "loss": 0.7984,
+      "step": 124
+    },
+    {
+      "epoch": 0.03863987635239567,
+      "grad_norm": 0.12676909565925598,
+      "learning_rate": 0.00019286821705426357,
+      "loss": 0.8186,
+      "step": 125
+    },
+    {
+      "epoch": 0.03894899536321484,
+      "grad_norm": 0.10949283838272095,
+      "learning_rate": 0.0001928062015503876,
+      "loss": 0.7339,
+      "step": 126
+    },
+    {
+      "epoch": 0.039258114374034,
+      "grad_norm": 0.11983365565538406,
+      "learning_rate": 0.00019274418604651162,
+      "loss": 0.7616,
+      "step": 127
+    },
+    {
+      "epoch": 0.03956723338485317,
+      "grad_norm": 0.1309802085161209,
+      "learning_rate": 0.00019268217054263566,
+      "loss": 0.7917,
+      "step": 128
+    },
+    {
+      "epoch": 0.03987635239567233,
+      "grad_norm": 0.1349460780620575,
+      "learning_rate": 0.0001926201550387597,
+      "loss": 0.8753,
+      "step": 129
+    },
+    {
+      "epoch": 0.0401854714064915,
+      "grad_norm": 0.12506724894046783,
+      "learning_rate": 0.00019255813953488374,
+      "loss": 0.7819,
+      "step": 130
+    },
+    {
+      "epoch": 0.04049459041731066,
+      "grad_norm": 0.13243618607521057,
+      "learning_rate": 0.00019249612403100778,
+      "loss": 0.8049,
+      "step": 131
+    },
+    {
+      "epoch": 0.04080370942812983,
+      "grad_norm": 0.14795252680778503,
+      "learning_rate": 0.0001924341085271318,
+      "loss": 0.7969,
+      "step": 132
+    },
+    {
+      "epoch": 0.04111282843894899,
+      "grad_norm": 0.15747897326946259,
+      "learning_rate": 0.00019237209302325583,
+      "loss": 0.8746,
+      "step": 133
+    },
+    {
+      "epoch": 0.04142194744976816,
+      "grad_norm": 0.15109744668006897,
+      "learning_rate": 0.00019231007751937987,
+      "loss": 0.8562,
+      "step": 134
+    },
+    {
+      "epoch": 0.04173106646058733,
+      "grad_norm": 0.13535654544830322,
+      "learning_rate": 0.0001922480620155039,
+      "loss": 0.8928,
+      "step": 135
+    },
+    {
+      "epoch": 0.04204018547140649,
+      "grad_norm": 0.1262591928243637,
+      "learning_rate": 0.00019218604651162792,
+      "loss": 0.8302,
+      "step": 136
+    },
+    {
+      "epoch": 0.04234930448222566,
+      "grad_norm": 0.11443354189395905,
+      "learning_rate": 0.00019212403100775193,
+      "loss": 0.8672,
+      "step": 137
+    },
+    {
+      "epoch": 0.04265842349304482,
+      "grad_norm": 0.11836638301610947,
+      "learning_rate": 0.00019206201550387597,
+      "loss": 0.7593,
+      "step": 138
+    },
+    {
+      "epoch": 0.04296754250386399,
+      "grad_norm": 0.12662746012210846,
+      "learning_rate": 0.000192,
+      "loss": 0.7531,
+      "step": 139
+    },
+    {
+      "epoch": 0.04327666151468315,
+      "grad_norm": 0.12387800961732864,
+      "learning_rate": 0.00019193798449612404,
+      "loss": 0.7422,
+      "step": 140
+    },
+    {
+      "epoch": 0.04358578052550232,
+      "grad_norm": 0.12786395847797394,
+      "learning_rate": 0.00019187596899224806,
+      "loss": 0.744,
+      "step": 141
+    },
+    {
+      "epoch": 0.04389489953632148,
+      "grad_norm": 0.12761859595775604,
+      "learning_rate": 0.0001918139534883721,
+      "loss": 0.8977,
+      "step": 142
+    },
+    {
+      "epoch": 0.04420401854714065,
+      "grad_norm": 0.10713964700698853,
+      "learning_rate": 0.00019175193798449613,
+      "loss": 0.7677,
+      "step": 143
+    },
+    {
+      "epoch": 0.04451313755795981,
+      "grad_norm": 0.13007132709026337,
+      "learning_rate": 0.00019168992248062017,
+      "loss": 0.7516,
+      "step": 144
+    },
+    {
+      "epoch": 0.04482225656877898,
+      "grad_norm": 0.12673480808734894,
+      "learning_rate": 0.0001916279069767442,
+      "loss": 0.8469,
+      "step": 145
+    },
+    {
+      "epoch": 0.04513137557959815,
+      "grad_norm": 0.14040741324424744,
+      "learning_rate": 0.00019156589147286822,
+      "loss": 0.8119,
+      "step": 146
+    },
+    {
+      "epoch": 0.04544049459041731,
+      "grad_norm": 0.1404358148574829,
+      "learning_rate": 0.00019150387596899226,
+      "loss": 0.8062,
+      "step": 147
+    },
+    {
+      "epoch": 0.04574961360123648,
+      "grad_norm": 0.140091672539711,
+      "learning_rate": 0.0001914418604651163,
+      "loss": 0.9796,
+      "step": 148
+    },
+    {
+      "epoch": 0.04605873261205564,
+      "grad_norm": 0.12712246179580688,
+      "learning_rate": 0.00019137984496124034,
+      "loss": 0.7021,
+      "step": 149
+    },
+    {
+      "epoch": 0.04636785162287481,
+      "grad_norm": 0.1542489379644394,
+      "learning_rate": 0.00019131782945736435,
+      "loss": 0.7141,
+      "step": 150
+    },
+    {
+      "epoch": 0.04667697063369397,
+      "grad_norm": 0.1310671865940094,
+      "learning_rate": 0.00019125581395348836,
+      "loss": 0.7513,
+      "step": 151
+    },
+    {
+      "epoch": 0.04698608964451314,
+      "grad_norm": 0.1205151230096817,
+      "learning_rate": 0.0001911937984496124,
+      "loss": 0.8243,
+      "step": 152
+    },
+    {
+      "epoch": 0.0472952086553323,
+      "grad_norm": 0.13522934913635254,
+      "learning_rate": 0.00019113178294573644,
+      "loss": 0.9441,
+      "step": 153
+    },
+    {
+      "epoch": 0.04760432766615147,
+      "grad_norm": 0.11995132267475128,
+      "learning_rate": 0.00019106976744186048,
+      "loss": 0.7032,
+      "step": 154
+    },
+    {
+      "epoch": 0.04791344667697063,
+      "grad_norm": 0.11310404539108276,
+      "learning_rate": 0.0001910077519379845,
+      "loss": 0.6913,
+      "step": 155
+    },
+    {
+      "epoch": 0.0482225656877898,
+      "grad_norm": 0.11462230980396271,
+      "learning_rate": 0.00019094573643410853,
+      "loss": 0.8314,
+      "step": 156
+    },
+    {
+      "epoch": 0.04853168469860897,
+      "grad_norm": 0.12306851893663406,
+      "learning_rate": 0.00019088372093023257,
+      "loss": 0.9122,
+      "step": 157
+    },
+    {
+      "epoch": 0.04884080370942813,
+      "grad_norm": 0.09559505432844162,
+      "learning_rate": 0.0001908217054263566,
+      "loss": 0.8177,
+      "step": 158
+    },
+    {
+      "epoch": 0.0491499227202473,
+      "grad_norm": 0.11616392433643341,
+      "learning_rate": 0.00019075968992248064,
+      "loss": 0.9223,
+      "step": 159
+    },
+    {
+      "epoch": 0.04945904173106646,
+      "grad_norm": 0.12350696325302124,
+      "learning_rate": 0.00019069767441860466,
+      "loss": 0.9004,
+      "step": 160
+    },
+    {
+      "epoch": 0.04976816074188563,
+      "grad_norm": 0.12747159600257874,
+      "learning_rate": 0.0001906356589147287,
+      "loss": 0.9126,
+      "step": 161
+    },
+    {
+      "epoch": 0.05007727975270479,
+      "grad_norm": 0.12991321086883545,
+      "learning_rate": 0.00019057364341085273,
+      "loss": 0.9113,
+      "step": 162
+    },
+    {
+      "epoch": 0.05038639876352396,
+      "grad_norm": 0.11218614876270294,
+      "learning_rate": 0.00019051162790697677,
+      "loss": 0.7117,
+      "step": 163
+    },
+    {
+      "epoch": 0.05069551777434312,
+      "grad_norm": 0.13032029569149017,
+      "learning_rate": 0.00019044961240310078,
+      "loss": 0.841,
+      "step": 164
+    },
+    {
+      "epoch": 0.05100463678516229,
+      "grad_norm": 0.1347358375787735,
+      "learning_rate": 0.0001903875968992248,
+      "loss": 0.8103,
+      "step": 165
+    },
+    {
+      "epoch": 0.05131375579598145,
+      "grad_norm": 0.11914915591478348,
+      "learning_rate": 0.00019032558139534883,
+      "loss": 0.8913,
+      "step": 166
+    },
+    {
+      "epoch": 0.05162287480680062,
+      "grad_norm": 0.15790300071239471,
+      "learning_rate": 0.00019026356589147287,
+      "loss": 0.801,
+      "step": 167
+    },
+    {
+      "epoch": 0.05193199381761979,
+      "grad_norm": 0.15204893052577972,
+      "learning_rate": 0.0001902015503875969,
+      "loss": 0.8085,
+      "step": 168
+    },
+    {
+      "epoch": 0.05224111282843895,
+      "grad_norm": 0.11781688779592514,
+      "learning_rate": 0.00019013953488372095,
+      "loss": 0.9159,
+      "step": 169
+    },
+    {
+      "epoch": 0.05255023183925812,
+      "grad_norm": 0.12103480845689774,
+      "learning_rate": 0.00019007751937984496,
+      "loss": 0.8221,
+      "step": 170
+    },
+    {
+      "epoch": 0.05285935085007728,
+      "grad_norm": 0.12477768957614899,
+      "learning_rate": 0.000190015503875969,
+      "loss": 0.7622,
+      "step": 171
+    },
+    {
+      "epoch": 0.05316846986089645,
+      "grad_norm": 0.11186879873275757,
+      "learning_rate": 0.00018995348837209304,
+      "loss": 0.7959,
+      "step": 172
+    },
+    {
+      "epoch": 0.05347758887171561,
+      "grad_norm": 0.12586474418640137,
+      "learning_rate": 0.00018989147286821708,
+      "loss": 0.6735,
+      "step": 173
+    },
+    {
+      "epoch": 0.05378670788253478,
+      "grad_norm": 0.11750543862581253,
+      "learning_rate": 0.0001898294573643411,
+      "loss": 0.8302,
+      "step": 174
+    },
+    {
+      "epoch": 0.05409582689335394,
+      "grad_norm": 0.1301109939813614,
+      "learning_rate": 0.00018976744186046513,
+      "loss": 0.9917,
+      "step": 175
+    },
+    {
+      "epoch": 0.05440494590417311,
+      "grad_norm": 0.1335320919752121,
+      "learning_rate": 0.00018970542635658917,
+      "loss": 0.8256,
+      "step": 176
+    },
+    {
+      "epoch": 0.05471406491499227,
+      "grad_norm": 0.12373456358909607,
+      "learning_rate": 0.00018964341085271318,
+      "loss": 0.8458,
+      "step": 177
+    },
+    {
+      "epoch": 0.05502318392581144,
+      "grad_norm": 0.1291348785161972,
+      "learning_rate": 0.00018958139534883722,
+      "loss": 0.7675,
+      "step": 178
+    },
+    {
+      "epoch": 0.0553323029366306,
+      "grad_norm": 0.12421860545873642,
+      "learning_rate": 0.00018951937984496123,
+      "loss": 0.8195,
+      "step": 179
+    },
+    {
+      "epoch": 0.05564142194744977,
+      "grad_norm": 0.1433798372745514,
+      "learning_rate": 0.00018945736434108527,
+      "loss": 0.7938,
+      "step": 180
+    },
+    {
+      "epoch": 0.055950540958268936,
+      "grad_norm": 0.15060195326805115,
+      "learning_rate": 0.0001893953488372093,
+      "loss": 0.7178,
+      "step": 181
+    },
+    {
+      "epoch": 0.0562596599690881,
+      "grad_norm": 0.13103605806827545,
+      "learning_rate": 0.00018933333333333335,
+      "loss": 0.8432,
+      "step": 182
+    },
+    {
+      "epoch": 0.05656877897990727,
+      "grad_norm": 0.1537558138370514,
+      "learning_rate": 0.00018927131782945738,
+      "loss": 0.81,
+      "step": 183
+    },
+    {
+      "epoch": 0.05687789799072643,
+      "grad_norm": 0.12011228501796722,
+      "learning_rate": 0.0001892093023255814,
+      "loss": 0.8906,
+      "step": 184
+    },
+    {
+      "epoch": 0.0571870170015456,
+      "grad_norm": 0.11913521587848663,
+      "learning_rate": 0.00018914728682170543,
+      "loss": 0.7819,
+      "step": 185
+    },
+    {
+      "epoch": 0.05749613601236476,
+      "grad_norm": 0.13771173357963562,
+      "learning_rate": 0.00018908527131782947,
+      "loss": 0.7775,
+      "step": 186
+    },
+    {
+      "epoch": 0.05780525502318393,
+      "grad_norm": 0.11831659823656082,
+      "learning_rate": 0.0001890232558139535,
+      "loss": 0.7461,
+      "step": 187
+    },
+    {
+      "epoch": 0.05811437403400309,
+      "grad_norm": 0.11049254238605499,
+      "learning_rate": 0.00018896124031007752,
+      "loss": 0.7784,
+      "step": 188
+    },
+    {
+      "epoch": 0.05842349304482226,
+      "grad_norm": 0.10577117651700974,
+      "learning_rate": 0.00018889922480620156,
+      "loss": 0.7211,
+      "step": 189
+    },
+    {
+      "epoch": 0.05873261205564142,
+      "grad_norm": 0.13082446157932281,
+      "learning_rate": 0.00018883720930232557,
+      "loss": 0.6854,
+      "step": 190
+    },
+    {
+      "epoch": 0.05904173106646059,
+      "grad_norm": 0.11105687916278839,
+      "learning_rate": 0.0001887751937984496,
+      "loss": 0.8418,
+      "step": 191
+    },
+    {
+      "epoch": 0.059350850077279756,
+      "grad_norm": 0.1412641704082489,
+      "learning_rate": 0.00018871317829457365,
+      "loss": 0.8293,
+      "step": 192
+    },
+    {
+      "epoch": 0.05965996908809892,
+      "grad_norm": 0.12148229032754898,
+      "learning_rate": 0.0001886511627906977,
+      "loss": 0.856,
+      "step": 193
+    },
+    {
+      "epoch": 0.059969088098918086,
+      "grad_norm": 0.12246838212013245,
+      "learning_rate": 0.0001885891472868217,
+      "loss": 0.9042,
+      "step": 194
+    },
+    {
+      "epoch": 0.06027820710973725,
+      "grad_norm": 0.12810048460960388,
+      "learning_rate": 0.00018852713178294574,
+      "loss": 0.7636,
+      "step": 195
+    },
+    {
+      "epoch": 0.060587326120556416,
+      "grad_norm": 0.12897038459777832,
+      "learning_rate": 0.00018846511627906978,
+      "loss": 0.8182,
+      "step": 196
+    },
+    {
+      "epoch": 0.06089644513137558,
+      "grad_norm": 0.13533450663089752,
+      "learning_rate": 0.00018840310077519382,
+      "loss": 0.7578,
+      "step": 197
+    },
+    {
+      "epoch": 0.061205564142194746,
+      "grad_norm": 0.13347265124320984,
+      "learning_rate": 0.00018834108527131783,
+      "loss": 0.8414,
+      "step": 198
+    },
+    {
+      "epoch": 0.06151468315301391,
+      "grad_norm": 0.14512066543102264,
+      "learning_rate": 0.00018827906976744187,
+      "loss": 0.8698,
+      "step": 199
+    },
+    {
+      "epoch": 0.061823802163833076,
+      "grad_norm": 0.11501649022102356,
+      "learning_rate": 0.0001882170542635659,
+      "loss": 0.8591,
+      "step": 200
+    },
+    {
+      "epoch": 0.06213292117465224,
+      "grad_norm": 0.11760124564170837,
+      "learning_rate": 0.00018815503875968995,
+      "loss": 0.6859,
+      "step": 201
+    },
+    {
+      "epoch": 0.062442040185471406,
+      "grad_norm": 0.13676373660564423,
+      "learning_rate": 0.00018809302325581399,
+      "loss": 0.7216,
+      "step": 202
+    },
+    {
+      "epoch": 0.06275115919629057,
+      "grad_norm": 0.11492311954498291,
+      "learning_rate": 0.000188031007751938,
+      "loss": 0.8223,
+      "step": 203
+    },
+    {
+      "epoch": 0.06306027820710974,
+      "grad_norm": 0.11568205058574677,
+      "learning_rate": 0.000187968992248062,
+      "loss": 0.8193,
+      "step": 204
+    },
+    {
+      "epoch": 0.0633693972179289,
+      "grad_norm": 0.12526321411132812,
+      "learning_rate": 0.00018790697674418605,
+      "loss": 0.8803,
+      "step": 205
+    },
+    {
+      "epoch": 0.06367851622874807,
+      "grad_norm": 0.12961214780807495,
+      "learning_rate": 0.00018784496124031009,
+      "loss": 0.8083,
+      "step": 206
+    },
+    {
+      "epoch": 0.06398763523956723,
+      "grad_norm": 0.11950293183326721,
+      "learning_rate": 0.00018778294573643412,
+      "loss": 0.8049,
+      "step": 207
+    },
+    {
+      "epoch": 0.0642967542503864,
+      "grad_norm": 0.11256164312362671,
+      "learning_rate": 0.00018772093023255814,
+      "loss": 0.7365,
+      "step": 208
+    },
+    {
+      "epoch": 0.06460587326120557,
+      "grad_norm": 0.13182170689105988,
+      "learning_rate": 0.00018765891472868217,
+      "loss": 0.7031,
+      "step": 209
+    },
+    {
+      "epoch": 0.06491499227202473,
+      "grad_norm": 0.1193682923913002,
+      "learning_rate": 0.00018759689922480621,
+      "loss": 0.8785,
+      "step": 210
+    },
+    {
+      "epoch": 0.06522411128284389,
+      "grad_norm": 0.13558265566825867,
+      "learning_rate": 0.00018753488372093025,
+      "loss": 0.8275,
+      "step": 211
+    },
+    {
+      "epoch": 0.06553323029366306,
+      "grad_norm": 0.12028771638870239,
+      "learning_rate": 0.00018747286821705426,
+      "loss": 0.8436,
+      "step": 212
+    },
+    {
+      "epoch": 0.06584234930448223,
+      "grad_norm": 0.12355880439281464,
+      "learning_rate": 0.0001874108527131783,
+      "loss": 0.687,
+      "step": 213
+    },
+    {
+      "epoch": 0.06615146831530139,
+      "grad_norm": 0.11852920055389404,
+      "learning_rate": 0.00018734883720930234,
+      "loss": 0.8268,
+      "step": 214
+    },
+    {
+      "epoch": 0.06646058732612056,
+      "grad_norm": 0.1262328028678894,
+      "learning_rate": 0.00018728682170542638,
+      "loss": 0.841,
+      "step": 215
+    },
+    {
+      "epoch": 0.06676970633693972,
+      "grad_norm": 0.13128647208213806,
+      "learning_rate": 0.00018722480620155042,
+      "loss": 0.7472,
+      "step": 216
+    },
+    {
+      "epoch": 0.06707882534775889,
+      "grad_norm": 0.12075427919626236,
+      "learning_rate": 0.00018716279069767443,
+      "loss": 0.8483,
+      "step": 217
+    },
+    {
+      "epoch": 0.06738794435857805,
+      "grad_norm": 0.11870454251766205,
+      "learning_rate": 0.00018710077519379844,
+      "loss": 0.8179,
+      "step": 218
+    },
+    {
+      "epoch": 0.06769706336939722,
+      "grad_norm": 0.12058960646390915,
+      "learning_rate": 0.00018703875968992248,
+      "loss": 0.8403,
+      "step": 219
+    },
+    {
+      "epoch": 0.06800618238021638,
+      "grad_norm": 0.13978858292102814,
+      "learning_rate": 0.00018697674418604652,
+      "loss": 0.6639,
+      "step": 220
+    },
+    {
+      "epoch": 0.06831530139103555,
+      "grad_norm": 0.10775326192378998,
+      "learning_rate": 0.00018691472868217056,
+      "loss": 0.8841,
+      "step": 221
+    },
+    {
+      "epoch": 0.06862442040185471,
+      "grad_norm": 0.10687053948640823,
+      "learning_rate": 0.00018685271317829457,
+      "loss": 0.7715,
+      "step": 222
+    },
+    {
+      "epoch": 0.06893353941267388,
+      "grad_norm": 0.12222916632890701,
+      "learning_rate": 0.0001867906976744186,
+      "loss": 0.7728,
+      "step": 223
+    },
+    {
+      "epoch": 0.06924265842349304,
+      "grad_norm": 0.13918592035770416,
+      "learning_rate": 0.00018672868217054265,
+      "loss": 0.8792,
+      "step": 224
+    },
+    {
+      "epoch": 0.0695517774343122,
+      "grad_norm": 0.11157078295946121,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.8906,
+      "step": 225
+    },
+    {
+      "epoch": 0.06986089644513138,
+      "grad_norm": 0.12403914332389832,
+      "learning_rate": 0.0001866046511627907,
+      "loss": 0.9234,
+      "step": 226
+    },
+    {
+      "epoch": 0.07017001545595054,
+      "grad_norm": 0.11490818858146667,
+      "learning_rate": 0.00018654263565891474,
+      "loss": 0.8447,
+      "step": 227
+    },
+    {
+      "epoch": 0.0704791344667697,
+      "grad_norm": 0.13033214211463928,
+      "learning_rate": 0.00018648062015503878,
+      "loss": 0.7801,
+      "step": 228
+    },
+    {
+      "epoch": 0.07078825347758887,
+      "grad_norm": 0.1061464175581932,
+      "learning_rate": 0.00018641860465116281,
+      "loss": 0.6313,
+      "step": 229
+    },
+    {
+      "epoch": 0.07109737248840804,
+      "grad_norm": 0.12007651478052139,
+      "learning_rate": 0.00018635658914728683,
+      "loss": 0.7526,
+      "step": 230
+    },
+    {
+      "epoch": 0.0714064914992272,
+      "grad_norm": 0.12450309842824936,
+      "learning_rate": 0.00018629457364341086,
+      "loss": 0.8403,
+      "step": 231
+    },
+    {
+      "epoch": 0.07171561051004637,
+      "grad_norm": 0.16374681890010834,
+      "learning_rate": 0.00018623255813953488,
+      "loss": 0.7729,
+      "step": 232
+    },
+    {
+      "epoch": 0.07202472952086553,
+      "grad_norm": 0.13087786734104156,
+      "learning_rate": 0.00018617054263565892,
+      "loss": 0.8882,
+      "step": 233
+    },
+    {
+      "epoch": 0.0723338485316847,
+      "grad_norm": 0.13743267953395844,
+      "learning_rate": 0.00018610852713178295,
+      "loss": 0.8316,
+      "step": 234
+    },
+    {
+      "epoch": 0.07264296754250386,
+      "grad_norm": 0.1110304743051529,
+      "learning_rate": 0.000186046511627907,
+      "loss": 0.773,
+      "step": 235
+    },
+    {
+      "epoch": 0.07295208655332303,
+      "grad_norm": 0.12651820480823517,
+      "learning_rate": 0.000185984496124031,
+      "loss": 0.9538,
+      "step": 236
+    },
+    {
+      "epoch": 0.0732612055641422,
+      "grad_norm": 0.11768705397844315,
+      "learning_rate": 0.00018592248062015504,
+      "loss": 0.7585,
+      "step": 237
+    },
+    {
+      "epoch": 0.07357032457496136,
+      "grad_norm": 0.1165948212146759,
+      "learning_rate": 0.00018586046511627908,
+      "loss": 0.868,
+      "step": 238
+    },
+    {
+      "epoch": 0.07387944358578052,
+      "grad_norm": 0.12231750786304474,
+      "learning_rate": 0.00018579844961240312,
+      "loss": 0.8445,
+      "step": 239
+    },
+    {
+      "epoch": 0.07418856259659969,
+      "grad_norm": 0.13796208798885345,
+      "learning_rate": 0.00018573643410852716,
+      "loss": 0.8391,
+      "step": 240
+    },
+    {
+      "epoch": 0.07449768160741886,
+      "grad_norm": 0.1166827604174614,
+      "learning_rate": 0.00018567441860465117,
+      "loss": 0.7763,
+      "step": 241
+    },
+    {
+      "epoch": 0.07480680061823802,
+      "grad_norm": 0.12125882506370544,
+      "learning_rate": 0.0001856124031007752,
+      "loss": 0.7467,
+      "step": 242
+    },
+    {
+      "epoch": 0.07511591962905718,
+      "grad_norm": 0.13202430307865143,
+      "learning_rate": 0.00018555038759689925,
+      "loss": 0.8348,
+      "step": 243
+    },
+    {
+      "epoch": 0.07542503863987635,
+      "grad_norm": 0.13780809938907623,
+      "learning_rate": 0.00018548837209302326,
+      "loss": 0.7159,
+      "step": 244
+    },
+    {
+      "epoch": 0.07573415765069552,
+      "grad_norm": 0.163734570145607,
+      "learning_rate": 0.0001854263565891473,
+      "loss": 0.8078,
+      "step": 245
+    },
+    {
+      "epoch": 0.07604327666151468,
+      "grad_norm": 0.15040288865566254,
+      "learning_rate": 0.0001853643410852713,
+      "loss": 0.7911,
+      "step": 246
+    },
+    {
+      "epoch": 0.07635239567233384,
+      "grad_norm": 0.13316433131694794,
+      "learning_rate": 0.00018530232558139535,
+      "loss": 0.8217,
+      "step": 247
+    },
+    {
+      "epoch": 0.07666151468315302,
+      "grad_norm": 0.14527438580989838,
+      "learning_rate": 0.0001852403100775194,
+      "loss": 0.8287,
+      "step": 248
+    },
+    {
+      "epoch": 0.07697063369397218,
+      "grad_norm": 0.11744588613510132,
+      "learning_rate": 0.00018517829457364343,
+      "loss": 0.8774,
+      "step": 249
+    },
+    {
+      "epoch": 0.07727975270479134,
+      "grad_norm": 0.15297925472259521,
+      "learning_rate": 0.00018511627906976744,
+      "loss": 0.8035,
+      "step": 250
+    },
+    {
+      "epoch": 0.0775888717156105,
+      "grad_norm": 0.12520894408226013,
+      "learning_rate": 0.00018505426356589148,
+      "loss": 0.812,
+      "step": 251
+    },
+    {
+      "epoch": 0.07789799072642968,
+      "grad_norm": 0.12046486139297485,
+      "learning_rate": 0.00018499224806201552,
+      "loss": 0.7703,
+      "step": 252
+    },
+    {
+      "epoch": 0.07820710973724884,
+      "grad_norm": 0.10196825861930847,
+      "learning_rate": 0.00018493023255813955,
+      "loss": 0.7299,
+      "step": 253
+    },
+    {
+      "epoch": 0.078516228748068,
+      "grad_norm": 0.12353216111660004,
+      "learning_rate": 0.0001848682170542636,
+      "loss": 0.6998,
+      "step": 254
+    },
+    {
+      "epoch": 0.07882534775888717,
+      "grad_norm": 0.10435248166322708,
+      "learning_rate": 0.0001848062015503876,
+      "loss": 0.8073,
+      "step": 255
+    },
+    {
+      "epoch": 0.07913446676970634,
+      "grad_norm": 0.1290121078491211,
+      "learning_rate": 0.00018474418604651164,
+      "loss": 0.7954,
+      "step": 256
+    },
+    {
+      "epoch": 0.0794435857805255,
+      "grad_norm": 0.12450750917196274,
+      "learning_rate": 0.00018468217054263566,
+      "loss": 0.9049,
+      "step": 257
+    },
+    {
+      "epoch": 0.07975270479134466,
+      "grad_norm": 0.1351582258939743,
+      "learning_rate": 0.0001846201550387597,
+      "loss": 0.7842,
+      "step": 258
+    },
+    {
+      "epoch": 0.08006182380216384,
+      "grad_norm": 0.13335275650024414,
+      "learning_rate": 0.00018455813953488373,
+      "loss": 0.7771,
+      "step": 259
+    },
+    {
+      "epoch": 0.080370942812983,
+      "grad_norm": 0.10518497973680496,
+      "learning_rate": 0.00018449612403100774,
+      "loss": 0.7927,
+      "step": 260
+    },
+    {
+      "epoch": 0.08068006182380216,
+      "grad_norm": 0.11359915882349014,
+      "learning_rate": 0.00018443410852713178,
+      "loss": 0.8461,
+      "step": 261
+    },
+    {
+      "epoch": 0.08098918083462132,
+      "grad_norm": 0.12962335348129272,
+      "learning_rate": 0.00018437209302325582,
+      "loss": 0.8299,
+      "step": 262
+    },
+    {
+      "epoch": 0.0812982998454405,
+      "grad_norm": 0.1394529491662979,
+      "learning_rate": 0.00018431007751937986,
+      "loss": 0.8031,
+      "step": 263
+    },
+    {
+      "epoch": 0.08160741885625966,
+      "grad_norm": 0.11067520827054977,
+      "learning_rate": 0.0001842480620155039,
+      "loss": 0.8019,
+      "step": 264
+    },
+    {
+      "epoch": 0.08191653786707882,
+      "grad_norm": 0.14076265692710876,
+      "learning_rate": 0.0001841860465116279,
+      "loss": 0.8521,
+      "step": 265
+    },
+    {
+      "epoch": 0.08222565687789798,
+      "grad_norm": 0.14540016651153564,
+      "learning_rate": 0.00018412403100775195,
+      "loss": 0.8124,
+      "step": 266
+    },
+    {
+      "epoch": 0.08253477588871716,
+      "grad_norm": 0.14692644774913788,
+      "learning_rate": 0.000184062015503876,
+      "loss": 0.8629,
+      "step": 267
+    },
+    {
+      "epoch": 0.08284389489953632,
+      "grad_norm": 0.12723390758037567,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.7565,
+      "step": 268
+    },
+    {
+      "epoch": 0.08315301391035548,
+      "grad_norm": 0.13681192696094513,
+      "learning_rate": 0.00018393798449612404,
+      "loss": 0.8458,
+      "step": 269
+    },
+    {
+      "epoch": 0.08346213292117466,
+      "grad_norm": 0.1476822942495346,
+      "learning_rate": 0.00018387596899224805,
+      "loss": 0.7544,
+      "step": 270
+    },
+    {
+      "epoch": 0.08377125193199382,
+      "grad_norm": 0.11408766359090805,
+      "learning_rate": 0.0001838139534883721,
+      "loss": 0.8782,
+      "step": 271
+    },
+    {
+      "epoch": 0.08408037094281298,
+      "grad_norm": 0.14710593223571777,
+      "learning_rate": 0.00018375193798449613,
+      "loss": 0.7568,
+      "step": 272
+    },
+    {
+      "epoch": 0.08438948995363214,
+      "grad_norm": 0.12558385729789734,
+      "learning_rate": 0.00018368992248062017,
+      "loss": 0.8574,
+      "step": 273
+    },
+    {
+      "epoch": 0.08469860896445132,
+      "grad_norm": 0.11890698224306107,
+      "learning_rate": 0.00018362790697674418,
+      "loss": 0.6688,
+      "step": 274
+    },
+    {
+      "epoch": 0.08500772797527048,
+      "grad_norm": 0.10440011322498322,
+      "learning_rate": 0.00018356589147286822,
+      "loss": 0.7699,
+      "step": 275
+    },
+    {
+      "epoch": 0.08531684698608964,
+      "grad_norm": 0.13725546002388,
+      "learning_rate": 0.00018350387596899226,
+      "loss": 0.8359,
+      "step": 276
+    },
+    {
+      "epoch": 0.0856259659969088,
+      "grad_norm": 0.12335329502820969,
+      "learning_rate": 0.0001834418604651163,
+      "loss": 0.7212,
+      "step": 277
+    },
+    {
+      "epoch": 0.08593508500772798,
+      "grad_norm": 0.1138865053653717,
+      "learning_rate": 0.00018337984496124033,
+      "loss": 0.6597,
+      "step": 278
+    },
+    {
+      "epoch": 0.08624420401854714,
+      "grad_norm": 0.11864970624446869,
+      "learning_rate": 0.00018331782945736435,
+      "loss": 0.7944,
+      "step": 279
+    },
+    {
+      "epoch": 0.0865533230293663,
+      "grad_norm": 0.14360670745372772,
+      "learning_rate": 0.00018325581395348838,
+      "loss": 0.8186,
+      "step": 280
+    },
+    {
+      "epoch": 0.08686244204018548,
+      "grad_norm": 0.13418716192245483,
+      "learning_rate": 0.00018319379844961242,
+      "loss": 0.8478,
+      "step": 281
+    },
+    {
+      "epoch": 0.08717156105100464,
+      "grad_norm": 0.13283377885818481,
+      "learning_rate": 0.00018313178294573646,
+      "loss": 0.8309,
+      "step": 282
+    },
+    {
+      "epoch": 0.0874806800618238,
+      "grad_norm": 0.11697278916835785,
+      "learning_rate": 0.00018306976744186047,
+      "loss": 0.8521,
+      "step": 283
+    },
+    {
+      "epoch": 0.08778979907264296,
+      "grad_norm": 0.11819571256637573,
+      "learning_rate": 0.00018300775193798448,
+      "loss": 0.6976,
+      "step": 284
+    },
+    {
+      "epoch": 0.08809891808346214,
+      "grad_norm": 0.11848420649766922,
+      "learning_rate": 0.00018294573643410852,
+      "loss": 0.8549,
+      "step": 285
+    },
+    {
+      "epoch": 0.0884080370942813,
+      "grad_norm": 0.10397352278232574,
+      "learning_rate": 0.00018288372093023256,
+      "loss": 0.6554,
+      "step": 286
+    },
+    {
+      "epoch": 0.08871715610510046,
+      "grad_norm": 0.15076309442520142,
+      "learning_rate": 0.0001828217054263566,
+      "loss": 0.6948,
+      "step": 287
+    },
+    {
+      "epoch": 0.08902627511591962,
+      "grad_norm": 0.13722991943359375,
+      "learning_rate": 0.0001827596899224806,
+      "loss": 0.8466,
+      "step": 288
+    },
+    {
+      "epoch": 0.0893353941267388,
+      "grad_norm": 0.11547433584928513,
+      "learning_rate": 0.00018269767441860465,
+      "loss": 0.7359,
+      "step": 289
+    },
+    {
+      "epoch": 0.08964451313755796,
+      "grad_norm": 0.12837247550487518,
+      "learning_rate": 0.0001826356589147287,
+      "loss": 0.8533,
+      "step": 290
+    },
+    {
+      "epoch": 0.08995363214837712,
+      "grad_norm": 0.11957511305809021,
+      "learning_rate": 0.00018257364341085273,
+      "loss": 0.6918,
+      "step": 291
+    },
+    {
+      "epoch": 0.0902627511591963,
+      "grad_norm": 0.11487089097499847,
+      "learning_rate": 0.00018251162790697677,
+      "loss": 0.8411,
+      "step": 292
+    },
+    {
+      "epoch": 0.09057187017001546,
+      "grad_norm": 0.12621980905532837,
+      "learning_rate": 0.00018244961240310078,
+      "loss": 0.7913,
+      "step": 293
+    },
+    {
+      "epoch": 0.09088098918083462,
+      "grad_norm": 0.14285391569137573,
+      "learning_rate": 0.00018238759689922482,
+      "loss": 0.9191,
+      "step": 294
+    },
+    {
+      "epoch": 0.09119010819165378,
+      "grad_norm": 0.14195428788661957,
+      "learning_rate": 0.00018232558139534886,
+      "loss": 0.8699,
+      "step": 295
+    },
+    {
+      "epoch": 0.09149922720247296,
+      "grad_norm": 0.12400256842374802,
+      "learning_rate": 0.0001822635658914729,
+      "loss": 0.7707,
+      "step": 296
+    },
+    {
+      "epoch": 0.09180834621329212,
+      "grad_norm": 0.1220916360616684,
+      "learning_rate": 0.0001822015503875969,
+      "loss": 0.7948,
+      "step": 297
+    },
+    {
+      "epoch": 0.09211746522411128,
+      "grad_norm": 0.11888230592012405,
+      "learning_rate": 0.00018213953488372092,
+      "loss": 0.8237,
+      "step": 298
+    },
+    {
+      "epoch": 0.09242658423493044,
+      "grad_norm": 0.134236678481102,
+      "learning_rate": 0.00018207751937984496,
+      "loss": 0.7503,
+      "step": 299
+    },
+    {
+      "epoch": 0.09273570324574962,
+      "grad_norm": 0.09614330530166626,
+      "learning_rate": 0.000182015503875969,
+      "loss": 0.772,
+      "step": 300
+    },
+    {
+      "epoch": 0.09304482225656878,
+      "grad_norm": 0.11686000227928162,
+      "learning_rate": 0.00018195348837209303,
+      "loss": 0.8648,
+      "step": 301
+    },
+    {
+      "epoch": 0.09335394126738794,
+      "grad_norm": 0.11321427673101425,
+      "learning_rate": 0.00018189147286821707,
+      "loss": 0.8302,
+      "step": 302
+    },
+    {
+      "epoch": 0.09366306027820712,
+      "grad_norm": 0.12898504734039307,
+      "learning_rate": 0.00018182945736434109,
+      "loss": 0.8,
+      "step": 303
+    },
+    {
+      "epoch": 0.09397217928902628,
+      "grad_norm": 0.10747554153203964,
+      "learning_rate": 0.00018176744186046512,
+      "loss": 0.7696,
+      "step": 304
+    },
+    {
+      "epoch": 0.09428129829984544,
+      "grad_norm": 0.12886860966682434,
+      "learning_rate": 0.00018170542635658916,
+      "loss": 0.8459,
+      "step": 305
+    },
+    {
+      "epoch": 0.0945904173106646,
+      "grad_norm": 0.12835724651813507,
+      "learning_rate": 0.0001816434108527132,
+      "loss": 0.8523,
+      "step": 306
+    },
+    {
+      "epoch": 0.09489953632148378,
+      "grad_norm": 0.11970589309930801,
+      "learning_rate": 0.0001815813953488372,
+      "loss": 0.7864,
+      "step": 307
+    },
+    {
+      "epoch": 0.09520865533230294,
+      "grad_norm": 0.12258201837539673,
+      "learning_rate": 0.00018151937984496125,
+      "loss": 0.7593,
+      "step": 308
+    },
+    {
+      "epoch": 0.0955177743431221,
+      "grad_norm": 0.1291266828775406,
+      "learning_rate": 0.0001814573643410853,
+      "loss": 0.925,
+      "step": 309
+    },
+    {
+      "epoch": 0.09582689335394126,
+      "grad_norm": 0.12266039103269577,
+      "learning_rate": 0.0001813953488372093,
+      "loss": 0.7341,
+      "step": 310
+    },
+    {
+      "epoch": 0.09613601236476044,
+      "grad_norm": 0.10808485746383667,
+      "learning_rate": 0.00018133333333333334,
+      "loss": 0.8983,
+      "step": 311
+    },
+    {
+      "epoch": 0.0964451313755796,
+      "grad_norm": 0.1303120255470276,
+      "learning_rate": 0.00018127131782945735,
+      "loss": 0.9788,
+      "step": 312
+    },
+    {
+      "epoch": 0.09675425038639876,
+      "grad_norm": 0.1282745897769928,
+      "learning_rate": 0.0001812093023255814,
+      "loss": 0.6776,
+      "step": 313
+    },
+    {
+      "epoch": 0.09706336939721794,
+      "grad_norm": 0.10674197226762772,
+      "learning_rate": 0.00018114728682170543,
+      "loss": 0.819,
+      "step": 314
+    },
+    {
+      "epoch": 0.0973724884080371,
+      "grad_norm": 0.10732909291982651,
+      "learning_rate": 0.00018108527131782947,
+      "loss": 0.8426,
+      "step": 315
+    },
+    {
+      "epoch": 0.09768160741885626,
+      "grad_norm": 0.14202672243118286,
+      "learning_rate": 0.0001810232558139535,
+      "loss": 0.8447,
+      "step": 316
+    },
+    {
+      "epoch": 0.09799072642967542,
+      "grad_norm": 0.12257728725671768,
+      "learning_rate": 0.00018096124031007752,
+      "loss": 0.8032,
+      "step": 317
+    },
+    {
+      "epoch": 0.0982998454404946,
+      "grad_norm": 0.11397712677717209,
+      "learning_rate": 0.00018089922480620156,
+      "loss": 0.7667,
+      "step": 318
+    },
+    {
+      "epoch": 0.09860896445131376,
+      "grad_norm": 0.11759169399738312,
+      "learning_rate": 0.0001808372093023256,
+      "loss": 0.775,
+      "step": 319
+    },
+    {
+      "epoch": 0.09891808346213292,
+      "grad_norm": 0.10919482260942459,
+      "learning_rate": 0.00018077519379844964,
+      "loss": 0.6987,
+      "step": 320
+    },
+    {
+      "epoch": 0.09922720247295208,
+      "grad_norm": 0.14136123657226562,
+      "learning_rate": 0.00018071317829457365,
+      "loss": 0.7642,
+      "step": 321
+    },
+    {
+      "epoch": 0.09953632148377126,
+      "grad_norm": 0.12550586462020874,
+      "learning_rate": 0.00018065116279069769,
+      "loss": 0.9144,
+      "step": 322
+    },
+    {
+      "epoch": 0.09984544049459042,
+      "grad_norm": 0.1267971247434616,
+      "learning_rate": 0.00018058914728682172,
+      "loss": 0.8512,
+      "step": 323
+    },
+    {
+      "epoch": 0.10015455950540958,
+      "grad_norm": 0.12473420053720474,
+      "learning_rate": 0.00018052713178294574,
+      "loss": 0.9391,
+      "step": 324
+    },
+    {
+      "epoch": 0.10046367851622875,
+      "grad_norm": 0.11510586738586426,
+      "learning_rate": 0.00018046511627906977,
+      "loss": 0.7916,
+      "step": 325
+    },
+    {
+      "epoch": 0.10077279752704792,
+      "grad_norm": 0.13380743563175201,
+      "learning_rate": 0.00018040310077519381,
+      "loss": 0.819,
+      "step": 326
+    },
+    {
+      "epoch": 0.10108191653786708,
+      "grad_norm": 0.1224348247051239,
+      "learning_rate": 0.00018034108527131783,
+      "loss": 0.778,
+      "step": 327
+    },
+    {
+      "epoch": 0.10139103554868624,
+      "grad_norm": 0.11977488547563553,
+      "learning_rate": 0.00018027906976744186,
+      "loss": 0.8521,
+      "step": 328
+    },
+    {
+      "epoch": 0.10170015455950542,
+      "grad_norm": 0.11883991211652756,
+      "learning_rate": 0.0001802170542635659,
+      "loss": 0.8959,
+      "step": 329
+    },
+    {
+      "epoch": 0.10200927357032458,
+      "grad_norm": 0.13148127496242523,
+      "learning_rate": 0.00018015503875968994,
+      "loss": 0.7503,
+      "step": 330
+    },
+    {
+      "epoch": 0.10231839258114374,
+      "grad_norm": 0.12128669023513794,
+      "learning_rate": 0.00018009302325581395,
+      "loss": 0.7469,
+      "step": 331
+    },
+    {
+      "epoch": 0.1026275115919629,
+      "grad_norm": 0.12330310046672821,
+      "learning_rate": 0.000180031007751938,
+      "loss": 0.83,
+      "step": 332
+    },
+    {
+      "epoch": 0.10293663060278208,
+      "grad_norm": 0.10930616408586502,
+      "learning_rate": 0.00017996899224806203,
+      "loss": 0.7841,
+      "step": 333
+    },
+    {
+      "epoch": 0.10324574961360124,
+      "grad_norm": 0.12586379051208496,
+      "learning_rate": 0.00017990697674418607,
+      "loss": 0.729,
+      "step": 334
+    },
+    {
+      "epoch": 0.1035548686244204,
+      "grad_norm": 0.11840980499982834,
+      "learning_rate": 0.0001798449612403101,
+      "loss": 0.7329,
+      "step": 335
+    },
+    {
+      "epoch": 0.10386398763523957,
+      "grad_norm": 0.11878569424152374,
+      "learning_rate": 0.00017978294573643412,
+      "loss": 0.8202,
+      "step": 336
+    },
+    {
+      "epoch": 0.10417310664605874,
+      "grad_norm": 0.1265515387058258,
+      "learning_rate": 0.00017972093023255813,
+      "loss": 0.6638,
+      "step": 337
+    },
+    {
+      "epoch": 0.1044822256568779,
+      "grad_norm": 0.1272660791873932,
+      "learning_rate": 0.00017965891472868217,
+      "loss": 0.8512,
+      "step": 338
+    },
+    {
+      "epoch": 0.10479134466769706,
+      "grad_norm": 0.11359579861164093,
+      "learning_rate": 0.0001795968992248062,
+      "loss": 0.8199,
+      "step": 339
+    },
+    {
+      "epoch": 0.10510046367851623,
+      "grad_norm": 0.11645165085792542,
+      "learning_rate": 0.00017953488372093025,
+      "loss": 0.9163,
+      "step": 340
+    },
+    {
+      "epoch": 0.1054095826893354,
+      "grad_norm": 0.11384947597980499,
+      "learning_rate": 0.00017947286821705426,
+      "loss": 0.7825,
+      "step": 341
+    },
+    {
+      "epoch": 0.10571870170015456,
+      "grad_norm": 0.11389808356761932,
+      "learning_rate": 0.0001794108527131783,
+      "loss": 0.8078,
+      "step": 342
+    },
+    {
+      "epoch": 0.10602782071097372,
+      "grad_norm": 0.12317777425050735,
+      "learning_rate": 0.00017934883720930234,
+      "loss": 0.9068,
+      "step": 343
+    },
+    {
+      "epoch": 0.1063369397217929,
+      "grad_norm": 0.10351788252592087,
+      "learning_rate": 0.00017928682170542638,
+      "loss": 0.8258,
+      "step": 344
+    },
+    {
+      "epoch": 0.10664605873261206,
+      "grad_norm": 0.11422822624444962,
+      "learning_rate": 0.0001792248062015504,
+      "loss": 0.8725,
+      "step": 345
+    },
+    {
+      "epoch": 0.10695517774343122,
+      "grad_norm": 0.11480465531349182,
+      "learning_rate": 0.00017916279069767443,
+      "loss": 0.8415,
+      "step": 346
+    },
+    {
+      "epoch": 0.10726429675425038,
+      "grad_norm": 0.11581287533044815,
+      "learning_rate": 0.00017910077519379846,
+      "loss": 0.6787,
+      "step": 347
+    },
+    {
+      "epoch": 0.10757341576506955,
+      "grad_norm": 0.10481414198875427,
+      "learning_rate": 0.0001790387596899225,
+      "loss": 0.6735,
+      "step": 348
+    },
+    {
+      "epoch": 0.10788253477588872,
+      "grad_norm": 0.12571753561496735,
+      "learning_rate": 0.00017897674418604654,
+      "loss": 0.7918,
+      "step": 349
+    },
+    {
+      "epoch": 0.10819165378670788,
+      "grad_norm": 0.1036786288022995,
+      "learning_rate": 0.00017891472868217055,
+      "loss": 0.7985,
+      "step": 350
+    },
+    {
+      "epoch": 0.10850077279752705,
+      "grad_norm": 0.12399487942457199,
+      "learning_rate": 0.00017885271317829457,
+      "loss": 0.9029,
+      "step": 351
+    },
+    {
+      "epoch": 0.10880989180834622,
+      "grad_norm": 0.1186407133936882,
+      "learning_rate": 0.0001787906976744186,
+      "loss": 0.7395,
+      "step": 352
+    },
+    {
+      "epoch": 0.10911901081916538,
+      "grad_norm": 0.1321779191493988,
+      "learning_rate": 0.00017872868217054264,
+      "loss": 0.7905,
+      "step": 353
+    },
+    {
+      "epoch": 0.10942812982998454,
+      "grad_norm": 0.14797626435756683,
+      "learning_rate": 0.00017866666666666668,
+      "loss": 0.9305,
+      "step": 354
+    },
+    {
+      "epoch": 0.10973724884080371,
+      "grad_norm": 0.11093270033597946,
+      "learning_rate": 0.0001786046511627907,
+      "loss": 0.8375,
+      "step": 355
+    },
+    {
+      "epoch": 0.11004636785162288,
+      "grad_norm": 0.11384811252355576,
+      "learning_rate": 0.00017854263565891473,
+      "loss": 0.639,
+      "step": 356
+    },
+    {
+      "epoch": 0.11035548686244204,
+      "grad_norm": 0.13438202440738678,
+      "learning_rate": 0.00017848062015503877,
+      "loss": 0.777,
+      "step": 357
+    },
+    {
+      "epoch": 0.1106646058732612,
+      "grad_norm": 0.11255431920289993,
+      "learning_rate": 0.0001784186046511628,
+      "loss": 0.7742,
+      "step": 358
+    },
+    {
+      "epoch": 0.11097372488408037,
+      "grad_norm": 0.12108633667230606,
+      "learning_rate": 0.00017835658914728682,
+      "loss": 0.7723,
+      "step": 359
+    },
+    {
+      "epoch": 0.11128284389489954,
+      "grad_norm": 0.13009031116962433,
+      "learning_rate": 0.00017829457364341086,
+      "loss": 0.8108,
+      "step": 360
+    },
+    {
+      "epoch": 0.1115919629057187,
+      "grad_norm": 0.10433927178382874,
+      "learning_rate": 0.0001782325581395349,
+      "loss": 0.6764,
+      "step": 361
+    },
+    {
+      "epoch": 0.11190108191653787,
+      "grad_norm": 0.132685124874115,
+      "learning_rate": 0.00017817054263565894,
+      "loss": 0.8012,
+      "step": 362
+    },
+    {
+      "epoch": 0.11221020092735703,
+      "grad_norm": 0.11265043169260025,
+      "learning_rate": 0.00017810852713178298,
+      "loss": 0.7925,
+      "step": 363
+    },
+    {
+      "epoch": 0.1125193199381762,
+      "grad_norm": 0.12110339850187302,
+      "learning_rate": 0.000178046511627907,
+      "loss": 0.8647,
+      "step": 364
+    },
+    {
+      "epoch": 0.11282843894899536,
+      "grad_norm": 0.1396140158176422,
+      "learning_rate": 0.000177984496124031,
+      "loss": 0.7221,
+      "step": 365
+    },
+    {
+      "epoch": 0.11313755795981453,
+      "grad_norm": 0.11034229397773743,
+      "learning_rate": 0.00017792248062015504,
+      "loss": 0.8636,
+      "step": 366
+    },
+    {
+      "epoch": 0.1134466769706337,
+      "grad_norm": 0.12675125896930695,
+      "learning_rate": 0.00017786046511627908,
+      "loss": 0.7799,
+      "step": 367
+    },
+    {
+      "epoch": 0.11375579598145286,
+      "grad_norm": 0.10970692336559296,
+      "learning_rate": 0.00017779844961240312,
+      "loss": 0.8741,
+      "step": 368
+    },
+    {
+      "epoch": 0.11406491499227202,
+      "grad_norm": 0.1316499263048172,
+      "learning_rate": 0.00017773643410852713,
+      "loss": 0.7235,
+      "step": 369
+    },
+    {
+      "epoch": 0.1143740340030912,
+      "grad_norm": 0.15425892174243927,
+      "learning_rate": 0.00017767441860465117,
+      "loss": 0.7253,
+      "step": 370
+    },
+    {
+      "epoch": 0.11468315301391035,
+      "grad_norm": 0.1116160973906517,
+      "learning_rate": 0.0001776124031007752,
+      "loss": 0.7544,
+      "step": 371
+    },
+    {
+      "epoch": 0.11499227202472952,
+      "grad_norm": 0.112430639564991,
+      "learning_rate": 0.00017755038759689924,
+      "loss": 0.8633,
+      "step": 372
+    },
+    {
+      "epoch": 0.11530139103554869,
+      "grad_norm": 0.12070276588201523,
+      "learning_rate": 0.00017748837209302328,
+      "loss": 0.7936,
+      "step": 373
+    },
+    {
+      "epoch": 0.11561051004636785,
+      "grad_norm": 0.14540359377861023,
+      "learning_rate": 0.0001774263565891473,
+      "loss": 0.9689,
+      "step": 374
+    },
+    {
+      "epoch": 0.11591962905718702,
+      "grad_norm": 0.1259058117866516,
+      "learning_rate": 0.00017736434108527133,
+      "loss": 0.8228,
+      "step": 375
+    },
+    {
+      "epoch": 0.11622874806800618,
+      "grad_norm": 0.09805137664079666,
+      "learning_rate": 0.00017730232558139537,
+      "loss": 0.7173,
+      "step": 376
+    },
+    {
+      "epoch": 0.11653786707882535,
+      "grad_norm": 0.1228744387626648,
+      "learning_rate": 0.00017724031007751938,
+      "loss": 0.7926,
+      "step": 377
+    },
+    {
+      "epoch": 0.11684698608964451,
+      "grad_norm": 0.1288052648305893,
+      "learning_rate": 0.00017717829457364342,
+      "loss": 0.7403,
+      "step": 378
+    },
+    {
+      "epoch": 0.11715610510046368,
+      "grad_norm": 0.11749331653118134,
+      "learning_rate": 0.00017711627906976743,
+      "loss": 0.702,
+      "step": 379
+    },
+    {
+      "epoch": 0.11746522411128284,
+      "grad_norm": 0.12872126698493958,
+      "learning_rate": 0.00017705426356589147,
+      "loss": 0.7654,
+      "step": 380
+    },
+    {
+      "epoch": 0.11777434312210201,
+      "grad_norm": 0.12806439399719238,
+      "learning_rate": 0.0001769922480620155,
+      "loss": 0.897,
+      "step": 381
+    },
+    {
+      "epoch": 0.11808346213292117,
+      "grad_norm": 0.1399737149477005,
+      "learning_rate": 0.00017693023255813955,
+      "loss": 0.7706,
+      "step": 382
+    },
+    {
+      "epoch": 0.11839258114374034,
+      "grad_norm": 0.14267806708812714,
+      "learning_rate": 0.00017686821705426356,
+      "loss": 0.8854,
+      "step": 383
+    },
+    {
+      "epoch": 0.11870170015455951,
+      "grad_norm": 0.10857547074556351,
+      "learning_rate": 0.0001768062015503876,
+      "loss": 0.7281,
+      "step": 384
+    },
+    {
+      "epoch": 0.11901081916537867,
+      "grad_norm": 0.11292342841625214,
+      "learning_rate": 0.00017674418604651164,
+      "loss": 0.7288,
+      "step": 385
+    },
+    {
+      "epoch": 0.11931993817619783,
+      "grad_norm": 0.117954321205616,
+      "learning_rate": 0.00017668217054263568,
+      "loss": 0.7577,
+      "step": 386
+    },
+    {
+      "epoch": 0.119629057187017,
+      "grad_norm": 0.11536258459091187,
+      "learning_rate": 0.00017662015503875972,
+      "loss": 0.8542,
+      "step": 387
+    },
+    {
+      "epoch": 0.11993817619783617,
+      "grad_norm": 0.13967657089233398,
+      "learning_rate": 0.00017655813953488373,
+      "loss": 0.7666,
+      "step": 388
+    },
+    {
+      "epoch": 0.12024729520865533,
+      "grad_norm": 0.124544158577919,
+      "learning_rate": 0.00017649612403100777,
+      "loss": 0.8928,
+      "step": 389
+    },
+    {
+      "epoch": 0.1205564142194745,
+      "grad_norm": 0.11185236275196075,
+      "learning_rate": 0.00017643410852713178,
+      "loss": 0.665,
+      "step": 390
+    },
+    {
+      "epoch": 0.12086553323029366,
+      "grad_norm": 0.11170051246881485,
+      "learning_rate": 0.00017637209302325582,
+      "loss": 0.7362,
+      "step": 391
+    },
+    {
+      "epoch": 0.12117465224111283,
+      "grad_norm": 0.12095949798822403,
+      "learning_rate": 0.00017631007751937986,
+      "loss": 0.7309,
+      "step": 392
+    },
+    {
+      "epoch": 0.121483771251932,
+      "grad_norm": 0.12416354566812515,
+      "learning_rate": 0.00017624806201550387,
+      "loss": 0.7633,
+      "step": 393
+    },
+    {
+      "epoch": 0.12179289026275116,
+      "grad_norm": 0.11069466918706894,
+      "learning_rate": 0.0001761860465116279,
+      "loss": 0.7525,
+      "step": 394
+    },
+    {
+      "epoch": 0.12210200927357033,
+      "grad_norm": 0.11477687954902649,
+      "learning_rate": 0.00017612403100775195,
+      "loss": 0.8462,
+      "step": 395
+    },
+    {
+      "epoch": 0.12241112828438949,
+      "grad_norm": 0.13723276555538177,
+      "learning_rate": 0.00017606201550387598,
+      "loss": 0.9396,
+      "step": 396
+    },
+    {
+      "epoch": 0.12272024729520865,
+      "grad_norm": 0.11079475283622742,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.8683,
+      "step": 397
+    },
+    {
+      "epoch": 0.12302936630602782,
+      "grad_norm": 0.11033523827791214,
+      "learning_rate": 0.00017593798449612403,
+      "loss": 0.6651,
+      "step": 398
+    },
+    {
+      "epoch": 0.12333848531684699,
+      "grad_norm": 0.11088595539331436,
+      "learning_rate": 0.00017587596899224807,
+      "loss": 0.7183,
+      "step": 399
+    },
+    {
+      "epoch": 0.12364760432766615,
+      "grad_norm": 0.11949151009321213,
+      "learning_rate": 0.0001758139534883721,
+      "loss": 0.752,
+      "step": 400
+    },
+    {
+      "epoch": 0.12395672333848531,
+      "grad_norm": 0.11605624854564667,
+      "learning_rate": 0.00017575193798449615,
+      "loss": 0.8398,
+      "step": 401
+    },
+    {
+      "epoch": 0.12426584234930448,
+      "grad_norm": 0.1079692542552948,
+      "learning_rate": 0.00017568992248062016,
+      "loss": 0.8006,
+      "step": 402
+    },
+    {
+      "epoch": 0.12457496136012365,
+      "grad_norm": 0.12857861816883087,
+      "learning_rate": 0.0001756279069767442,
+      "loss": 0.754,
+      "step": 403
+    },
+    {
+      "epoch": 0.12488408037094281,
+      "grad_norm": 0.11760881543159485,
+      "learning_rate": 0.0001755658914728682,
+      "loss": 0.7992,
+      "step": 404
+    },
+    {
+      "epoch": 0.125193199381762,
+      "grad_norm": 0.1251303255558014,
+      "learning_rate": 0.00017550387596899225,
+      "loss": 0.8104,
+      "step": 405
+    },
+    {
+      "epoch": 0.12550231839258114,
+      "grad_norm": 0.1187320277094841,
+      "learning_rate": 0.0001754418604651163,
+      "loss": 0.7542,
+      "step": 406
+    },
+    {
+      "epoch": 0.1258114374034003,
+      "grad_norm": 0.1084708720445633,
+      "learning_rate": 0.0001753798449612403,
+      "loss": 0.7296,
+      "step": 407
+    },
+    {
+      "epoch": 0.1261205564142195,
+      "grad_norm": 0.1298135370016098,
+      "learning_rate": 0.00017531782945736434,
+      "loss": 0.7441,
+      "step": 408
+    },
+    {
+      "epoch": 0.12642967542503863,
+      "grad_norm": 0.1294536590576172,
+      "learning_rate": 0.00017525581395348838,
+      "loss": 0.7905,
+      "step": 409
+    },
+    {
+      "epoch": 0.1267387944358578,
+      "grad_norm": 0.10958458483219147,
+      "learning_rate": 0.00017519379844961242,
+      "loss": 0.8569,
+      "step": 410
+    },
+    {
+      "epoch": 0.12704791344667696,
+      "grad_norm": 0.12941788136959076,
+      "learning_rate": 0.00017513178294573646,
+      "loss": 0.8383,
+      "step": 411
+    },
+    {
+      "epoch": 0.12735703245749613,
+      "grad_norm": 0.12861841917037964,
+      "learning_rate": 0.00017506976744186047,
+      "loss": 0.7911,
+      "step": 412
+    },
+    {
+      "epoch": 0.1276661514683153,
+      "grad_norm": 0.13337025046348572,
+      "learning_rate": 0.0001750077519379845,
+      "loss": 0.7918,
+      "step": 413
+    },
+    {
+      "epoch": 0.12797527047913446,
+      "grad_norm": 0.11046712100505829,
+      "learning_rate": 0.00017494573643410855,
+      "loss": 0.6964,
+      "step": 414
+    },
+    {
+      "epoch": 0.12828438948995363,
+      "grad_norm": 0.10456400364637375,
+      "learning_rate": 0.00017488372093023258,
+      "loss": 0.8032,
+      "step": 415
+    },
+    {
+      "epoch": 0.1285935085007728,
+      "grad_norm": 0.1251031905412674,
+      "learning_rate": 0.0001748217054263566,
+      "loss": 0.7741,
+      "step": 416
+    },
+    {
+      "epoch": 0.12890262751159196,
+      "grad_norm": 0.13418059051036835,
+      "learning_rate": 0.0001747596899224806,
+      "loss": 0.862,
+      "step": 417
+    },
+    {
+      "epoch": 0.12921174652241113,
+      "grad_norm": 0.11014249175786972,
+      "learning_rate": 0.00017469767441860465,
+      "loss": 0.8202,
+      "step": 418
+    },
+    {
+      "epoch": 0.1295208655332303,
+      "grad_norm": 0.13367420434951782,
+      "learning_rate": 0.00017463565891472869,
+      "loss": 0.8483,
+      "step": 419
+    },
+    {
+      "epoch": 0.12982998454404945,
+      "grad_norm": 0.11982861161231995,
+      "learning_rate": 0.00017457364341085272,
+      "loss": 0.7669,
+      "step": 420
+    },
+    {
+      "epoch": 0.13013910355486863,
+      "grad_norm": 0.13078713417053223,
+      "learning_rate": 0.00017451162790697674,
+      "loss": 0.7996,
+      "step": 421
+    },
+    {
+      "epoch": 0.13044822256568778,
+      "grad_norm": 0.1363217681646347,
+      "learning_rate": 0.00017444961240310077,
+      "loss": 0.8364,
+      "step": 422
+    },
+    {
+      "epoch": 0.13075734157650695,
+      "grad_norm": 0.11756312847137451,
+      "learning_rate": 0.0001743875968992248,
+      "loss": 0.7808,
+      "step": 423
+    },
+    {
+      "epoch": 0.13106646058732613,
+      "grad_norm": 0.12155081331729889,
+      "learning_rate": 0.00017432558139534885,
+      "loss": 0.7729,
+      "step": 424
+    },
+    {
+      "epoch": 0.13137557959814528,
+      "grad_norm": 0.13399578630924225,
+      "learning_rate": 0.0001742635658914729,
+      "loss": 0.7278,
+      "step": 425
+    },
+    {
+      "epoch": 0.13168469860896445,
+      "grad_norm": 0.12727884948253632,
+      "learning_rate": 0.0001742015503875969,
+      "loss": 0.7534,
+      "step": 426
+    },
+    {
+      "epoch": 0.13199381761978363,
+      "grad_norm": 0.13630586862564087,
+      "learning_rate": 0.00017413953488372094,
+      "loss": 0.7763,
+      "step": 427
+    },
+    {
+      "epoch": 0.13230293663060277,
+      "grad_norm": 0.14212100207805634,
+      "learning_rate": 0.00017407751937984498,
+      "loss": 0.8291,
+      "step": 428
+    },
+    {
+      "epoch": 0.13261205564142195,
+      "grad_norm": 0.11936759203672409,
+      "learning_rate": 0.00017401550387596902,
+      "loss": 0.9107,
+      "step": 429
+    },
+    {
+      "epoch": 0.13292117465224113,
+      "grad_norm": 0.11957745999097824,
+      "learning_rate": 0.00017395348837209303,
+      "loss": 0.7514,
+      "step": 430
+    },
+    {
+      "epoch": 0.13323029366306027,
+      "grad_norm": 0.11473491042852402,
+      "learning_rate": 0.00017389147286821704,
+      "loss": 0.8061,
+      "step": 431
+    },
+    {
+      "epoch": 0.13353941267387945,
+      "grad_norm": 0.12292005121707916,
+      "learning_rate": 0.00017382945736434108,
+      "loss": 0.801,
+      "step": 432
+    },
+    {
+      "epoch": 0.1338485316846986,
+      "grad_norm": 0.11472901701927185,
+      "learning_rate": 0.00017376744186046512,
+      "loss": 0.7885,
+      "step": 433
+    },
+    {
+      "epoch": 0.13415765069551777,
+      "grad_norm": 0.1211596429347992,
+      "learning_rate": 0.00017370542635658916,
+      "loss": 0.8281,
+      "step": 434
+    },
+    {
+      "epoch": 0.13446676970633695,
+      "grad_norm": 0.1142617017030716,
+      "learning_rate": 0.0001736434108527132,
+      "loss": 0.823,
+      "step": 435
+    },
+    {
+      "epoch": 0.1347758887171561,
+      "grad_norm": 0.10048012435436249,
+      "learning_rate": 0.0001735813953488372,
+      "loss": 0.8976,
+      "step": 436
+    },
+    {
+      "epoch": 0.13508500772797527,
+      "grad_norm": 0.12125738710165024,
+      "learning_rate": 0.00017351937984496125,
+      "loss": 0.8559,
+      "step": 437
+    },
+    {
+      "epoch": 0.13539412673879445,
+      "grad_norm": 0.12249696254730225,
+      "learning_rate": 0.00017345736434108529,
+      "loss": 0.783,
+      "step": 438
+    },
+    {
+      "epoch": 0.1357032457496136,
+      "grad_norm": 0.14693719148635864,
+      "learning_rate": 0.00017339534883720932,
+      "loss": 0.8892,
+      "step": 439
+    },
+    {
+      "epoch": 0.13601236476043277,
+      "grad_norm": 0.12531165778636932,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.8706,
+      "step": 440
+    },
+    {
+      "epoch": 0.13632148377125194,
+      "grad_norm": 0.1262034773826599,
+      "learning_rate": 0.00017327131782945737,
+      "loss": 0.7885,
+      "step": 441
+    },
+    {
+      "epoch": 0.1366306027820711,
+      "grad_norm": 0.12192118167877197,
+      "learning_rate": 0.00017320930232558141,
+      "loss": 0.7533,
+      "step": 442
+    },
+    {
+      "epoch": 0.13693972179289027,
+      "grad_norm": 0.12393314391374588,
+      "learning_rate": 0.00017314728682170545,
+      "loss": 0.7126,
+      "step": 443
+    },
+    {
+      "epoch": 0.13724884080370942,
+      "grad_norm": 0.14559726417064667,
+      "learning_rate": 0.00017308527131782946,
+      "loss": 0.7041,
+      "step": 444
+    },
+    {
+      "epoch": 0.1375579598145286,
+      "grad_norm": 0.11685144901275635,
+      "learning_rate": 0.00017302325581395348,
+      "loss": 0.7084,
+      "step": 445
+    },
+    {
+      "epoch": 0.13786707882534777,
+      "grad_norm": 0.12664124369621277,
+      "learning_rate": 0.00017296124031007751,
+      "loss": 0.7724,
+      "step": 446
+    },
+    {
+      "epoch": 0.13817619783616691,
+      "grad_norm": 0.1175457313656807,
+      "learning_rate": 0.00017289922480620155,
+      "loss": 0.8241,
+      "step": 447
+    },
+    {
+      "epoch": 0.1384853168469861,
+      "grad_norm": 0.11846484243869781,
+      "learning_rate": 0.0001728372093023256,
+      "loss": 0.8515,
+      "step": 448
+    },
+    {
+      "epoch": 0.13879443585780527,
+      "grad_norm": 0.13215206563472748,
+      "learning_rate": 0.00017277519379844963,
+      "loss": 0.8095,
+      "step": 449
+    },
+    {
+      "epoch": 0.1391035548686244,
+      "grad_norm": 0.1288730353116989,
+      "learning_rate": 0.00017271317829457364,
+      "loss": 0.7236,
+      "step": 450
+    },
+    {
+      "epoch": 0.1394126738794436,
+      "grad_norm": 0.11009534448385239,
+      "learning_rate": 0.00017265116279069768,
+      "loss": 0.8836,
+      "step": 451
+    },
+    {
+      "epoch": 0.13972179289026276,
+      "grad_norm": 0.1256999373435974,
+      "learning_rate": 0.00017258914728682172,
+      "loss": 0.8412,
+      "step": 452
+    },
+    {
+      "epoch": 0.1400309119010819,
+      "grad_norm": 0.12464401125907898,
+      "learning_rate": 0.00017252713178294576,
+      "loss": 0.8152,
+      "step": 453
+    },
+    {
+      "epoch": 0.1403400309119011,
+      "grad_norm": 0.11386653035879135,
+      "learning_rate": 0.00017246511627906977,
+      "loss": 0.7923,
+      "step": 454
+    },
+    {
+      "epoch": 0.14064914992272023,
+      "grad_norm": 0.11337646096944809,
+      "learning_rate": 0.0001724031007751938,
+      "loss": 0.6623,
+      "step": 455
+    },
+    {
+      "epoch": 0.1409582689335394,
+      "grad_norm": 0.13900204002857208,
+      "learning_rate": 0.00017234108527131785,
+      "loss": 0.8303,
+      "step": 456
+    },
+    {
+      "epoch": 0.14126738794435859,
+      "grad_norm": 0.13519424200057983,
+      "learning_rate": 0.00017227906976744186,
+      "loss": 0.7967,
+      "step": 457
+    },
+    {
+      "epoch": 0.14157650695517773,
+      "grad_norm": 0.12967944145202637,
+      "learning_rate": 0.0001722170542635659,
+      "loss": 0.7826,
+      "step": 458
+    },
+    {
+      "epoch": 0.1418856259659969,
+      "grad_norm": 0.12591594457626343,
+      "learning_rate": 0.00017215503875968994,
+      "loss": 0.9955,
+      "step": 459
+    },
+    {
+      "epoch": 0.14219474497681608,
+      "grad_norm": 0.11622080206871033,
+      "learning_rate": 0.00017209302325581395,
+      "loss": 0.8291,
+      "step": 460
+    },
+    {
+      "epoch": 0.14250386398763523,
+      "grad_norm": 0.12004160135984421,
+      "learning_rate": 0.000172031007751938,
+      "loss": 0.8015,
+      "step": 461
+    },
+    {
+      "epoch": 0.1428129829984544,
+      "grad_norm": 0.11545343697071075,
+      "learning_rate": 0.00017196899224806203,
+      "loss": 0.7617,
+      "step": 462
+    },
+    {
+      "epoch": 0.14312210200927358,
+      "grad_norm": 0.1136220321059227,
+      "learning_rate": 0.00017190697674418606,
+      "loss": 0.723,
+      "step": 463
+    },
+    {
+      "epoch": 0.14343122102009273,
+      "grad_norm": 0.11028563231229782,
+      "learning_rate": 0.00017184496124031008,
+      "loss": 0.8464,
+      "step": 464
+    },
+    {
+      "epoch": 0.1437403400309119,
+      "grad_norm": 0.10660995543003082,
+      "learning_rate": 0.00017178294573643412,
+      "loss": 0.7367,
+      "step": 465
+    },
+    {
+      "epoch": 0.14404945904173105,
+      "grad_norm": 0.10705665498971939,
+      "learning_rate": 0.00017172093023255815,
+      "loss": 0.6892,
+      "step": 466
+    },
+    {
+      "epoch": 0.14435857805255023,
+      "grad_norm": 0.124393992125988,
+      "learning_rate": 0.0001716589147286822,
+      "loss": 0.7663,
+      "step": 467
+    },
+    {
+      "epoch": 0.1446676970633694,
+      "grad_norm": 0.10380648076534271,
+      "learning_rate": 0.00017159689922480623,
+      "loss": 0.7851,
+      "step": 468
+    },
+    {
+      "epoch": 0.14497681607418855,
+      "grad_norm": 0.13513809442520142,
+      "learning_rate": 0.00017153488372093024,
+      "loss": 0.8207,
+      "step": 469
+    },
+    {
+      "epoch": 0.14528593508500773,
+      "grad_norm": 0.1310744434595108,
+      "learning_rate": 0.00017147286821705425,
+      "loss": 0.7328,
+      "step": 470
+    },
+    {
+      "epoch": 0.1455950540958269,
+      "grad_norm": 0.13068106770515442,
+      "learning_rate": 0.0001714108527131783,
+      "loss": 0.8289,
+      "step": 471
+    },
+    {
+      "epoch": 0.14590417310664605,
+      "grad_norm": 0.09564946591854095,
+      "learning_rate": 0.00017134883720930233,
+      "loss": 0.7596,
+      "step": 472
+    },
+    {
+      "epoch": 0.14621329211746523,
+      "grad_norm": 0.11033451557159424,
+      "learning_rate": 0.00017128682170542637,
+      "loss": 0.802,
+      "step": 473
+    },
+    {
+      "epoch": 0.1465224111282844,
+      "grad_norm": 0.11931835860013962,
+      "learning_rate": 0.00017122480620155038,
+      "loss": 0.7087,
+      "step": 474
+    },
+    {
+      "epoch": 0.14683153013910355,
+      "grad_norm": 0.12470009177923203,
+      "learning_rate": 0.00017116279069767442,
+      "loss": 0.7323,
+      "step": 475
+    },
+    {
+      "epoch": 0.14714064914992273,
+      "grad_norm": 0.1364419013261795,
+      "learning_rate": 0.00017110077519379846,
+      "loss": 0.7856,
+      "step": 476
+    },
+    {
+      "epoch": 0.14744976816074187,
+      "grad_norm": 0.12685492634773254,
+      "learning_rate": 0.0001710387596899225,
+      "loss": 0.8633,
+      "step": 477
+    },
+    {
+      "epoch": 0.14775888717156105,
+      "grad_norm": 0.11873108893632889,
+      "learning_rate": 0.0001709767441860465,
+      "loss": 0.7816,
+      "step": 478
+    },
+    {
+      "epoch": 0.14806800618238022,
+      "grad_norm": 0.12090124189853668,
+      "learning_rate": 0.00017091472868217055,
+      "loss": 0.9065,
+      "step": 479
+    },
+    {
+      "epoch": 0.14837712519319937,
+      "grad_norm": 0.11901501566171646,
+      "learning_rate": 0.0001708527131782946,
+      "loss": 0.8258,
+      "step": 480
+    },
+    {
+      "epoch": 0.14868624420401855,
+      "grad_norm": 0.11180437356233597,
+      "learning_rate": 0.00017079069767441863,
+      "loss": 0.7396,
+      "step": 481
+    },
+    {
+      "epoch": 0.14899536321483772,
+      "grad_norm": 0.16076162457466125,
+      "learning_rate": 0.00017072868217054267,
+      "loss": 0.7794,
+      "step": 482
+    },
+    {
+      "epoch": 0.14930448222565687,
+      "grad_norm": 0.13752448558807373,
+      "learning_rate": 0.00017066666666666668,
+      "loss": 0.6987,
+      "step": 483
+    },
+    {
+      "epoch": 0.14961360123647605,
+      "grad_norm": 0.11785899847745895,
+      "learning_rate": 0.0001706046511627907,
+      "loss": 0.8017,
+      "step": 484
+    },
+    {
+      "epoch": 0.14992272024729522,
+      "grad_norm": 0.11127035319805145,
+      "learning_rate": 0.00017054263565891473,
+      "loss": 0.7538,
+      "step": 485
+    },
+    {
+      "epoch": 0.15023183925811437,
+      "grad_norm": 0.12820091843605042,
+      "learning_rate": 0.00017048062015503877,
+      "loss": 0.7771,
+      "step": 486
+    },
+    {
+      "epoch": 0.15054095826893354,
+      "grad_norm": 0.13787533342838287,
+      "learning_rate": 0.0001704186046511628,
+      "loss": 0.789,
+      "step": 487
+    },
+    {
+      "epoch": 0.1508500772797527,
+      "grad_norm": 0.12819816172122955,
+      "learning_rate": 0.00017035658914728682,
+      "loss": 0.7542,
+      "step": 488
+    },
+    {
+      "epoch": 0.15115919629057187,
+      "grad_norm": 0.12091512233018875,
+      "learning_rate": 0.00017029457364341086,
+      "loss": 0.8161,
+      "step": 489
+    },
+    {
+      "epoch": 0.15146831530139104,
+      "grad_norm": 0.1256888210773468,
+      "learning_rate": 0.0001702325581395349,
+      "loss": 0.8272,
+      "step": 490
+    },
+    {
+      "epoch": 0.1517774343122102,
+      "grad_norm": 0.11789566278457642,
+      "learning_rate": 0.00017017054263565893,
+      "loss": 0.7079,
+      "step": 491
+    },
+    {
+      "epoch": 0.15208655332302937,
+      "grad_norm": 0.11957567185163498,
+      "learning_rate": 0.00017010852713178294,
+      "loss": 0.8352,
+      "step": 492
+    },
+    {
+      "epoch": 0.15239567233384854,
+      "grad_norm": 0.11315543204545975,
+      "learning_rate": 0.00017004651162790698,
+      "loss": 0.9455,
+      "step": 493
+    },
+    {
+      "epoch": 0.1527047913446677,
+      "grad_norm": 0.1091320812702179,
+      "learning_rate": 0.00016998449612403102,
+      "loss": 0.686,
+      "step": 494
+    },
+    {
+      "epoch": 0.15301391035548687,
+      "grad_norm": 0.11446017026901245,
+      "learning_rate": 0.00016992248062015506,
+      "loss": 0.8191,
+      "step": 495
+    },
+    {
+      "epoch": 0.15332302936630604,
+      "grad_norm": 0.11834724992513657,
+      "learning_rate": 0.0001698604651162791,
+      "loss": 0.8379,
+      "step": 496
+    },
+    {
+      "epoch": 0.1536321483771252,
+      "grad_norm": 0.12001053988933563,
+      "learning_rate": 0.0001697984496124031,
+      "loss": 0.7332,
+      "step": 497
+    },
+    {
+      "epoch": 0.15394126738794436,
+      "grad_norm": 0.11104556918144226,
+      "learning_rate": 0.00016973643410852712,
+      "loss": 0.8642,
+      "step": 498
+    },
+    {
+      "epoch": 0.1542503863987635,
+      "grad_norm": 0.10420899838209152,
+      "learning_rate": 0.00016967441860465116,
+      "loss": 0.8252,
+      "step": 499
+    },
+    {
+      "epoch": 0.1545595054095827,
+      "grad_norm": 0.1481151580810547,
+      "learning_rate": 0.0001696124031007752,
+      "loss": 0.7085,
+      "step": 500
+    },
+    {
+      "epoch": 0.15486862442040186,
+      "grad_norm": 0.13192850351333618,
+      "learning_rate": 0.00016955038759689924,
+      "loss": 0.8223,
+      "step": 501
+    },
+    {
+      "epoch": 0.155177743431221,
+      "grad_norm": 0.11016976088285446,
+      "learning_rate": 0.00016948837209302325,
+      "loss": 0.7833,
+      "step": 502
+    },
+    {
+      "epoch": 0.15548686244204019,
+      "grad_norm": 0.13597513735294342,
+      "learning_rate": 0.0001694263565891473,
+      "loss": 0.8608,
+      "step": 503
+    },
+    {
+      "epoch": 0.15579598145285936,
+      "grad_norm": 0.13814714550971985,
+      "learning_rate": 0.00016936434108527133,
+      "loss": 0.8234,
+      "step": 504
+    },
+    {
+      "epoch": 0.1561051004636785,
+      "grad_norm": 0.1129792258143425,
+      "learning_rate": 0.00016930232558139537,
+      "loss": 0.7622,
+      "step": 505
+    },
+    {
+      "epoch": 0.15641421947449768,
+      "grad_norm": 0.1326257288455963,
+      "learning_rate": 0.0001692403100775194,
+      "loss": 0.7745,
+      "step": 506
+    },
+    {
+      "epoch": 0.15672333848531686,
+      "grad_norm": 0.10894762724637985,
+      "learning_rate": 0.00016917829457364342,
+      "loss": 0.8396,
+      "step": 507
+    },
+    {
+      "epoch": 0.157032457496136,
+      "grad_norm": 0.10844721645116806,
+      "learning_rate": 0.00016911627906976746,
+      "loss": 0.6738,
+      "step": 508
+    },
+    {
+      "epoch": 0.15734157650695518,
+      "grad_norm": 0.12142128497362137,
+      "learning_rate": 0.0001690542635658915,
+      "loss": 0.8698,
+      "step": 509
+    },
+    {
+      "epoch": 0.15765069551777433,
+      "grad_norm": 0.12891779839992523,
+      "learning_rate": 0.0001689922480620155,
+      "loss": 0.7915,
+      "step": 510
+    },
+    {
+      "epoch": 0.1579598145285935,
+      "grad_norm": 0.1314953863620758,
+      "learning_rate": 0.00016893023255813955,
+      "loss": 0.8347,
+      "step": 511
+    },
+    {
+      "epoch": 0.15826893353941268,
+      "grad_norm": 0.12055188417434692,
+      "learning_rate": 0.00016886821705426356,
+      "loss": 0.9093,
+      "step": 512
+    },
+    {
+      "epoch": 0.15857805255023183,
+      "grad_norm": 0.12292719632387161,
+      "learning_rate": 0.0001688062015503876,
+      "loss": 0.8591,
+      "step": 513
+    },
+    {
+      "epoch": 0.158887171561051,
+      "grad_norm": 0.11341209709644318,
+      "learning_rate": 0.00016874418604651163,
+      "loss": 0.7724,
+      "step": 514
+    },
+    {
+      "epoch": 0.15919629057187018,
+      "grad_norm": 0.11800853163003922,
+      "learning_rate": 0.00016868217054263567,
+      "loss": 0.707,
+      "step": 515
+    },
+    {
+      "epoch": 0.15950540958268933,
+      "grad_norm": 0.14028507471084595,
+      "learning_rate": 0.00016862015503875968,
+      "loss": 0.7773,
+      "step": 516
+    },
+    {
+      "epoch": 0.1598145285935085,
+      "grad_norm": 0.11926918476819992,
+      "learning_rate": 0.00016855813953488372,
+      "loss": 0.8076,
+      "step": 517
+    },
+    {
+      "epoch": 0.16012364760432768,
+      "grad_norm": 0.11683503538370132,
+      "learning_rate": 0.00016849612403100776,
+      "loss": 0.7492,
+      "step": 518
+    },
+    {
+      "epoch": 0.16043276661514683,
+      "grad_norm": 0.14212507009506226,
+      "learning_rate": 0.0001684341085271318,
+      "loss": 0.7072,
+      "step": 519
+    },
+    {
+      "epoch": 0.160741885625966,
+      "grad_norm": 0.12642718851566315,
+      "learning_rate": 0.00016837209302325584,
+      "loss": 0.9201,
+      "step": 520
+    },
+    {
+      "epoch": 0.16105100463678515,
+      "grad_norm": 0.15104375779628754,
+      "learning_rate": 0.00016831007751937985,
+      "loss": 0.7333,
+      "step": 521
+    },
+    {
+      "epoch": 0.16136012364760433,
+      "grad_norm": 0.15067335963249207,
+      "learning_rate": 0.0001682480620155039,
+      "loss": 0.8267,
+      "step": 522
+    },
+    {
+      "epoch": 0.1616692426584235,
+      "grad_norm": 0.12420719116926193,
+      "learning_rate": 0.00016818604651162793,
+      "loss": 0.8489,
+      "step": 523
+    },
+    {
+      "epoch": 0.16197836166924265,
+      "grad_norm": 0.10997667163610458,
+      "learning_rate": 0.00016812403100775194,
+      "loss": 0.7615,
+      "step": 524
+    },
+    {
+      "epoch": 0.16228748068006182,
+      "grad_norm": 0.12284649908542633,
+      "learning_rate": 0.00016806201550387598,
+      "loss": 0.7435,
+      "step": 525
+    },
+    {
+      "epoch": 0.162596599690881,
+      "grad_norm": 0.10515284538269043,
+      "learning_rate": 0.000168,
+      "loss": 0.789,
+      "step": 526
+    },
+    {
+      "epoch": 0.16290571870170015,
+      "grad_norm": 0.12312375009059906,
+      "learning_rate": 0.00016793798449612403,
+      "loss": 0.8253,
+      "step": 527
+    },
+    {
+      "epoch": 0.16321483771251932,
+      "grad_norm": 0.10993171483278275,
+      "learning_rate": 0.00016787596899224807,
+      "loss": 0.7591,
+      "step": 528
+    },
+    {
+      "epoch": 0.1635239567233385,
+      "grad_norm": 0.11605069786310196,
+      "learning_rate": 0.0001678139534883721,
+      "loss": 0.8267,
+      "step": 529
+    },
+    {
+      "epoch": 0.16383307573415765,
+      "grad_norm": 0.12003269046545029,
+      "learning_rate": 0.00016775193798449615,
+      "loss": 0.8455,
+      "step": 530
+    },
+    {
+      "epoch": 0.16414219474497682,
+      "grad_norm": 0.12208808213472366,
+      "learning_rate": 0.00016768992248062016,
+      "loss": 0.8168,
+      "step": 531
+    },
+    {
+      "epoch": 0.16445131375579597,
+      "grad_norm": 0.12368449568748474,
+      "learning_rate": 0.0001676279069767442,
+      "loss": 0.8713,
+      "step": 532
+    },
+    {
+      "epoch": 0.16476043276661514,
+      "grad_norm": 0.12407387793064117,
+      "learning_rate": 0.00016756589147286823,
+      "loss": 0.7938,
+      "step": 533
+    },
+    {
+      "epoch": 0.16506955177743432,
+      "grad_norm": 0.12617334723472595,
+      "learning_rate": 0.00016750387596899227,
+      "loss": 0.8575,
+      "step": 534
+    },
+    {
+      "epoch": 0.16537867078825347,
+      "grad_norm": 0.15063488483428955,
+      "learning_rate": 0.00016744186046511629,
+      "loss": 0.9415,
+      "step": 535
+    },
+    {
+      "epoch": 0.16568778979907264,
+      "grad_norm": 0.12658260762691498,
+      "learning_rate": 0.00016737984496124032,
+      "loss": 0.85,
+      "step": 536
+    },
+    {
+      "epoch": 0.16599690880989182,
+      "grad_norm": 0.09913121163845062,
+      "learning_rate": 0.00016731782945736434,
+      "loss": 0.8092,
+      "step": 537
+    },
+    {
+      "epoch": 0.16630602782071097,
+      "grad_norm": 0.12728868424892426,
+      "learning_rate": 0.00016725581395348837,
+      "loss": 0.6869,
+      "step": 538
+    },
+    {
+      "epoch": 0.16661514683153014,
+      "grad_norm": 0.12953142821788788,
+      "learning_rate": 0.0001671937984496124,
+      "loss": 0.7474,
+      "step": 539
+    },
+    {
+      "epoch": 0.16692426584234932,
+      "grad_norm": 0.1168576180934906,
+      "learning_rate": 0.00016713178294573642,
+      "loss": 0.7697,
+      "step": 540
+    },
+    {
+      "epoch": 0.16723338485316847,
+      "grad_norm": 0.12081418931484222,
+      "learning_rate": 0.00016706976744186046,
+      "loss": 0.8952,
+      "step": 541
+    },
+    {
+      "epoch": 0.16754250386398764,
+      "grad_norm": 0.12843774259090424,
+      "learning_rate": 0.0001670077519379845,
+      "loss": 0.8786,
+      "step": 542
+    },
+    {
+      "epoch": 0.1678516228748068,
+      "grad_norm": 0.13334107398986816,
+      "learning_rate": 0.00016694573643410854,
+      "loss": 0.7732,
+      "step": 543
+    },
+    {
+      "epoch": 0.16816074188562596,
+      "grad_norm": 0.12304075807332993,
+      "learning_rate": 0.00016688372093023258,
+      "loss": 0.8681,
+      "step": 544
+    },
+    {
+      "epoch": 0.16846986089644514,
+      "grad_norm": 0.11800245940685272,
+      "learning_rate": 0.0001668217054263566,
+      "loss": 0.8226,
+      "step": 545
+    },
+    {
+      "epoch": 0.1687789799072643,
+      "grad_norm": 0.1358041614294052,
+      "learning_rate": 0.00016675968992248063,
+      "loss": 0.7346,
+      "step": 546
+    },
+    {
+      "epoch": 0.16908809891808346,
+      "grad_norm": 0.101251982152462,
+      "learning_rate": 0.00016669767441860467,
+      "loss": 0.7556,
+      "step": 547
+    },
+    {
+      "epoch": 0.16939721792890264,
+      "grad_norm": 0.15012463927268982,
+      "learning_rate": 0.0001666356589147287,
+      "loss": 0.7558,
+      "step": 548
+    },
+    {
+      "epoch": 0.16970633693972179,
+      "grad_norm": 0.11593160778284073,
+      "learning_rate": 0.00016657364341085272,
+      "loss": 0.7963,
+      "step": 549
+    },
+    {
+      "epoch": 0.17001545595054096,
+      "grad_norm": 0.11717840284109116,
+      "learning_rate": 0.00016651162790697673,
+      "loss": 0.8219,
+      "step": 550
+    },
+    {
+      "epoch": 0.17032457496136014,
+      "grad_norm": 0.12220215797424316,
+      "learning_rate": 0.00016644961240310077,
+      "loss": 0.8221,
+      "step": 551
+    },
+    {
+      "epoch": 0.17063369397217928,
+      "grad_norm": 0.1270114630460739,
+      "learning_rate": 0.0001663875968992248,
+      "loss": 0.7718,
+      "step": 552
+    },
+    {
+      "epoch": 0.17094281298299846,
+      "grad_norm": 0.11445185542106628,
+      "learning_rate": 0.00016632558139534885,
+      "loss": 0.8031,
+      "step": 553
+    },
+    {
+      "epoch": 0.1712519319938176,
+      "grad_norm": 0.1258378028869629,
+      "learning_rate": 0.00016626356589147286,
+      "loss": 0.8165,
+      "step": 554
+    },
+    {
+      "epoch": 0.17156105100463678,
+      "grad_norm": 0.12015929818153381,
+      "learning_rate": 0.0001662015503875969,
+      "loss": 0.7292,
+      "step": 555
+    },
+    {
+      "epoch": 0.17187017001545596,
+      "grad_norm": 0.12911571562290192,
+      "learning_rate": 0.00016613953488372094,
+      "loss": 0.7586,
+      "step": 556
+    },
+    {
+      "epoch": 0.1721792890262751,
+      "grad_norm": 0.11884018778800964,
+      "learning_rate": 0.00016607751937984497,
+      "loss": 0.816,
+      "step": 557
+    },
+    {
+      "epoch": 0.17248840803709428,
+      "grad_norm": 0.12241604179143906,
+      "learning_rate": 0.00016601550387596901,
+      "loss": 0.785,
+      "step": 558
+    },
+    {
+      "epoch": 0.17279752704791346,
+      "grad_norm": 0.1106485053896904,
+      "learning_rate": 0.00016595348837209303,
+      "loss": 0.8593,
+      "step": 559
+    },
+    {
+      "epoch": 0.1731066460587326,
+      "grad_norm": 0.13855457305908203,
+      "learning_rate": 0.00016589147286821706,
+      "loss": 0.8647,
+      "step": 560
+    },
+    {
+      "epoch": 0.17341576506955178,
+      "grad_norm": 0.12671570479869843,
+      "learning_rate": 0.0001658294573643411,
+      "loss": 0.7475,
+      "step": 561
+    },
+    {
+      "epoch": 0.17372488408037096,
+      "grad_norm": 0.10744766145944595,
+      "learning_rate": 0.00016576744186046514,
+      "loss": 0.6681,
+      "step": 562
+    },
+    {
+      "epoch": 0.1740340030911901,
+      "grad_norm": 0.15949758887290955,
+      "learning_rate": 0.00016570542635658915,
+      "loss": 0.7603,
+      "step": 563
+    },
+    {
+      "epoch": 0.17434312210200928,
+      "grad_norm": 0.14625856280326843,
+      "learning_rate": 0.00016564341085271316,
+      "loss": 0.9034,
+      "step": 564
+    },
+    {
+      "epoch": 0.17465224111282843,
+      "grad_norm": 0.10362540185451508,
+      "learning_rate": 0.0001655813953488372,
+      "loss": 0.8991,
+      "step": 565
+    },
+    {
+      "epoch": 0.1749613601236476,
+      "grad_norm": 0.1137462630867958,
+      "learning_rate": 0.00016551937984496124,
+      "loss": 0.8242,
+      "step": 566
+    },
+    {
+      "epoch": 0.17527047913446678,
+      "grad_norm": 0.12184014916419983,
+      "learning_rate": 0.00016545736434108528,
+      "loss": 0.7607,
+      "step": 567
+    },
+    {
+      "epoch": 0.17557959814528593,
+      "grad_norm": 0.1274954229593277,
+      "learning_rate": 0.00016539534883720932,
+      "loss": 0.7599,
+      "step": 568
+    },
+    {
+      "epoch": 0.1758887171561051,
+      "grad_norm": 0.11610583961009979,
+      "learning_rate": 0.00016533333333333333,
+      "loss": 0.8305,
+      "step": 569
+    },
+    {
+      "epoch": 0.17619783616692428,
+      "grad_norm": 0.1202671155333519,
+      "learning_rate": 0.00016527131782945737,
+      "loss": 0.8162,
+      "step": 570
+    },
+    {
+      "epoch": 0.17650695517774342,
+      "grad_norm": 0.1324416995048523,
+      "learning_rate": 0.0001652093023255814,
+      "loss": 0.808,
+      "step": 571
+    },
+    {
+      "epoch": 0.1768160741885626,
+      "grad_norm": 0.11402853578329086,
+      "learning_rate": 0.00016514728682170545,
+      "loss": 0.856,
+      "step": 572
+    },
+    {
+      "epoch": 0.17712519319938178,
+      "grad_norm": 0.11911406368017197,
+      "learning_rate": 0.00016508527131782946,
+      "loss": 0.7992,
+      "step": 573
+    },
+    {
+      "epoch": 0.17743431221020092,
+      "grad_norm": 0.13559581339359283,
+      "learning_rate": 0.0001650232558139535,
+      "loss": 0.8584,
+      "step": 574
+    },
+    {
+      "epoch": 0.1777434312210201,
+      "grad_norm": 0.12889279425144196,
+      "learning_rate": 0.00016496124031007754,
+      "loss": 0.8382,
+      "step": 575
+    },
+    {
+      "epoch": 0.17805255023183925,
+      "grad_norm": 0.12666314840316772,
+      "learning_rate": 0.00016489922480620158,
+      "loss": 0.8406,
+      "step": 576
+    },
+    {
+      "epoch": 0.17836166924265842,
+      "grad_norm": 0.12953141331672668,
+      "learning_rate": 0.0001648372093023256,
+      "loss": 0.7878,
+      "step": 577
+    },
+    {
+      "epoch": 0.1786707882534776,
+      "grad_norm": 0.10341861099004745,
+      "learning_rate": 0.0001647751937984496,
+      "loss": 0.8802,
+      "step": 578
+    },
+    {
+      "epoch": 0.17897990726429674,
+      "grad_norm": 0.11049186438322067,
+      "learning_rate": 0.00016471317829457364,
+      "loss": 0.7669,
+      "step": 579
+    },
+    {
+      "epoch": 0.17928902627511592,
+      "grad_norm": 0.11524353176355362,
+      "learning_rate": 0.00016465116279069768,
+      "loss": 0.7958,
+      "step": 580
+    },
+    {
+      "epoch": 0.1795981452859351,
+      "grad_norm": 0.12205459177494049,
+      "learning_rate": 0.00016458914728682172,
+      "loss": 0.6768,
+      "step": 581
+    },
+    {
+      "epoch": 0.17990726429675424,
+      "grad_norm": 0.10032919049263,
+      "learning_rate": 0.00016452713178294575,
+      "loss": 0.8596,
+      "step": 582
+    },
+    {
+      "epoch": 0.18021638330757342,
+      "grad_norm": 0.10666303336620331,
+      "learning_rate": 0.00016446511627906977,
+      "loss": 0.8106,
+      "step": 583
+    },
+    {
+      "epoch": 0.1805255023183926,
+      "grad_norm": 0.09989852458238602,
+      "learning_rate": 0.0001644031007751938,
+      "loss": 0.7455,
+      "step": 584
+    },
+    {
+      "epoch": 0.18083462132921174,
+      "grad_norm": 0.11961805075407028,
+      "learning_rate": 0.00016434108527131784,
+      "loss": 0.8289,
+      "step": 585
+    },
+    {
+      "epoch": 0.18114374034003092,
+      "grad_norm": 0.13421611487865448,
+      "learning_rate": 0.00016427906976744188,
+      "loss": 0.6841,
+      "step": 586
+    },
+    {
+      "epoch": 0.18145285935085007,
+      "grad_norm": 0.11855993419885635,
+      "learning_rate": 0.0001642170542635659,
+      "loss": 0.8299,
+      "step": 587
+    },
+    {
+      "epoch": 0.18176197836166924,
+      "grad_norm": 0.11767081916332245,
+      "learning_rate": 0.00016415503875968993,
+      "loss": 0.7878,
+      "step": 588
+    },
+    {
+      "epoch": 0.18207109737248842,
+      "grad_norm": 0.12189806997776031,
+      "learning_rate": 0.00016409302325581397,
+      "loss": 0.863,
+      "step": 589
+    },
+    {
+      "epoch": 0.18238021638330756,
+      "grad_norm": 0.09818772226572037,
+      "learning_rate": 0.00016403100775193798,
+      "loss": 0.7039,
+      "step": 590
+    },
+    {
+      "epoch": 0.18268933539412674,
+      "grad_norm": 0.13206005096435547,
+      "learning_rate": 0.00016396899224806202,
+      "loss": 0.7623,
+      "step": 591
+    },
+    {
+      "epoch": 0.18299845440494591,
+      "grad_norm": 0.12963028252124786,
+      "learning_rate": 0.00016390697674418606,
+      "loss": 0.7811,
+      "step": 592
+    },
+    {
+      "epoch": 0.18330757341576506,
+      "grad_norm": 0.11753853410482407,
+      "learning_rate": 0.00016384496124031007,
+      "loss": 0.8228,
+      "step": 593
+    },
+    {
+      "epoch": 0.18361669242658424,
+      "grad_norm": 0.10470208525657654,
+      "learning_rate": 0.0001637829457364341,
+      "loss": 0.8566,
+      "step": 594
+    },
+    {
+      "epoch": 0.1839258114374034,
+      "grad_norm": 0.10645218938589096,
+      "learning_rate": 0.00016372093023255815,
+      "loss": 0.7536,
+      "step": 595
+    },
+    {
+      "epoch": 0.18423493044822256,
+      "grad_norm": 0.11461575329303741,
+      "learning_rate": 0.0001636589147286822,
+      "loss": 0.7328,
+      "step": 596
+    },
+    {
+      "epoch": 0.18454404945904174,
+      "grad_norm": 0.1035584807395935,
+      "learning_rate": 0.0001635968992248062,
+      "loss": 0.7841,
+      "step": 597
+    },
+    {
+      "epoch": 0.18485316846986088,
+      "grad_norm": 0.12418399751186371,
+      "learning_rate": 0.00016353488372093024,
+      "loss": 0.7869,
+      "step": 598
+    },
+    {
+      "epoch": 0.18516228748068006,
+      "grad_norm": 0.12286582589149475,
+      "learning_rate": 0.00016347286821705428,
+      "loss": 0.8917,
+      "step": 599
+    },
+    {
+      "epoch": 0.18547140649149924,
+      "grad_norm": 0.11596380174160004,
+      "learning_rate": 0.00016341085271317832,
+      "loss": 0.7949,
+      "step": 600
+    },
+    {
+      "epoch": 0.18578052550231838,
+      "grad_norm": 0.12381494790315628,
+      "learning_rate": 0.00016334883720930235,
+      "loss": 0.8331,
+      "step": 601
+    },
+    {
+      "epoch": 0.18608964451313756,
+      "grad_norm": 0.12967997789382935,
+      "learning_rate": 0.00016328682170542637,
+      "loss": 0.8003,
+      "step": 602
+    },
+    {
+      "epoch": 0.18639876352395673,
+      "grad_norm": 0.11383350193500519,
+      "learning_rate": 0.0001632248062015504,
+      "loss": 0.7191,
+      "step": 603
+    },
+    {
+      "epoch": 0.18670788253477588,
+      "grad_norm": 0.12088557332754135,
+      "learning_rate": 0.00016316279069767442,
+      "loss": 0.772,
+      "step": 604
+    },
+    {
+      "epoch": 0.18701700154559506,
+      "grad_norm": 0.1383604258298874,
+      "learning_rate": 0.00016310077519379846,
+      "loss": 0.8449,
+      "step": 605
+    },
+    {
+      "epoch": 0.18732612055641423,
+      "grad_norm": 0.13077442348003387,
+      "learning_rate": 0.0001630387596899225,
+      "loss": 0.817,
+      "step": 606
+    },
+    {
+      "epoch": 0.18763523956723338,
+      "grad_norm": 0.12592090666294098,
+      "learning_rate": 0.0001629767441860465,
+      "loss": 0.8615,
+      "step": 607
+    },
+    {
+      "epoch": 0.18794435857805256,
+      "grad_norm": 0.11555439233779907,
+      "learning_rate": 0.00016291472868217054,
+      "loss": 0.7587,
+      "step": 608
+    },
+    {
+      "epoch": 0.1882534775888717,
+      "grad_norm": 0.12637798488140106,
+      "learning_rate": 0.00016285271317829458,
+      "loss": 0.7867,
+      "step": 609
+    },
+    {
+      "epoch": 0.18856259659969088,
+      "grad_norm": 0.123162180185318,
+      "learning_rate": 0.00016279069767441862,
+      "loss": 0.8271,
+      "step": 610
+    },
+    {
+      "epoch": 0.18887171561051005,
+      "grad_norm": 0.133504718542099,
+      "learning_rate": 0.00016272868217054263,
+      "loss": 0.8287,
+      "step": 611
+    },
+    {
+      "epoch": 0.1891808346213292,
+      "grad_norm": 0.13327136635780334,
+      "learning_rate": 0.00016266666666666667,
+      "loss": 0.8069,
+      "step": 612
+    },
+    {
+      "epoch": 0.18948995363214838,
+      "grad_norm": 0.13675393164157867,
+      "learning_rate": 0.0001626046511627907,
+      "loss": 0.7992,
+      "step": 613
+    },
+    {
+      "epoch": 0.18979907264296755,
+      "grad_norm": 0.12681642174720764,
+      "learning_rate": 0.00016254263565891475,
+      "loss": 0.8107,
+      "step": 614
+    },
+    {
+      "epoch": 0.1901081916537867,
+      "grad_norm": 0.11314484477043152,
+      "learning_rate": 0.0001624806201550388,
+      "loss": 0.8308,
+      "step": 615
+    },
+    {
+      "epoch": 0.19041731066460588,
+      "grad_norm": 0.11311525851488113,
+      "learning_rate": 0.0001624186046511628,
+      "loss": 0.8193,
+      "step": 616
+    },
+    {
+      "epoch": 0.19072642967542505,
+      "grad_norm": 0.10104167461395264,
+      "learning_rate": 0.0001623565891472868,
+      "loss": 0.8109,
+      "step": 617
+    },
+    {
+      "epoch": 0.1910355486862442,
+      "grad_norm": 0.1340848058462143,
+      "learning_rate": 0.00016229457364341085,
+      "loss": 0.7129,
+      "step": 618
+    },
+    {
+      "epoch": 0.19134466769706338,
+      "grad_norm": 0.1362898200750351,
+      "learning_rate": 0.0001622325581395349,
+      "loss": 0.7536,
+      "step": 619
+    },
+    {
+      "epoch": 0.19165378670788252,
+      "grad_norm": 0.13276411592960358,
+      "learning_rate": 0.00016217054263565893,
+      "loss": 0.8628,
+      "step": 620
+    },
+    {
+      "epoch": 0.1919629057187017,
+      "grad_norm": 0.11686565726995468,
+      "learning_rate": 0.00016210852713178294,
+      "loss": 0.8807,
+      "step": 621
+    },
+    {
+      "epoch": 0.19227202472952087,
+      "grad_norm": 0.12405114620923996,
+      "learning_rate": 0.00016204651162790698,
+      "loss": 0.747,
+      "step": 622
+    },
+    {
+      "epoch": 0.19258114374034002,
+      "grad_norm": 0.12308946251869202,
+      "learning_rate": 0.00016198449612403102,
+      "loss": 0.7715,
+      "step": 623
+    },
+    {
+      "epoch": 0.1928902627511592,
+      "grad_norm": 0.14180584251880646,
+      "learning_rate": 0.00016192248062015506,
+      "loss": 0.9483,
+      "step": 624
+    },
+    {
+      "epoch": 0.19319938176197837,
+      "grad_norm": 0.1168755367398262,
+      "learning_rate": 0.00016186046511627907,
+      "loss": 0.7669,
+      "step": 625
+    },
+    {
+      "epoch": 0.19350850077279752,
+      "grad_norm": 0.11601582914590836,
+      "learning_rate": 0.0001617984496124031,
+      "loss": 0.8361,
+      "step": 626
+    },
+    {
+      "epoch": 0.1938176197836167,
+      "grad_norm": 0.11746050417423248,
+      "learning_rate": 0.00016173643410852715,
+      "loss": 0.844,
+      "step": 627
+    },
+    {
+      "epoch": 0.19412673879443587,
+      "grad_norm": 0.11663764715194702,
+      "learning_rate": 0.00016167441860465118,
+      "loss": 0.8162,
+      "step": 628
+    },
+    {
+      "epoch": 0.19443585780525502,
+      "grad_norm": 0.1258978247642517,
+      "learning_rate": 0.00016161240310077522,
+      "loss": 0.7729,
+      "step": 629
+    },
+    {
+      "epoch": 0.1947449768160742,
+      "grad_norm": 0.10496451705694199,
+      "learning_rate": 0.00016155038759689923,
+      "loss": 0.7739,
+      "step": 630
+    },
+    {
+      "epoch": 0.19505409582689334,
+      "grad_norm": 0.1313450038433075,
+      "learning_rate": 0.00016148837209302325,
+      "loss": 0.8157,
+      "step": 631
+    },
+    {
+      "epoch": 0.19536321483771252,
+      "grad_norm": 0.13735109567642212,
+      "learning_rate": 0.00016142635658914728,
+      "loss": 0.8494,
+      "step": 632
+    },
+    {
+      "epoch": 0.1956723338485317,
+      "grad_norm": 0.11428606510162354,
+      "learning_rate": 0.00016136434108527132,
+      "loss": 0.7957,
+      "step": 633
+    },
+    {
+      "epoch": 0.19598145285935084,
+      "grad_norm": 0.10336098074913025,
+      "learning_rate": 0.00016130232558139536,
+      "loss": 0.7244,
+      "step": 634
+    },
+    {
+      "epoch": 0.19629057187017002,
+      "grad_norm": 0.12851019203662872,
+      "learning_rate": 0.00016124031007751937,
+      "loss": 0.7908,
+      "step": 635
+    },
+    {
+      "epoch": 0.1965996908809892,
+      "grad_norm": 0.11097298562526703,
+      "learning_rate": 0.0001611782945736434,
+      "loss": 0.7819,
+      "step": 636
+    },
+    {
+      "epoch": 0.19690880989180834,
+      "grad_norm": 0.10702291131019592,
+      "learning_rate": 0.00016111627906976745,
+      "loss": 0.9083,
+      "step": 637
+    },
+    {
+      "epoch": 0.19721792890262752,
+      "grad_norm": 0.1185348853468895,
+      "learning_rate": 0.0001610542635658915,
+      "loss": 0.8115,
+      "step": 638
+    },
+    {
+      "epoch": 0.1975270479134467,
+      "grad_norm": 0.12385392189025879,
+      "learning_rate": 0.00016099224806201553,
+      "loss": 0.847,
+      "step": 639
+    },
+    {
+      "epoch": 0.19783616692426584,
+      "grad_norm": 0.13237705826759338,
+      "learning_rate": 0.00016093023255813954,
+      "loss": 0.7401,
+      "step": 640
+    },
+    {
+      "epoch": 0.198145285935085,
+      "grad_norm": 0.11597932875156403,
+      "learning_rate": 0.00016086821705426358,
+      "loss": 0.7639,
+      "step": 641
+    },
+    {
+      "epoch": 0.19845440494590416,
+      "grad_norm": 0.12065674364566803,
+      "learning_rate": 0.00016080620155038762,
+      "loss": 0.7425,
+      "step": 642
+    },
+    {
+      "epoch": 0.19876352395672334,
+      "grad_norm": 0.10582825541496277,
+      "learning_rate": 0.00016074418604651166,
+      "loss": 0.8191,
+      "step": 643
+    },
+    {
+      "epoch": 0.1990726429675425,
+      "grad_norm": 0.1253654509782791,
+      "learning_rate": 0.00016068217054263567,
+      "loss": 0.7928,
+      "step": 644
+    },
+    {
+      "epoch": 0.19938176197836166,
+      "grad_norm": 0.13197046518325806,
+      "learning_rate": 0.00016062015503875968,
+      "loss": 0.7402,
+      "step": 645
+    },
+    {
+      "epoch": 0.19969088098918084,
+      "grad_norm": 0.12224183231592178,
+      "learning_rate": 0.00016055813953488372,
+      "loss": 0.6898,
+      "step": 646
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.13466641306877136,
+      "learning_rate": 0.00016049612403100776,
+      "loss": 0.814,
+      "step": 647
+    },
+    {
+      "epoch": 0.20030911901081916,
+      "grad_norm": 0.10830813646316528,
+      "learning_rate": 0.0001604341085271318,
+      "loss": 0.8274,
+      "step": 648
+    },
+    {
+      "epoch": 0.20061823802163833,
+      "grad_norm": 0.1332327425479889,
+      "learning_rate": 0.0001603720930232558,
+      "loss": 0.8042,
+      "step": 649
+    },
+    {
+      "epoch": 0.2009273570324575,
+      "grad_norm": 0.1263049840927124,
+      "learning_rate": 0.00016031007751937985,
+      "loss": 0.8042,
+      "step": 650
+    },
+    {
+      "epoch": 0.20123647604327666,
+      "grad_norm": 0.13426467776298523,
+      "learning_rate": 0.00016024806201550389,
+      "loss": 0.8429,
+      "step": 651
+    },
+    {
+      "epoch": 0.20154559505409583,
+      "grad_norm": 0.1457086056470871,
+      "learning_rate": 0.00016018604651162792,
+      "loss": 0.8961,
+      "step": 652
+    },
+    {
+      "epoch": 0.20185471406491498,
+      "grad_norm": 0.11996602267026901,
+      "learning_rate": 0.00016012403100775196,
+      "loss": 0.7045,
+      "step": 653
+    },
+    {
+      "epoch": 0.20216383307573416,
+      "grad_norm": 0.10409342497587204,
+      "learning_rate": 0.00016006201550387597,
+      "loss": 0.6763,
+      "step": 654
+    },
+    {
+      "epoch": 0.20247295208655333,
+      "grad_norm": 0.11956805735826492,
+      "learning_rate": 0.00016,
+      "loss": 0.8169,
+      "step": 655
+    },
+    {
+      "epoch": 0.20278207109737248,
+      "grad_norm": 0.11860910803079605,
+      "learning_rate": 0.00015993798449612405,
+      "loss": 0.74,
+      "step": 656
+    },
+    {
+      "epoch": 0.20309119010819165,
+      "grad_norm": 0.12545433640480042,
+      "learning_rate": 0.00015987596899224806,
+      "loss": 0.8134,
+      "step": 657
+    },
+    {
+      "epoch": 0.20340030911901083,
+      "grad_norm": 0.10243546217679977,
+      "learning_rate": 0.0001598139534883721,
+      "loss": 0.7597,
+      "step": 658
+    },
+    {
+      "epoch": 0.20370942812982998,
+      "grad_norm": 0.11689910292625427,
+      "learning_rate": 0.00015975193798449611,
+      "loss": 0.7734,
+      "step": 659
+    },
+    {
+      "epoch": 0.20401854714064915,
+      "grad_norm": 0.12140754610300064,
+      "learning_rate": 0.00015968992248062015,
+      "loss": 0.8014,
+      "step": 660
+    },
+    {
+      "epoch": 0.20432766615146833,
+      "grad_norm": 0.11417256295681,
+      "learning_rate": 0.0001596279069767442,
+      "loss": 0.8837,
+      "step": 661
+    },
+    {
+      "epoch": 0.20463678516228748,
+      "grad_norm": 0.16317220032215118,
+      "learning_rate": 0.00015956589147286823,
+      "loss": 0.894,
+      "step": 662
+    },
+    {
+      "epoch": 0.20494590417310665,
+      "grad_norm": 0.1096215769648552,
+      "learning_rate": 0.00015950387596899227,
+      "loss": 0.6989,
+      "step": 663
+    },
+    {
+      "epoch": 0.2052550231839258,
+      "grad_norm": 0.11856718361377716,
+      "learning_rate": 0.00015944186046511628,
+      "loss": 0.8425,
+      "step": 664
+    },
+    {
+      "epoch": 0.20556414219474498,
+      "grad_norm": 0.11110817641019821,
+      "learning_rate": 0.00015937984496124032,
+      "loss": 0.8838,
+      "step": 665
+    },
+    {
+      "epoch": 0.20587326120556415,
+      "grad_norm": 0.1287812888622284,
+      "learning_rate": 0.00015931782945736436,
+      "loss": 0.7789,
+      "step": 666
+    },
+    {
+      "epoch": 0.2061823802163833,
+      "grad_norm": 0.0988534688949585,
+      "learning_rate": 0.0001592558139534884,
+      "loss": 0.7962,
+      "step": 667
+    },
+    {
+      "epoch": 0.20649149922720247,
+      "grad_norm": 0.12815728783607483,
+      "learning_rate": 0.0001591937984496124,
+      "loss": 0.6849,
+      "step": 668
+    },
+    {
+      "epoch": 0.20680061823802165,
+      "grad_norm": 0.12397190928459167,
+      "learning_rate": 0.00015913178294573645,
+      "loss": 0.771,
+      "step": 669
+    },
+    {
+      "epoch": 0.2071097372488408,
+      "grad_norm": 0.13357098400592804,
+      "learning_rate": 0.00015906976744186046,
+      "loss": 0.9005,
+      "step": 670
+    },
+    {
+      "epoch": 0.20741885625965997,
+      "grad_norm": 0.10197357833385468,
+      "learning_rate": 0.0001590077519379845,
+      "loss": 0.7802,
+      "step": 671
+    },
+    {
+      "epoch": 0.20772797527047915,
+      "grad_norm": 0.12450321763753891,
+      "learning_rate": 0.00015894573643410854,
+      "loss": 0.7157,
+      "step": 672
+    },
+    {
+      "epoch": 0.2080370942812983,
+      "grad_norm": 0.13183358311653137,
+      "learning_rate": 0.00015888372093023255,
+      "loss": 0.8082,
+      "step": 673
+    },
+    {
+      "epoch": 0.20834621329211747,
+      "grad_norm": 0.12288426607847214,
+      "learning_rate": 0.0001588217054263566,
+      "loss": 0.8119,
+      "step": 674
+    },
+    {
+      "epoch": 0.20865533230293662,
+      "grad_norm": 0.11869386583566666,
+      "learning_rate": 0.00015875968992248063,
+      "loss": 0.7948,
+      "step": 675
+    },
+    {
+      "epoch": 0.2089644513137558,
+      "grad_norm": 0.11692944169044495,
+      "learning_rate": 0.00015869767441860466,
+      "loss": 0.7011,
+      "step": 676
+    },
+    {
+      "epoch": 0.20927357032457497,
+      "grad_norm": 0.11799801886081696,
+      "learning_rate": 0.0001586356589147287,
+      "loss": 0.9002,
+      "step": 677
+    },
+    {
+      "epoch": 0.20958268933539412,
+      "grad_norm": 0.14463946223258972,
+      "learning_rate": 0.00015857364341085271,
+      "loss": 0.7562,
+      "step": 678
+    },
+    {
+      "epoch": 0.2098918083462133,
+      "grad_norm": 0.14642862975597382,
+      "learning_rate": 0.00015851162790697675,
+      "loss": 0.6619,
+      "step": 679
+    },
+    {
+      "epoch": 0.21020092735703247,
+      "grad_norm": 0.11202511936426163,
+      "learning_rate": 0.0001584496124031008,
+      "loss": 0.6871,
+      "step": 680
+    },
+    {
+      "epoch": 0.21051004636785162,
+      "grad_norm": 0.12495609372854233,
+      "learning_rate": 0.00015838759689922483,
+      "loss": 0.8702,
+      "step": 681
+    },
+    {
+      "epoch": 0.2108191653786708,
+      "grad_norm": 0.1246587410569191,
+      "learning_rate": 0.00015832558139534884,
+      "loss": 0.7562,
+      "step": 682
+    },
+    {
+      "epoch": 0.21112828438948997,
+      "grad_norm": 0.11207133531570435,
+      "learning_rate": 0.00015826356589147285,
+      "loss": 0.7948,
+      "step": 683
+    },
+    {
+      "epoch": 0.21143740340030912,
+      "grad_norm": 0.10788938403129578,
+      "learning_rate": 0.0001582015503875969,
+      "loss": 0.7807,
+      "step": 684
+    },
+    {
+      "epoch": 0.2117465224111283,
+      "grad_norm": 0.12020035088062286,
+      "learning_rate": 0.00015813953488372093,
+      "loss": 0.8755,
+      "step": 685
+    },
+    {
+      "epoch": 0.21205564142194744,
+      "grad_norm": 0.13727609813213348,
+      "learning_rate": 0.00015807751937984497,
+      "loss": 0.9059,
+      "step": 686
+    },
+    {
+      "epoch": 0.2123647604327666,
+      "grad_norm": 0.1516505777835846,
+      "learning_rate": 0.00015801550387596898,
+      "loss": 0.8297,
+      "step": 687
+    },
+    {
+      "epoch": 0.2126738794435858,
+      "grad_norm": 0.13136766850948334,
+      "learning_rate": 0.00015795348837209302,
+      "loss": 0.8096,
+      "step": 688
+    },
+    {
+      "epoch": 0.21298299845440494,
+      "grad_norm": 0.11303882300853729,
+      "learning_rate": 0.00015789147286821706,
+      "loss": 0.7967,
+      "step": 689
+    },
+    {
+      "epoch": 0.2132921174652241,
+      "grad_norm": 0.13250216841697693,
+      "learning_rate": 0.0001578294573643411,
+      "loss": 0.8846,
+      "step": 690
+    },
+    {
+      "epoch": 0.2136012364760433,
+      "grad_norm": 0.125604048371315,
+      "learning_rate": 0.00015776744186046514,
+      "loss": 0.8468,
+      "step": 691
+    },
+    {
+      "epoch": 0.21391035548686244,
+      "grad_norm": 0.11047331243753433,
+      "learning_rate": 0.00015770542635658915,
+      "loss": 0.7068,
+      "step": 692
+    },
+    {
+      "epoch": 0.2142194744976816,
+      "grad_norm": 0.12153135985136032,
+      "learning_rate": 0.0001576434108527132,
+      "loss": 0.7166,
+      "step": 693
+    },
+    {
+      "epoch": 0.21452859350850076,
+      "grad_norm": 0.11161539703607559,
+      "learning_rate": 0.00015758139534883723,
+      "loss": 0.8544,
+      "step": 694
+    },
+    {
+      "epoch": 0.21483771251931993,
+      "grad_norm": 0.11793739348649979,
+      "learning_rate": 0.00015751937984496126,
+      "loss": 0.8196,
+      "step": 695
+    },
+    {
+      "epoch": 0.2151468315301391,
+      "grad_norm": 0.12710191309452057,
+      "learning_rate": 0.00015745736434108528,
+      "loss": 0.8249,
+      "step": 696
+    },
+    {
+      "epoch": 0.21545595054095826,
+      "grad_norm": 0.12436945736408234,
+      "learning_rate": 0.0001573953488372093,
+      "loss": 0.7147,
+      "step": 697
+    },
+    {
+      "epoch": 0.21576506955177743,
+      "grad_norm": 0.10970059782266617,
+      "learning_rate": 0.00015733333333333333,
+      "loss": 0.8297,
+      "step": 698
+    },
+    {
+      "epoch": 0.2160741885625966,
+      "grad_norm": 0.1195238009095192,
+      "learning_rate": 0.00015727131782945737,
+      "loss": 0.7347,
+      "step": 699
+    },
+    {
+      "epoch": 0.21638330757341576,
+      "grad_norm": 0.11942408978939056,
+      "learning_rate": 0.0001572093023255814,
+      "loss": 0.7825,
+      "step": 700
+    },
+    {
+      "epoch": 0.21669242658423493,
+      "grad_norm": 0.11630623042583466,
+      "learning_rate": 0.00015714728682170544,
+      "loss": 0.8442,
+      "step": 701
+    },
+    {
+      "epoch": 0.2170015455950541,
+      "grad_norm": 0.12051951140165329,
+      "learning_rate": 0.00015708527131782945,
+      "loss": 0.6505,
+      "step": 702
+    },
+    {
+      "epoch": 0.21731066460587325,
+      "grad_norm": 0.12351769208908081,
+      "learning_rate": 0.0001570232558139535,
+      "loss": 0.8298,
+      "step": 703
+    },
+    {
+      "epoch": 0.21761978361669243,
+      "grad_norm": 0.11859402805566788,
+      "learning_rate": 0.00015696124031007753,
+      "loss": 0.6916,
+      "step": 704
+    },
+    {
+      "epoch": 0.21792890262751158,
+      "grad_norm": 0.13090530037879944,
+      "learning_rate": 0.00015689922480620157,
+      "loss": 0.9438,
+      "step": 705
+    },
+    {
+      "epoch": 0.21823802163833075,
+      "grad_norm": 0.12880273163318634,
+      "learning_rate": 0.00015683720930232558,
+      "loss": 0.8416,
+      "step": 706
+    },
+    {
+      "epoch": 0.21854714064914993,
+      "grad_norm": 0.12142153829336166,
+      "learning_rate": 0.00015677519379844962,
+      "loss": 0.7073,
+      "step": 707
+    },
+    {
+      "epoch": 0.21885625965996908,
+      "grad_norm": 0.12591804563999176,
+      "learning_rate": 0.00015671317829457366,
+      "loss": 0.7311,
+      "step": 708
+    },
+    {
+      "epoch": 0.21916537867078825,
+      "grad_norm": 0.1107344701886177,
+      "learning_rate": 0.0001566511627906977,
+      "loss": 0.6529,
+      "step": 709
+    },
+    {
+      "epoch": 0.21947449768160743,
+      "grad_norm": 0.10950338840484619,
+      "learning_rate": 0.0001565891472868217,
+      "loss": 0.6766,
+      "step": 710
+    },
+    {
+      "epoch": 0.21978361669242658,
+      "grad_norm": 0.1164025217294693,
+      "learning_rate": 0.00015652713178294572,
+      "loss": 0.7021,
+      "step": 711
+    },
+    {
+      "epoch": 0.22009273570324575,
+      "grad_norm": 0.13531538844108582,
+      "learning_rate": 0.00015646511627906976,
+      "loss": 0.739,
+      "step": 712
+    },
+    {
+      "epoch": 0.22040185471406493,
+      "grad_norm": 0.12463265657424927,
+      "learning_rate": 0.0001564031007751938,
+      "loss": 0.7832,
+      "step": 713
+    },
+    {
+      "epoch": 0.22071097372488407,
+      "grad_norm": 0.13127584755420685,
+      "learning_rate": 0.00015634108527131784,
+      "loss": 0.7652,
+      "step": 714
+    },
+    {
+      "epoch": 0.22102009273570325,
+      "grad_norm": 0.13645778596401215,
+      "learning_rate": 0.00015627906976744188,
+      "loss": 0.8023,
+      "step": 715
+    },
+    {
+      "epoch": 0.2213292117465224,
+      "grad_norm": 0.12551645934581757,
+      "learning_rate": 0.0001562170542635659,
+      "loss": 0.7572,
+      "step": 716
+    },
+    {
+      "epoch": 0.22163833075734157,
+      "grad_norm": 0.1264384686946869,
+      "learning_rate": 0.00015615503875968993,
+      "loss": 0.7972,
+      "step": 717
+    },
+    {
+      "epoch": 0.22194744976816075,
+      "grad_norm": 0.12361495941877365,
+      "learning_rate": 0.00015609302325581397,
+      "loss": 0.7932,
+      "step": 718
+    },
+    {
+      "epoch": 0.2222565687789799,
+      "grad_norm": 0.12109937518835068,
+      "learning_rate": 0.000156031007751938,
+      "loss": 0.7977,
+      "step": 719
+    },
+    {
+      "epoch": 0.22256568778979907,
+      "grad_norm": 0.11111821234226227,
+      "learning_rate": 0.00015596899224806202,
+      "loss": 0.7563,
+      "step": 720
+    },
+    {
+      "epoch": 0.22287480680061825,
+      "grad_norm": 0.11457593739032745,
+      "learning_rate": 0.00015590697674418606,
+      "loss": 0.8042,
+      "step": 721
+    },
+    {
+      "epoch": 0.2231839258114374,
+      "grad_norm": 0.11488046497106552,
+      "learning_rate": 0.0001558449612403101,
+      "loss": 0.7316,
+      "step": 722
+    },
+    {
+      "epoch": 0.22349304482225657,
+      "grad_norm": 0.1125350072979927,
+      "learning_rate": 0.0001557829457364341,
+      "loss": 0.9362,
+      "step": 723
+    },
+    {
+      "epoch": 0.22380216383307575,
+      "grad_norm": 0.13179023563861847,
+      "learning_rate": 0.00015572093023255814,
+      "loss": 0.8695,
+      "step": 724
+    },
+    {
+      "epoch": 0.2241112828438949,
+      "grad_norm": 0.11290204524993896,
+      "learning_rate": 0.00015565891472868218,
+      "loss": 0.7914,
+      "step": 725
+    },
+    {
+      "epoch": 0.22442040185471407,
+      "grad_norm": 0.10253167897462845,
+      "learning_rate": 0.0001555968992248062,
+      "loss": 0.7789,
+      "step": 726
+    },
+    {
+      "epoch": 0.22472952086553322,
+      "grad_norm": 0.13726738095283508,
+      "learning_rate": 0.00015553488372093023,
+      "loss": 0.8706,
+      "step": 727
+    },
+    {
+      "epoch": 0.2250386398763524,
+      "grad_norm": 0.11310728639364243,
+      "learning_rate": 0.00015547286821705427,
+      "loss": 0.816,
+      "step": 728
+    },
+    {
+      "epoch": 0.22534775888717157,
+      "grad_norm": 0.10052375495433807,
+      "learning_rate": 0.0001554108527131783,
+      "loss": 0.8172,
+      "step": 729
+    },
+    {
+      "epoch": 0.22565687789799072,
+      "grad_norm": 0.12013030052185059,
+      "learning_rate": 0.00015534883720930232,
+      "loss": 0.7367,
+      "step": 730
+    },
+    {
+      "epoch": 0.2259659969088099,
+      "grad_norm": 0.12074479460716248,
+      "learning_rate": 0.00015528682170542636,
+      "loss": 0.7325,
+      "step": 731
+    },
+    {
+      "epoch": 0.22627511591962907,
+      "grad_norm": 0.11103136837482452,
+      "learning_rate": 0.0001552248062015504,
+      "loss": 0.7697,
+      "step": 732
+    },
+    {
+      "epoch": 0.22658423493044821,
+      "grad_norm": 0.1295919120311737,
+      "learning_rate": 0.00015516279069767444,
+      "loss": 0.8268,
+      "step": 733
+    },
+    {
+      "epoch": 0.2268933539412674,
+      "grad_norm": 0.11158143728971481,
+      "learning_rate": 0.00015510077519379848,
+      "loss": 0.8241,
+      "step": 734
+    },
+    {
+      "epoch": 0.22720247295208656,
+      "grad_norm": 0.11632904410362244,
+      "learning_rate": 0.0001550387596899225,
+      "loss": 0.8937,
+      "step": 735
+    },
+    {
+      "epoch": 0.2275115919629057,
+      "grad_norm": 0.11036121845245361,
+      "learning_rate": 0.00015497674418604653,
+      "loss": 0.7435,
+      "step": 736
+    },
+    {
+      "epoch": 0.2278207109737249,
+      "grad_norm": 0.11522484570741653,
+      "learning_rate": 0.00015491472868217054,
+      "loss": 0.7337,
+      "step": 737
+    },
+    {
+      "epoch": 0.22812982998454404,
+      "grad_norm": 0.11675230413675308,
+      "learning_rate": 0.00015485271317829458,
+      "loss": 0.8623,
+      "step": 738
+    },
+    {
+      "epoch": 0.2284389489953632,
+      "grad_norm": 0.114603690803051,
+      "learning_rate": 0.00015479069767441862,
+      "loss": 0.7344,
+      "step": 739
+    },
+    {
+      "epoch": 0.2287480680061824,
+      "grad_norm": 0.1253465712070465,
+      "learning_rate": 0.00015472868217054263,
+      "loss": 0.7111,
+      "step": 740
+    },
+    {
+      "epoch": 0.22905718701700153,
+      "grad_norm": 0.1126297116279602,
+      "learning_rate": 0.00015466666666666667,
+      "loss": 0.7948,
+      "step": 741
+    },
+    {
+      "epoch": 0.2293663060278207,
+      "grad_norm": 0.1282925307750702,
+      "learning_rate": 0.0001546046511627907,
+      "loss": 0.8041,
+      "step": 742
+    },
+    {
+      "epoch": 0.22967542503863989,
+      "grad_norm": 0.11763650923967361,
+      "learning_rate": 0.00015454263565891475,
+      "loss": 0.7548,
+      "step": 743
+    },
+    {
+      "epoch": 0.22998454404945903,
+      "grad_norm": 0.12256699800491333,
+      "learning_rate": 0.00015448062015503876,
+      "loss": 0.8664,
+      "step": 744
+    },
+    {
+      "epoch": 0.2302936630602782,
+      "grad_norm": 0.12481536716222763,
+      "learning_rate": 0.0001544186046511628,
+      "loss": 0.8986,
+      "step": 745
+    },
+    {
+      "epoch": 0.23060278207109738,
+      "grad_norm": 0.11068347096443176,
+      "learning_rate": 0.00015435658914728683,
+      "loss": 0.8181,
+      "step": 746
+    },
+    {
+      "epoch": 0.23091190108191653,
+      "grad_norm": 0.13589359819889069,
+      "learning_rate": 0.00015429457364341087,
+      "loss": 0.6341,
+      "step": 747
+    },
+    {
+      "epoch": 0.2312210200927357,
+      "grad_norm": 0.10451477766036987,
+      "learning_rate": 0.0001542325581395349,
+      "loss": 0.7122,
+      "step": 748
+    },
+    {
+      "epoch": 0.23153013910355486,
+      "grad_norm": 0.129670187830925,
+      "learning_rate": 0.00015417054263565892,
+      "loss": 0.906,
+      "step": 749
+    },
+    {
+      "epoch": 0.23183925811437403,
+      "grad_norm": 0.11718375980854034,
+      "learning_rate": 0.00015410852713178293,
+      "loss": 0.7083,
+      "step": 750
+    },
+    {
+      "epoch": 0.2321483771251932,
+      "grad_norm": 0.10383883118629456,
+      "learning_rate": 0.00015404651162790697,
+      "loss": 0.8419,
+      "step": 751
+    },
+    {
+      "epoch": 0.23245749613601235,
+      "grad_norm": 0.11427688598632812,
+      "learning_rate": 0.000153984496124031,
+      "loss": 0.8832,
+      "step": 752
+    },
+    {
+      "epoch": 0.23276661514683153,
+      "grad_norm": 0.10943648964166641,
+      "learning_rate": 0.00015392248062015505,
+      "loss": 0.7645,
+      "step": 753
+    },
+    {
+      "epoch": 0.2330757341576507,
+      "grad_norm": 0.10078372806310654,
+      "learning_rate": 0.00015386046511627906,
+      "loss": 0.8181,
+      "step": 754
+    },
+    {
+      "epoch": 0.23338485316846985,
+      "grad_norm": 0.10082436352968216,
+      "learning_rate": 0.0001537984496124031,
+      "loss": 0.9162,
+      "step": 755
+    },
+    {
+      "epoch": 0.23369397217928903,
+      "grad_norm": 0.1165718212723732,
+      "learning_rate": 0.00015373643410852714,
+      "loss": 0.7518,
+      "step": 756
+    },
+    {
+      "epoch": 0.2340030911901082,
+      "grad_norm": 0.11954308301210403,
+      "learning_rate": 0.00015367441860465118,
+      "loss": 0.684,
+      "step": 757
+    },
+    {
+      "epoch": 0.23431221020092735,
+      "grad_norm": 0.12400692701339722,
+      "learning_rate": 0.0001536124031007752,
+      "loss": 0.8917,
+      "step": 758
+    },
+    {
+      "epoch": 0.23462132921174653,
+      "grad_norm": 0.10413803160190582,
+      "learning_rate": 0.00015355038759689923,
+      "loss": 0.7598,
+      "step": 759
+    },
+    {
+      "epoch": 0.23493044822256567,
+      "grad_norm": 0.11822440475225449,
+      "learning_rate": 0.00015348837209302327,
+      "loss": 0.7463,
+      "step": 760
+    },
+    {
+      "epoch": 0.23523956723338485,
+      "grad_norm": 0.11296241730451584,
+      "learning_rate": 0.0001534263565891473,
+      "loss": 0.7842,
+      "step": 761
+    },
+    {
+      "epoch": 0.23554868624420403,
+      "grad_norm": 0.1275034248828888,
+      "learning_rate": 0.00015336434108527135,
+      "loss": 0.7418,
+      "step": 762
+    },
+    {
+      "epoch": 0.23585780525502317,
+      "grad_norm": 0.12039622664451599,
+      "learning_rate": 0.00015330232558139536,
+      "loss": 0.7858,
+      "step": 763
+    },
+    {
+      "epoch": 0.23616692426584235,
+      "grad_norm": 0.12102185189723969,
+      "learning_rate": 0.00015324031007751937,
+      "loss": 0.7543,
+      "step": 764
+    },
+    {
+      "epoch": 0.23647604327666152,
+      "grad_norm": 0.11714228242635727,
+      "learning_rate": 0.0001531782945736434,
+      "loss": 0.7918,
+      "step": 765
+    },
+    {
+      "epoch": 0.23678516228748067,
+      "grad_norm": 0.1297132819890976,
+      "learning_rate": 0.00015311627906976745,
+      "loss": 0.7579,
+      "step": 766
+    },
+    {
+      "epoch": 0.23709428129829985,
+      "grad_norm": 0.1544187366962433,
+      "learning_rate": 0.00015305426356589149,
+      "loss": 0.9518,
+      "step": 767
+    },
+    {
+      "epoch": 0.23740340030911902,
+      "grad_norm": 0.1462169736623764,
+      "learning_rate": 0.0001529922480620155,
+      "loss": 0.8472,
+      "step": 768
+    },
+    {
+      "epoch": 0.23771251931993817,
+      "grad_norm": 0.12060233950614929,
+      "learning_rate": 0.00015293023255813954,
+      "loss": 0.7098,
+      "step": 769
+    },
+    {
+      "epoch": 0.23802163833075735,
+      "grad_norm": 0.10534477233886719,
+      "learning_rate": 0.00015286821705426357,
+      "loss": 0.692,
+      "step": 770
+    },
+    {
+      "epoch": 0.2383307573415765,
+      "grad_norm": 0.12921524047851562,
+      "learning_rate": 0.0001528062015503876,
+      "loss": 0.7307,
+      "step": 771
+    },
+    {
+      "epoch": 0.23863987635239567,
+      "grad_norm": 0.11627444624900818,
+      "learning_rate": 0.00015274418604651165,
+      "loss": 0.7488,
+      "step": 772
+    },
+    {
+      "epoch": 0.23894899536321484,
+      "grad_norm": 0.12365692108869553,
+      "learning_rate": 0.00015268217054263566,
+      "loss": 0.8608,
+      "step": 773
+    },
+    {
+      "epoch": 0.239258114374034,
+      "grad_norm": 0.12448560446500778,
+      "learning_rate": 0.0001526201550387597,
+      "loss": 0.744,
+      "step": 774
+    },
+    {
+      "epoch": 0.23956723338485317,
+      "grad_norm": 0.11701495200395584,
+      "learning_rate": 0.00015255813953488374,
+      "loss": 0.7273,
+      "step": 775
+    },
+    {
+      "epoch": 0.23987635239567234,
+      "grad_norm": 0.14910434186458588,
+      "learning_rate": 0.00015249612403100778,
+      "loss": 0.8649,
+      "step": 776
+    },
+    {
+      "epoch": 0.2401854714064915,
+      "grad_norm": 0.12013334035873413,
+      "learning_rate": 0.0001524341085271318,
+      "loss": 0.763,
+      "step": 777
+    },
+    {
+      "epoch": 0.24049459041731067,
+      "grad_norm": 0.13918770849704742,
+      "learning_rate": 0.0001523720930232558,
+      "loss": 0.7846,
+      "step": 778
+    },
+    {
+      "epoch": 0.24080370942812984,
+      "grad_norm": 0.1394704133272171,
+      "learning_rate": 0.00015231007751937984,
+      "loss": 0.8571,
+      "step": 779
+    },
+    {
+      "epoch": 0.241112828438949,
+      "grad_norm": 0.1315182000398636,
+      "learning_rate": 0.00015224806201550388,
+      "loss": 0.9,
+      "step": 780
+    },
+    {
+      "epoch": 0.24142194744976817,
+      "grad_norm": 0.11989207565784454,
+      "learning_rate": 0.00015218604651162792,
+      "loss": 0.7906,
+      "step": 781
+    },
+    {
+      "epoch": 0.2417310664605873,
+      "grad_norm": 0.12133822590112686,
+      "learning_rate": 0.00015212403100775193,
+      "loss": 0.8559,
+      "step": 782
+    },
+    {
+      "epoch": 0.2420401854714065,
+      "grad_norm": 0.12769554555416107,
+      "learning_rate": 0.00015206201550387597,
+      "loss": 0.6996,
+      "step": 783
+    },
+    {
+      "epoch": 0.24234930448222566,
+      "grad_norm": 0.11488951742649078,
+      "learning_rate": 0.000152,
+      "loss": 0.667,
+      "step": 784
+    },
+    {
+      "epoch": 0.2426584234930448,
+      "grad_norm": 0.12931592762470245,
+      "learning_rate": 0.00015193798449612405,
+      "loss": 0.8865,
+      "step": 785
+    },
+    {
+      "epoch": 0.242967542503864,
+      "grad_norm": 0.1383776217699051,
+      "learning_rate": 0.00015187596899224809,
+      "loss": 0.7648,
+      "step": 786
+    },
+    {
+      "epoch": 0.24327666151468316,
+      "grad_norm": 0.13125276565551758,
+      "learning_rate": 0.0001518139534883721,
+      "loss": 0.8103,
+      "step": 787
+    },
+    {
+      "epoch": 0.2435857805255023,
+      "grad_norm": 0.11506158858537674,
+      "learning_rate": 0.00015175193798449614,
+      "loss": 0.7935,
+      "step": 788
+    },
+    {
+      "epoch": 0.24389489953632149,
+      "grad_norm": 0.1170530617237091,
+      "learning_rate": 0.00015168992248062017,
+      "loss": 0.861,
+      "step": 789
+    },
+    {
+      "epoch": 0.24420401854714066,
+      "grad_norm": 0.1097881942987442,
+      "learning_rate": 0.0001516279069767442,
+      "loss": 0.8071,
+      "step": 790
+    },
+    {
+      "epoch": 0.2445131375579598,
+      "grad_norm": 0.12315784394741058,
+      "learning_rate": 0.00015156589147286823,
+      "loss": 0.8053,
+      "step": 791
+    },
+    {
+      "epoch": 0.24482225656877898,
+      "grad_norm": 0.1385902613401413,
+      "learning_rate": 0.00015150387596899224,
+      "loss": 0.766,
+      "step": 792
+    },
+    {
+      "epoch": 0.24513137557959813,
+      "grad_norm": 0.12784931063652039,
+      "learning_rate": 0.00015144186046511628,
+      "loss": 0.6802,
+      "step": 793
+    },
+    {
+      "epoch": 0.2454404945904173,
+      "grad_norm": 0.12145421653985977,
+      "learning_rate": 0.00015137984496124031,
+      "loss": 0.7079,
+      "step": 794
+    },
+    {
+      "epoch": 0.24574961360123648,
+      "grad_norm": 0.11476317793130875,
+      "learning_rate": 0.00015131782945736435,
+      "loss": 0.7618,
+      "step": 795
+    },
+    {
+      "epoch": 0.24605873261205563,
+      "grad_norm": 0.1074838861823082,
+      "learning_rate": 0.0001512558139534884,
+      "loss": 0.7939,
+      "step": 796
+    },
+    {
+      "epoch": 0.2463678516228748,
+      "grad_norm": 0.11968334019184113,
+      "learning_rate": 0.0001511937984496124,
+      "loss": 0.8351,
+      "step": 797
+    },
+    {
+      "epoch": 0.24667697063369398,
+      "grad_norm": 0.11175213009119034,
+      "learning_rate": 0.00015113178294573644,
+      "loss": 0.8101,
+      "step": 798
+    },
+    {
+      "epoch": 0.24698608964451313,
+      "grad_norm": 0.125063955783844,
+      "learning_rate": 0.00015106976744186048,
+      "loss": 0.769,
+      "step": 799
+    },
+    {
+      "epoch": 0.2472952086553323,
+      "grad_norm": 0.12018170952796936,
+      "learning_rate": 0.00015100775193798452,
+      "loss": 0.8536,
+      "step": 800
+    },
+    {
+      "epoch": 0.24760432766615148,
+      "grad_norm": 0.1252349615097046,
+      "learning_rate": 0.00015094573643410853,
+      "loss": 0.813,
+      "step": 801
+    },
+    {
+      "epoch": 0.24791344667697063,
+      "grad_norm": 0.11219511181116104,
+      "learning_rate": 0.00015088372093023257,
+      "loss": 0.749,
+      "step": 802
+    },
+    {
+      "epoch": 0.2482225656877898,
+      "grad_norm": 0.11993087828159332,
+      "learning_rate": 0.00015082170542635658,
+      "loss": 0.8223,
+      "step": 803
+    },
+    {
+      "epoch": 0.24853168469860895,
+      "grad_norm": 0.19286490976810455,
+      "learning_rate": 0.00015075968992248062,
+      "loss": 0.7728,
+      "step": 804
+    },
+    {
+      "epoch": 0.24884080370942813,
+      "grad_norm": 0.1317611187696457,
+      "learning_rate": 0.00015069767441860466,
+      "loss": 0.8072,
+      "step": 805
+    },
+    {
+      "epoch": 0.2491499227202473,
+      "grad_norm": 0.1411685198545456,
+      "learning_rate": 0.00015063565891472867,
+      "loss": 0.7539,
+      "step": 806
+    },
+    {
+      "epoch": 0.24945904173106645,
+      "grad_norm": 0.12156263738870621,
+      "learning_rate": 0.0001505736434108527,
+      "loss": 0.7394,
+      "step": 807
+    },
+    {
+      "epoch": 0.24976816074188563,
+      "grad_norm": 0.17997561395168304,
+      "learning_rate": 0.00015051162790697675,
+      "loss": 0.829,
+      "step": 808
+    },
+    {
+      "epoch": 0.2500772797527048,
+      "grad_norm": 0.11623260378837585,
+      "learning_rate": 0.0001504496124031008,
+      "loss": 0.6734,
+      "step": 809
+    },
+    {
+      "epoch": 0.250386398763524,
+      "grad_norm": 0.12638065218925476,
+      "learning_rate": 0.00015038759689922483,
+      "loss": 0.743,
+      "step": 810
+    },
+    {
+      "epoch": 0.2506955177743431,
+      "grad_norm": 0.11130564659833908,
+      "learning_rate": 0.00015032558139534884,
+      "loss": 0.7584,
+      "step": 811
+    },
+    {
+      "epoch": 0.25100463678516227,
+      "grad_norm": 0.11362282186746597,
+      "learning_rate": 0.00015026356589147288,
+      "loss": 0.8049,
+      "step": 812
+    },
+    {
+      "epoch": 0.2513137557959815,
+      "grad_norm": 0.12556937336921692,
+      "learning_rate": 0.00015020155038759692,
+      "loss": 0.8124,
+      "step": 813
+    },
+    {
+      "epoch": 0.2516228748068006,
+      "grad_norm": 0.12706847488880157,
+      "learning_rate": 0.00015013953488372095,
+      "loss": 0.8337,
+      "step": 814
+    },
+    {
+      "epoch": 0.25193199381761977,
+      "grad_norm": 0.1378735899925232,
+      "learning_rate": 0.00015007751937984497,
+      "loss": 0.7448,
+      "step": 815
+    },
+    {
+      "epoch": 0.252241112828439,
+      "grad_norm": 0.10803718119859695,
+      "learning_rate": 0.000150015503875969,
+      "loss": 0.7921,
+      "step": 816
+    },
+    {
+      "epoch": 0.2525502318392581,
+      "grad_norm": 0.13711851835250854,
+      "learning_rate": 0.00014995348837209302,
+      "loss": 0.806,
+      "step": 817
+    },
+    {
+      "epoch": 0.25285935085007727,
+      "grad_norm": 0.11921881139278412,
+      "learning_rate": 0.00014989147286821705,
+      "loss": 0.7221,
+      "step": 818
+    },
+    {
+      "epoch": 0.2531684698608965,
+      "grad_norm": 0.12782952189445496,
+      "learning_rate": 0.0001498294573643411,
+      "loss": 0.8501,
+      "step": 819
+    },
+    {
+      "epoch": 0.2534775888717156,
+      "grad_norm": 0.12477905303239822,
+      "learning_rate": 0.0001497674418604651,
+      "loss": 0.8073,
+      "step": 820
+    },
+    {
+      "epoch": 0.25378670788253477,
+      "grad_norm": 0.1095808669924736,
+      "learning_rate": 0.00014970542635658914,
+      "loss": 0.7556,
+      "step": 821
+    },
+    {
+      "epoch": 0.2540958268933539,
+      "grad_norm": 0.11897611618041992,
+      "learning_rate": 0.00014964341085271318,
+      "loss": 0.7244,
+      "step": 822
+    },
+    {
+      "epoch": 0.2544049459041731,
+      "grad_norm": 0.12462172657251358,
+      "learning_rate": 0.00014958139534883722,
+      "loss": 0.8497,
+      "step": 823
+    },
+    {
+      "epoch": 0.25471406491499227,
+      "grad_norm": 0.11331510543823242,
+      "learning_rate": 0.00014951937984496126,
+      "loss": 0.8004,
+      "step": 824
+    },
+    {
+      "epoch": 0.2550231839258114,
+      "grad_norm": 0.1233968660235405,
+      "learning_rate": 0.00014945736434108527,
+      "loss": 0.8544,
+      "step": 825
+    },
+    {
+      "epoch": 0.2553323029366306,
+      "grad_norm": 0.12359130382537842,
+      "learning_rate": 0.0001493953488372093,
+      "loss": 0.7888,
+      "step": 826
+    },
+    {
+      "epoch": 0.25564142194744977,
+      "grad_norm": 0.1450347900390625,
+      "learning_rate": 0.00014933333333333335,
+      "loss": 0.8437,
+      "step": 827
+    },
+    {
+      "epoch": 0.2559505409582689,
+      "grad_norm": 0.14557255804538727,
+      "learning_rate": 0.0001492713178294574,
+      "loss": 0.8736,
+      "step": 828
+    },
+    {
+      "epoch": 0.2562596599690881,
+      "grad_norm": 0.11424949765205383,
+      "learning_rate": 0.0001492093023255814,
+      "loss": 0.8393,
+      "step": 829
+    },
+    {
+      "epoch": 0.25656877897990726,
+      "grad_norm": 0.12477642297744751,
+      "learning_rate": 0.0001491472868217054,
+      "loss": 0.8424,
+      "step": 830
+    },
+    {
+      "epoch": 0.2568778979907264,
+      "grad_norm": 0.1368608921766281,
+      "learning_rate": 0.00014908527131782945,
+      "loss": 0.787,
+      "step": 831
+    },
+    {
+      "epoch": 0.2571870170015456,
+      "grad_norm": 0.12159669399261475,
+      "learning_rate": 0.0001490232558139535,
+      "loss": 0.7776,
+      "step": 832
+    },
+    {
+      "epoch": 0.25749613601236476,
+      "grad_norm": 0.1223360225558281,
+      "learning_rate": 0.00014896124031007753,
+      "loss": 0.8551,
+      "step": 833
+    },
+    {
+      "epoch": 0.2578052550231839,
+      "grad_norm": 0.11618901044130325,
+      "learning_rate": 0.00014889922480620157,
+      "loss": 0.7106,
+      "step": 834
+    },
+    {
+      "epoch": 0.2581143740340031,
+      "grad_norm": 0.16739368438720703,
+      "learning_rate": 0.00014883720930232558,
+      "loss": 0.8328,
+      "step": 835
+    },
+    {
+      "epoch": 0.25842349304482226,
+      "grad_norm": 0.13085711002349854,
+      "learning_rate": 0.00014877519379844962,
+      "loss": 0.7686,
+      "step": 836
+    },
+    {
+      "epoch": 0.2587326120556414,
+      "grad_norm": 0.11446749418973923,
+      "learning_rate": 0.00014871317829457366,
+      "loss": 0.8441,
+      "step": 837
+    },
+    {
+      "epoch": 0.2590417310664606,
+      "grad_norm": 0.13658201694488525,
+      "learning_rate": 0.0001486511627906977,
+      "loss": 0.698,
+      "step": 838
+    },
+    {
+      "epoch": 0.25935085007727976,
+      "grad_norm": 0.132501482963562,
+      "learning_rate": 0.0001485891472868217,
+      "loss": 0.8407,
+      "step": 839
+    },
+    {
+      "epoch": 0.2596599690880989,
+      "grad_norm": 0.1130068451166153,
+      "learning_rate": 0.00014852713178294574,
+      "loss": 0.845,
+      "step": 840
+    },
+    {
+      "epoch": 0.2599690880989181,
+      "grad_norm": 0.12523633241653442,
+      "learning_rate": 0.00014846511627906978,
+      "loss": 0.7873,
+      "step": 841
+    },
+    {
+      "epoch": 0.26027820710973726,
+      "grad_norm": 0.21349893510341644,
+      "learning_rate": 0.00014840310077519382,
+      "loss": 0.9231,
+      "step": 842
+    },
+    {
+      "epoch": 0.2605873261205564,
+      "grad_norm": 0.13039101660251617,
+      "learning_rate": 0.00014834108527131783,
+      "loss": 0.7926,
+      "step": 843
+    },
+    {
+      "epoch": 0.26089644513137555,
+      "grad_norm": 0.15471790730953217,
+      "learning_rate": 0.00014827906976744185,
+      "loss": 0.7219,
+      "step": 844
+    },
+    {
+      "epoch": 0.26120556414219476,
+      "grad_norm": 0.19512821733951569,
+      "learning_rate": 0.00014821705426356588,
+      "loss": 0.7653,
+      "step": 845
+    },
+    {
+      "epoch": 0.2615146831530139,
+      "grad_norm": 0.12139850109815598,
+      "learning_rate": 0.00014815503875968992,
+      "loss": 0.7593,
+      "step": 846
+    },
+    {
+      "epoch": 0.26182380216383305,
+      "grad_norm": 0.14223287999629974,
+      "learning_rate": 0.00014809302325581396,
+      "loss": 0.8023,
+      "step": 847
+    },
+    {
+      "epoch": 0.26213292117465226,
+      "grad_norm": 0.12319888919591904,
+      "learning_rate": 0.000148031007751938,
+      "loss": 0.7967,
+      "step": 848
+    },
+    {
+      "epoch": 0.2624420401854714,
+      "grad_norm": 0.14263351261615753,
+      "learning_rate": 0.000147968992248062,
+      "loss": 0.8804,
+      "step": 849
+    },
+    {
+      "epoch": 0.26275115919629055,
+      "grad_norm": 0.11919604986906052,
+      "learning_rate": 0.00014790697674418605,
+      "loss": 0.8823,
+      "step": 850
+    },
+    {
+      "epoch": 0.26306027820710975,
+      "grad_norm": 0.13258209824562073,
+      "learning_rate": 0.0001478449612403101,
+      "loss": 0.7271,
+      "step": 851
+    },
+    {
+      "epoch": 0.2633693972179289,
+      "grad_norm": 0.11424367874860764,
+      "learning_rate": 0.00014778294573643413,
+      "loss": 0.741,
+      "step": 852
+    },
+    {
+      "epoch": 0.26367851622874805,
+      "grad_norm": 0.12254701554775238,
+      "learning_rate": 0.00014772093023255814,
+      "loss": 0.7135,
+      "step": 853
+    },
+    {
+      "epoch": 0.26398763523956725,
+      "grad_norm": 0.1269705444574356,
+      "learning_rate": 0.00014765891472868218,
+      "loss": 0.7848,
+      "step": 854
+    },
+    {
+      "epoch": 0.2642967542503864,
+      "grad_norm": 0.12801006436347961,
+      "learning_rate": 0.00014759689922480622,
+      "loss": 0.7662,
+      "step": 855
+    },
+    {
+      "epoch": 0.26460587326120555,
+      "grad_norm": 0.12760306894779205,
+      "learning_rate": 0.00014753488372093026,
+      "loss": 0.8694,
+      "step": 856
+    },
+    {
+      "epoch": 0.26491499227202475,
+      "grad_norm": 0.10601752996444702,
+      "learning_rate": 0.00014747286821705427,
+      "loss": 0.8976,
+      "step": 857
+    },
+    {
+      "epoch": 0.2652241112828439,
+      "grad_norm": 0.11408428847789764,
+      "learning_rate": 0.0001474108527131783,
+      "loss": 0.8152,
+      "step": 858
+    },
+    {
+      "epoch": 0.26553323029366305,
+      "grad_norm": 0.11453750729560852,
+      "learning_rate": 0.00014734883720930232,
+      "loss": 0.7036,
+      "step": 859
+    },
+    {
+      "epoch": 0.26584234930448225,
+      "grad_norm": 0.1266554743051529,
+      "learning_rate": 0.00014728682170542636,
+      "loss": 0.8151,
+      "step": 860
+    },
+    {
+      "epoch": 0.2661514683153014,
+      "grad_norm": 0.11620058864355087,
+      "learning_rate": 0.0001472248062015504,
+      "loss": 0.8732,
+      "step": 861
+    },
+    {
+      "epoch": 0.26646058732612055,
+      "grad_norm": 0.1301504373550415,
+      "learning_rate": 0.00014716279069767443,
+      "loss": 0.801,
+      "step": 862
+    },
+    {
+      "epoch": 0.26676970633693975,
+      "grad_norm": 0.11662990599870682,
+      "learning_rate": 0.00014710077519379845,
+      "loss": 0.7293,
+      "step": 863
+    },
+    {
+      "epoch": 0.2670788253477589,
+      "grad_norm": 0.13666480779647827,
+      "learning_rate": 0.00014703875968992248,
+      "loss": 0.6958,
+      "step": 864
+    },
+    {
+      "epoch": 0.26738794435857804,
+      "grad_norm": 0.12055882066488266,
+      "learning_rate": 0.00014697674418604652,
+      "loss": 0.701,
+      "step": 865
+    },
+    {
+      "epoch": 0.2676970633693972,
+      "grad_norm": 0.11646155267953873,
+      "learning_rate": 0.00014691472868217056,
+      "loss": 0.8044,
+      "step": 866
+    },
+    {
+      "epoch": 0.2680061823802164,
+      "grad_norm": 0.13146454095840454,
+      "learning_rate": 0.0001468527131782946,
+      "loss": 0.8652,
+      "step": 867
+    },
+    {
+      "epoch": 0.26831530139103554,
+      "grad_norm": 0.11620502918958664,
+      "learning_rate": 0.0001467906976744186,
+      "loss": 0.815,
+      "step": 868
+    },
+    {
+      "epoch": 0.2686244204018547,
+      "grad_norm": 0.1345463991165161,
+      "learning_rate": 0.00014672868217054265,
+      "loss": 0.8375,
+      "step": 869
+    },
+    {
+      "epoch": 0.2689335394126739,
+      "grad_norm": 0.11036497354507446,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.7705,
+      "step": 870
+    },
+    {
+      "epoch": 0.26924265842349304,
+      "grad_norm": 0.1117565780878067,
+      "learning_rate": 0.0001466046511627907,
+      "loss": 0.8034,
+      "step": 871
+    },
+    {
+      "epoch": 0.2695517774343122,
+      "grad_norm": 0.12002184987068176,
+      "learning_rate": 0.00014654263565891474,
+      "loss": 0.6356,
+      "step": 872
+    },
+    {
+      "epoch": 0.2698608964451314,
+      "grad_norm": 0.11224567890167236,
+      "learning_rate": 0.00014648062015503875,
+      "loss": 0.6498,
+      "step": 873
+    },
+    {
+      "epoch": 0.27017001545595054,
+      "grad_norm": 0.12627927958965302,
+      "learning_rate": 0.0001464186046511628,
+      "loss": 0.7575,
+      "step": 874
+    },
+    {
+      "epoch": 0.2704791344667697,
+      "grad_norm": 0.12623284757137299,
+      "learning_rate": 0.00014635658914728683,
+      "loss": 0.7678,
+      "step": 875
+    },
+    {
+      "epoch": 0.2707882534775889,
+      "grad_norm": 0.12953105568885803,
+      "learning_rate": 0.00014629457364341087,
+      "loss": 0.7884,
+      "step": 876
+    },
+    {
+      "epoch": 0.27109737248840804,
+      "grad_norm": 0.13573655486106873,
+      "learning_rate": 0.00014623255813953488,
+      "loss": 0.7948,
+      "step": 877
+    },
+    {
+      "epoch": 0.2714064914992272,
+      "grad_norm": 0.12704919278621674,
+      "learning_rate": 0.00014617054263565892,
+      "loss": 0.8609,
+      "step": 878
+    },
+    {
+      "epoch": 0.2717156105100464,
+      "grad_norm": 0.1407371610403061,
+      "learning_rate": 0.00014610852713178296,
+      "loss": 0.7222,
+      "step": 879
+    },
+    {
+      "epoch": 0.27202472952086554,
+      "grad_norm": 0.11318446695804596,
+      "learning_rate": 0.000146046511627907,
+      "loss": 0.8666,
+      "step": 880
+    },
+    {
+      "epoch": 0.2723338485316847,
+      "grad_norm": 0.12386681139469147,
+      "learning_rate": 0.00014598449612403103,
+      "loss": 0.7838,
+      "step": 881
+    },
+    {
+      "epoch": 0.2726429675425039,
+      "grad_norm": 0.1111859530210495,
+      "learning_rate": 0.00014592248062015505,
+      "loss": 0.7663,
+      "step": 882
+    },
+    {
+      "epoch": 0.27295208655332304,
+      "grad_norm": 0.13240239024162292,
+      "learning_rate": 0.00014586046511627906,
+      "loss": 0.7194,
+      "step": 883
+    },
+    {
+      "epoch": 0.2732612055641422,
+      "grad_norm": 0.12492766976356506,
+      "learning_rate": 0.0001457984496124031,
+      "loss": 0.8904,
+      "step": 884
+    },
+    {
+      "epoch": 0.2735703245749614,
+      "grad_norm": 0.11625178158283234,
+      "learning_rate": 0.00014573643410852714,
+      "loss": 0.892,
+      "step": 885
+    },
+    {
+      "epoch": 0.27387944358578054,
+      "grad_norm": 0.12176412343978882,
+      "learning_rate": 0.00014567441860465117,
+      "loss": 0.6733,
+      "step": 886
+    },
+    {
+      "epoch": 0.2741885625965997,
+      "grad_norm": 0.12597818672657013,
+      "learning_rate": 0.00014561240310077519,
+      "loss": 0.7992,
+      "step": 887
+    },
+    {
+      "epoch": 0.27449768160741883,
+      "grad_norm": 0.12471161782741547,
+      "learning_rate": 0.00014555038759689922,
+      "loss": 0.6884,
+      "step": 888
+    },
+    {
+      "epoch": 0.27480680061823803,
+      "grad_norm": 0.11098852753639221,
+      "learning_rate": 0.00014548837209302326,
+      "loss": 0.6681,
+      "step": 889
+    },
+    {
+      "epoch": 0.2751159196290572,
+      "grad_norm": 0.10870758444070816,
+      "learning_rate": 0.0001454263565891473,
+      "loss": 0.8065,
+      "step": 890
+    },
+    {
+      "epoch": 0.27542503863987633,
+      "grad_norm": 0.1273547261953354,
+      "learning_rate": 0.00014536434108527131,
+      "loss": 0.7186,
+      "step": 891
+    },
+    {
+      "epoch": 0.27573415765069553,
+      "grad_norm": 0.14047206938266754,
+      "learning_rate": 0.00014530232558139535,
+      "loss": 0.7146,
+      "step": 892
+    },
+    {
+      "epoch": 0.2760432766615147,
+      "grad_norm": 0.12800012528896332,
+      "learning_rate": 0.0001452403100775194,
+      "loss": 0.8439,
+      "step": 893
+    },
+    {
+      "epoch": 0.27635239567233383,
+      "grad_norm": 0.11552654951810837,
+      "learning_rate": 0.00014517829457364343,
+      "loss": 0.8069,
+      "step": 894
+    },
+    {
+      "epoch": 0.27666151468315303,
+      "grad_norm": 0.11438272893428802,
+      "learning_rate": 0.00014511627906976747,
+      "loss": 0.7723,
+      "step": 895
+    },
+    {
+      "epoch": 0.2769706336939722,
+      "grad_norm": 0.13710401952266693,
+      "learning_rate": 0.00014505426356589148,
+      "loss": 0.9134,
+      "step": 896
+    },
+    {
+      "epoch": 0.2772797527047913,
+      "grad_norm": 0.13901177048683167,
+      "learning_rate": 0.0001449922480620155,
+      "loss": 0.9356,
+      "step": 897
+    },
+    {
+      "epoch": 0.27758887171561053,
+      "grad_norm": 0.1236179992556572,
+      "learning_rate": 0.00014493023255813953,
+      "loss": 0.8756,
+      "step": 898
+    },
+    {
+      "epoch": 0.2778979907264297,
+      "grad_norm": 0.11315148323774338,
+      "learning_rate": 0.00014486821705426357,
+      "loss": 0.762,
+      "step": 899
+    },
+    {
+      "epoch": 0.2782071097372488,
+      "grad_norm": 0.14273928105831146,
+      "learning_rate": 0.0001448062015503876,
+      "loss": 0.7974,
+      "step": 900
+    },
+    {
+      "epoch": 0.27851622874806803,
+      "grad_norm": 0.12433210015296936,
+      "learning_rate": 0.00014474418604651162,
+      "loss": 0.8364,
+      "step": 901
+    },
+    {
+      "epoch": 0.2788253477588872,
+      "grad_norm": 0.13087347149848938,
+      "learning_rate": 0.00014468217054263566,
+      "loss": 0.8037,
+      "step": 902
+    },
+    {
+      "epoch": 0.2791344667697063,
+      "grad_norm": 0.1182572990655899,
+      "learning_rate": 0.0001446201550387597,
+      "loss": 0.8244,
+      "step": 903
+    },
+    {
+      "epoch": 0.2794435857805255,
+      "grad_norm": 0.11682897806167603,
+      "learning_rate": 0.00014455813953488374,
+      "loss": 0.823,
+      "step": 904
+    },
+    {
+      "epoch": 0.2797527047913447,
+      "grad_norm": 0.12286652624607086,
+      "learning_rate": 0.00014449612403100777,
+      "loss": 0.8253,
+      "step": 905
+    },
+    {
+      "epoch": 0.2800618238021638,
+      "grad_norm": 0.1269593983888626,
+      "learning_rate": 0.0001444341085271318,
+      "loss": 0.573,
+      "step": 906
+    },
+    {
+      "epoch": 0.280370942812983,
+      "grad_norm": 0.11785610765218735,
+      "learning_rate": 0.00014437209302325583,
+      "loss": 0.7233,
+      "step": 907
+    },
+    {
+      "epoch": 0.2806800618238022,
+      "grad_norm": 0.1237734779715538,
+      "learning_rate": 0.00014431007751937986,
+      "loss": 0.82,
+      "step": 908
+    },
+    {
+      "epoch": 0.2809891808346213,
+      "grad_norm": 0.12301222234964371,
+      "learning_rate": 0.0001442480620155039,
+      "loss": 0.8205,
+      "step": 909
+    },
+    {
+      "epoch": 0.28129829984544047,
+      "grad_norm": 0.13933341205120087,
+      "learning_rate": 0.00014418604651162791,
+      "loss": 0.8397,
+      "step": 910
+    },
+    {
+      "epoch": 0.2816074188562597,
+      "grad_norm": 0.13418903946876526,
+      "learning_rate": 0.00014412403100775193,
+      "loss": 0.7872,
+      "step": 911
+    },
+    {
+      "epoch": 0.2819165378670788,
+      "grad_norm": 0.11472947895526886,
+      "learning_rate": 0.00014406201550387596,
+      "loss": 0.8015,
+      "step": 912
+    },
+    {
+      "epoch": 0.28222565687789797,
+      "grad_norm": 0.11485429853200912,
+      "learning_rate": 0.000144,
+      "loss": 0.803,
+      "step": 913
+    },
+    {
+      "epoch": 0.28253477588871717,
+      "grad_norm": 0.12704961001873016,
+      "learning_rate": 0.00014393798449612404,
+      "loss": 0.9043,
+      "step": 914
+    },
+    {
+      "epoch": 0.2828438948995363,
+      "grad_norm": 0.12076624482870102,
+      "learning_rate": 0.00014387596899224805,
+      "loss": 0.8441,
+      "step": 915
+    },
+    {
+      "epoch": 0.28315301391035547,
+      "grad_norm": 0.12196331471204758,
+      "learning_rate": 0.0001438139534883721,
+      "loss": 0.7216,
+      "step": 916
+    },
+    {
+      "epoch": 0.28346213292117467,
+      "grad_norm": 0.12628835439682007,
+      "learning_rate": 0.00014375193798449613,
+      "loss": 0.7536,
+      "step": 917
+    },
+    {
+      "epoch": 0.2837712519319938,
+      "grad_norm": 0.12595216929912567,
+      "learning_rate": 0.00014368992248062017,
+      "loss": 0.8964,
+      "step": 918
+    },
+    {
+      "epoch": 0.28408037094281297,
+      "grad_norm": 0.10500409454107285,
+      "learning_rate": 0.0001436279069767442,
+      "loss": 0.8401,
+      "step": 919
+    },
+    {
+      "epoch": 0.28438948995363217,
+      "grad_norm": 0.12638381123542786,
+      "learning_rate": 0.00014356589147286822,
+      "loss": 0.796,
+      "step": 920
+    },
+    {
+      "epoch": 0.2846986089644513,
+      "grad_norm": 0.14120124280452728,
+      "learning_rate": 0.00014350387596899226,
+      "loss": 0.8018,
+      "step": 921
+    },
+    {
+      "epoch": 0.28500772797527046,
+      "grad_norm": 0.12073471397161484,
+      "learning_rate": 0.0001434418604651163,
+      "loss": 0.8203,
+      "step": 922
+    },
+    {
+      "epoch": 0.28531684698608967,
+      "grad_norm": 0.10508771985769272,
+      "learning_rate": 0.0001433798449612403,
+      "loss": 0.7027,
+      "step": 923
+    },
+    {
+      "epoch": 0.2856259659969088,
+      "grad_norm": 0.12278520315885544,
+      "learning_rate": 0.00014331782945736435,
+      "loss": 0.7783,
+      "step": 924
+    },
+    {
+      "epoch": 0.28593508500772796,
+      "grad_norm": 0.10832314193248749,
+      "learning_rate": 0.00014325581395348836,
+      "loss": 0.7251,
+      "step": 925
+    },
+    {
+      "epoch": 0.28624420401854717,
+      "grad_norm": 0.12536031007766724,
+      "learning_rate": 0.0001431937984496124,
+      "loss": 0.7232,
+      "step": 926
+    },
+    {
+      "epoch": 0.2865533230293663,
+      "grad_norm": 0.143062561750412,
+      "learning_rate": 0.00014313178294573644,
+      "loss": 0.7258,
+      "step": 927
+    },
+    {
+      "epoch": 0.28686244204018546,
+      "grad_norm": 0.11144435405731201,
+      "learning_rate": 0.00014306976744186048,
+      "loss": 0.7562,
+      "step": 928
+    },
+    {
+      "epoch": 0.2871715610510046,
+      "grad_norm": 0.12134916335344315,
+      "learning_rate": 0.00014300775193798452,
+      "loss": 0.7109,
+      "step": 929
+    },
+    {
+      "epoch": 0.2874806800618238,
+      "grad_norm": 0.1274683177471161,
+      "learning_rate": 0.00014294573643410853,
+      "loss": 0.8072,
+      "step": 930
+    },
+    {
+      "epoch": 0.28778979907264296,
+      "grad_norm": 0.13728466629981995,
+      "learning_rate": 0.00014288372093023257,
+      "loss": 0.7393,
+      "step": 931
+    },
+    {
+      "epoch": 0.2880989180834621,
+      "grad_norm": 0.13668936491012573,
+      "learning_rate": 0.0001428217054263566,
+      "loss": 0.8375,
+      "step": 932
+    },
+    {
+      "epoch": 0.2884080370942813,
+      "grad_norm": 0.14077217876911163,
+      "learning_rate": 0.00014275968992248064,
+      "loss": 0.7897,
+      "step": 933
+    },
+    {
+      "epoch": 0.28871715610510046,
+      "grad_norm": 0.13246707618236542,
+      "learning_rate": 0.00014269767441860465,
+      "loss": 0.8227,
+      "step": 934
+    },
+    {
+      "epoch": 0.2890262751159196,
+      "grad_norm": 0.11966849118471146,
+      "learning_rate": 0.0001426356589147287,
+      "loss": 0.8414,
+      "step": 935
+    },
+    {
+      "epoch": 0.2893353941267388,
+      "grad_norm": 0.12089065462350845,
+      "learning_rate": 0.00014257364341085273,
+      "loss": 0.7711,
+      "step": 936
+    },
+    {
+      "epoch": 0.28964451313755796,
+      "grad_norm": 0.1274116039276123,
+      "learning_rate": 0.00014251162790697674,
+      "loss": 0.6885,
+      "step": 937
+    },
+    {
+      "epoch": 0.2899536321483771,
+      "grad_norm": 0.13811667263507843,
+      "learning_rate": 0.00014244961240310078,
+      "loss": 0.6859,
+      "step": 938
+    },
+    {
+      "epoch": 0.2902627511591963,
+      "grad_norm": 0.1394423097372055,
+      "learning_rate": 0.0001423875968992248,
+      "loss": 0.7569,
+      "step": 939
+    },
+    {
+      "epoch": 0.29057187017001546,
+      "grad_norm": 0.11885955184698105,
+      "learning_rate": 0.00014232558139534883,
+      "loss": 0.7627,
+      "step": 940
+    },
+    {
+      "epoch": 0.2908809891808346,
+      "grad_norm": 0.12418286502361298,
+      "learning_rate": 0.00014226356589147287,
+      "loss": 0.7303,
+      "step": 941
+    },
+    {
+      "epoch": 0.2911901081916538,
+      "grad_norm": 0.12816603481769562,
+      "learning_rate": 0.0001422015503875969,
+      "loss": 0.747,
+      "step": 942
+    },
+    {
+      "epoch": 0.29149922720247295,
+      "grad_norm": 0.10445892065763474,
+      "learning_rate": 0.00014213953488372095,
+      "loss": 0.7347,
+      "step": 943
+    },
+    {
+      "epoch": 0.2918083462132921,
+      "grad_norm": 0.13160108029842377,
+      "learning_rate": 0.00014207751937984496,
+      "loss": 0.7364,
+      "step": 944
+    },
+    {
+      "epoch": 0.2921174652241113,
+      "grad_norm": 0.116938017308712,
+      "learning_rate": 0.000142015503875969,
+      "loss": 0.8129,
+      "step": 945
+    },
+    {
+      "epoch": 0.29242658423493045,
+      "grad_norm": 0.13014064729213715,
+      "learning_rate": 0.00014195348837209304,
+      "loss": 0.8444,
+      "step": 946
+    },
+    {
+      "epoch": 0.2927357032457496,
+      "grad_norm": 0.12289168685674667,
+      "learning_rate": 0.00014189147286821708,
+      "loss": 0.8332,
+      "step": 947
+    },
+    {
+      "epoch": 0.2930448222565688,
+      "grad_norm": 0.11512966454029083,
+      "learning_rate": 0.0001418294573643411,
+      "loss": 0.7877,
+      "step": 948
+    },
+    {
+      "epoch": 0.29335394126738795,
+      "grad_norm": 0.11333896219730377,
+      "learning_rate": 0.00014176744186046513,
+      "loss": 0.733,
+      "step": 949
+    },
+    {
+      "epoch": 0.2936630602782071,
+      "grad_norm": 0.1227252408862114,
+      "learning_rate": 0.00014170542635658914,
+      "loss": 0.7675,
+      "step": 950
+    },
+    {
+      "epoch": 0.29397217928902625,
+      "grad_norm": 0.11178798228502274,
+      "learning_rate": 0.00014164341085271318,
+      "loss": 0.6638,
+      "step": 951
+    },
+    {
+      "epoch": 0.29428129829984545,
+      "grad_norm": 0.11883097887039185,
+      "learning_rate": 0.00014158139534883722,
+      "loss": 0.7909,
+      "step": 952
+    },
+    {
+      "epoch": 0.2945904173106646,
+      "grad_norm": 0.1324370801448822,
+      "learning_rate": 0.00014151937984496126,
+      "loss": 0.717,
+      "step": 953
+    },
+    {
+      "epoch": 0.29489953632148375,
+      "grad_norm": 0.1309555619955063,
+      "learning_rate": 0.00014145736434108527,
+      "loss": 0.7538,
+      "step": 954
+    },
+    {
+      "epoch": 0.29520865533230295,
+      "grad_norm": 0.12644729018211365,
+      "learning_rate": 0.0001413953488372093,
+      "loss": 0.6985,
+      "step": 955
+    },
+    {
+      "epoch": 0.2955177743431221,
+      "grad_norm": 0.10941684246063232,
+      "learning_rate": 0.00014133333333333334,
+      "loss": 0.8046,
+      "step": 956
+    },
+    {
+      "epoch": 0.29582689335394124,
+      "grad_norm": 0.1376543492078781,
+      "learning_rate": 0.00014127131782945738,
+      "loss": 0.7915,
+      "step": 957
+    },
+    {
+      "epoch": 0.29613601236476045,
+      "grad_norm": 0.14741478860378265,
+      "learning_rate": 0.0001412093023255814,
+      "loss": 0.9085,
+      "step": 958
+    },
+    {
+      "epoch": 0.2964451313755796,
+      "grad_norm": 0.12666583061218262,
+      "learning_rate": 0.00014114728682170543,
+      "loss": 0.793,
+      "step": 959
+    },
+    {
+      "epoch": 0.29675425038639874,
+      "grad_norm": 0.12379190325737,
+      "learning_rate": 0.00014108527131782947,
+      "loss": 0.8256,
+      "step": 960
+    },
+    {
+      "epoch": 0.29706336939721795,
+      "grad_norm": 0.11084531992673874,
+      "learning_rate": 0.0001410232558139535,
+      "loss": 0.796,
+      "step": 961
+    },
+    {
+      "epoch": 0.2973724884080371,
+      "grad_norm": 0.12731553614139557,
+      "learning_rate": 0.00014096124031007752,
+      "loss": 0.68,
+      "step": 962
+    },
+    {
+      "epoch": 0.29768160741885624,
+      "grad_norm": 0.1280289590358734,
+      "learning_rate": 0.00014089922480620153,
+      "loss": 0.8189,
+      "step": 963
+    },
+    {
+      "epoch": 0.29799072642967545,
+      "grad_norm": 0.12680752575397491,
+      "learning_rate": 0.00014083720930232557,
+      "loss": 0.8237,
+      "step": 964
+    },
+    {
+      "epoch": 0.2982998454404946,
+      "grad_norm": 0.13440905511379242,
+      "learning_rate": 0.0001407751937984496,
+      "loss": 0.7518,
+      "step": 965
+    },
+    {
+      "epoch": 0.29860896445131374,
+      "grad_norm": 0.112543486058712,
+      "learning_rate": 0.00014071317829457365,
+      "loss": 0.7725,
+      "step": 966
+    },
+    {
+      "epoch": 0.29891808346213294,
+      "grad_norm": 0.126234233379364,
+      "learning_rate": 0.0001406511627906977,
+      "loss": 0.6715,
+      "step": 967
+    },
+    {
+      "epoch": 0.2992272024729521,
+      "grad_norm": 0.13545869290828705,
+      "learning_rate": 0.0001405891472868217,
+      "loss": 0.7503,
+      "step": 968
+    },
+    {
+      "epoch": 0.29953632148377124,
+      "grad_norm": 0.12928856909275055,
+      "learning_rate": 0.00014052713178294574,
+      "loss": 0.7488,
+      "step": 969
+    },
+    {
+      "epoch": 0.29984544049459044,
+      "grad_norm": 0.1343362033367157,
+      "learning_rate": 0.00014046511627906978,
+      "loss": 0.7984,
+      "step": 970
+    },
+    {
+      "epoch": 0.3001545595054096,
+      "grad_norm": 0.13031892478466034,
+      "learning_rate": 0.00014040310077519382,
+      "loss": 0.7409,
+      "step": 971
+    },
+    {
+      "epoch": 0.30046367851622874,
+      "grad_norm": 0.12235540896654129,
+      "learning_rate": 0.00014034108527131783,
+      "loss": 0.8135,
+      "step": 972
+    },
+    {
+      "epoch": 0.3007727975270479,
+      "grad_norm": 0.1327418088912964,
+      "learning_rate": 0.00014027906976744187,
+      "loss": 0.8359,
+      "step": 973
+    },
+    {
+      "epoch": 0.3010819165378671,
+      "grad_norm": 0.12208300828933716,
+      "learning_rate": 0.0001402170542635659,
+      "loss": 0.8496,
+      "step": 974
+    },
+    {
+      "epoch": 0.30139103554868624,
+      "grad_norm": 0.11996152997016907,
+      "learning_rate": 0.00014015503875968995,
+      "loss": 0.7629,
+      "step": 975
+    },
+    {
+      "epoch": 0.3017001545595054,
+      "grad_norm": 0.12489623576402664,
+      "learning_rate": 0.00014009302325581398,
+      "loss": 0.7716,
+      "step": 976
+    },
+    {
+      "epoch": 0.3020092735703246,
+      "grad_norm": 0.11581925302743912,
+      "learning_rate": 0.00014003100775193797,
+      "loss": 0.881,
+      "step": 977
+    },
+    {
+      "epoch": 0.30231839258114374,
+      "grad_norm": 0.12472864985466003,
+      "learning_rate": 0.000139968992248062,
+      "loss": 0.7534,
+      "step": 978
+    },
+    {
+      "epoch": 0.3026275115919629,
+      "grad_norm": 0.11038485169410706,
+      "learning_rate": 0.00013990697674418605,
+      "loss": 0.6802,
+      "step": 979
+    },
+    {
+      "epoch": 0.3029366306027821,
+      "grad_norm": 0.12170151621103287,
+      "learning_rate": 0.00013984496124031008,
+      "loss": 0.7849,
+      "step": 980
+    },
+    {
+      "epoch": 0.30324574961360123,
+      "grad_norm": 0.12583118677139282,
+      "learning_rate": 0.00013978294573643412,
+      "loss": 0.6808,
+      "step": 981
+    },
+    {
+      "epoch": 0.3035548686244204,
+      "grad_norm": 0.12267141789197922,
+      "learning_rate": 0.00013972093023255813,
+      "loss": 0.8894,
+      "step": 982
+    },
+    {
+      "epoch": 0.3038639876352396,
+      "grad_norm": 0.12336152046918869,
+      "learning_rate": 0.00013965891472868217,
+      "loss": 0.7588,
+      "step": 983
+    },
+    {
+      "epoch": 0.30417310664605873,
+      "grad_norm": 0.13550814986228943,
+      "learning_rate": 0.0001395968992248062,
+      "loss": 0.7588,
+      "step": 984
+    },
+    {
+      "epoch": 0.3044822256568779,
+      "grad_norm": 0.12295803427696228,
+      "learning_rate": 0.00013953488372093025,
+      "loss": 0.8387,
+      "step": 985
+    },
+    {
+      "epoch": 0.3047913446676971,
+      "grad_norm": 0.12663382291793823,
+      "learning_rate": 0.00013947286821705426,
+      "loss": 0.7513,
+      "step": 986
+    },
+    {
+      "epoch": 0.30510046367851623,
+      "grad_norm": 0.1203293651342392,
+      "learning_rate": 0.0001394108527131783,
+      "loss": 0.8078,
+      "step": 987
+    },
+    {
+      "epoch": 0.3054095826893354,
+      "grad_norm": 0.13784480094909668,
+      "learning_rate": 0.00013934883720930234,
+      "loss": 0.8303,
+      "step": 988
+    },
+    {
+      "epoch": 0.3057187017001546,
+      "grad_norm": 0.13811154663562775,
+      "learning_rate": 0.00013928682170542638,
+      "loss": 0.7439,
+      "step": 989
+    },
+    {
+      "epoch": 0.30602782071097373,
+      "grad_norm": 0.1402239203453064,
+      "learning_rate": 0.0001392248062015504,
+      "loss": 0.835,
+      "step": 990
+    },
+    {
+      "epoch": 0.3063369397217929,
+      "grad_norm": 0.1344003528356552,
+      "learning_rate": 0.00013916279069767443,
+      "loss": 0.7973,
+      "step": 991
+    },
+    {
+      "epoch": 0.3066460587326121,
+      "grad_norm": 0.10925968736410141,
+      "learning_rate": 0.00013910077519379844,
+      "loss": 0.7921,
+      "step": 992
+    },
+    {
+      "epoch": 0.30695517774343123,
+      "grad_norm": 0.12327813357114792,
+      "learning_rate": 0.00013903875968992248,
+      "loss": 0.8377,
+      "step": 993
+    },
+    {
+      "epoch": 0.3072642967542504,
+      "grad_norm": 0.12558946013450623,
+      "learning_rate": 0.00013897674418604652,
+      "loss": 0.6776,
+      "step": 994
+    },
+    {
+      "epoch": 0.3075734157650695,
+      "grad_norm": 0.1224449872970581,
+      "learning_rate": 0.00013891472868217056,
+      "loss": 0.7526,
+      "step": 995
+    },
+    {
+      "epoch": 0.3078825347758887,
+      "grad_norm": 0.11907488107681274,
+      "learning_rate": 0.00013885271317829457,
+      "loss": 0.752,
+      "step": 996
+    },
+    {
+      "epoch": 0.3081916537867079,
+      "grad_norm": 0.12344703823328018,
+      "learning_rate": 0.0001387906976744186,
+      "loss": 0.7244,
+      "step": 997
+    },
+    {
+      "epoch": 0.308500772797527,
+      "grad_norm": 0.10863327980041504,
+      "learning_rate": 0.00013872868217054265,
+      "loss": 0.7937,
+      "step": 998
+    },
+    {
+      "epoch": 0.3088098918083462,
+      "grad_norm": 0.11824218183755875,
+      "learning_rate": 0.00013866666666666669,
+      "loss": 0.8872,
+      "step": 999
+    },
+    {
+      "epoch": 0.3091190108191654,
+      "grad_norm": 0.11574976146221161,
+      "learning_rate": 0.00013860465116279072,
+      "loss": 0.7953,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 3235,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "total_flos": 1.2994588591340913e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}