diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7925 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 563148,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0026635982015384943,
+      "grad_norm": 0.44449880719184875,
+      "learning_rate": 0.0001996,
+      "loss": 9.0366,
+      "step": 500
+    },
+    {
+      "epoch": 0.005327196403076989,
+      "grad_norm": 0.18657611310482025,
+      "learning_rate": 0.0003996,
+      "loss": 7.4314,
+      "step": 1000
+    },
+    {
+      "epoch": 0.007990794604615483,
+      "grad_norm": 0.4415804147720337,
+      "learning_rate": 0.0005996,
+      "loss": 7.3209,
+      "step": 1500
+    },
+    {
+      "epoch": 0.010654392806153977,
+      "grad_norm": 0.5994516611099243,
+      "learning_rate": 0.0007996,
+      "loss": 7.0489,
+      "step": 2000
+    },
+    {
+      "epoch": 0.013317991007692471,
+      "grad_norm": 0.4792259931564331,
+      "learning_rate": 0.0009996,
+      "loss": 6.8814,
+      "step": 2500
+    },
+    {
+      "epoch": 0.015981589209230967,
+      "grad_norm": 0.517628014087677,
+      "learning_rate": 0.0009991099584766199,
+      "loss": 6.7583,
+      "step": 3000
+    },
+    {
+      "epoch": 0.01864518741076946,
+      "grad_norm": 0.6621844172477722,
+      "learning_rate": 0.0009982181333028923,
+      "loss": 6.6864,
+      "step": 3500
+    },
+    {
+      "epoch": 0.021308785612307955,
+      "grad_norm": 0.9153323173522949,
+      "learning_rate": 0.0009973263081291647,
+      "loss": 6.6285,
+      "step": 4000
+    },
+    {
+      "epoch": 0.02397238381384645,
+      "grad_norm": 0.6160380840301514,
+      "learning_rate": 0.0009964362666057848,
+      "loss": 6.5925,
+      "step": 4500
+    },
+    {
+      "epoch": 0.026635982015384942,
+      "grad_norm": 0.952617883682251,
+      "learning_rate": 0.0009955444414320573,
+      "loss": 6.5313,
+      "step": 5000
+    },
+    {
+      "epoch": 0.029299580216923436,
+      "grad_norm": 0.552077054977417,
+      "learning_rate": 0.0009946526162583297,
+      "loss": 6.4612,
+      "step": 5500
+    },
+    {
+      "epoch": 0.031963178418461934,
+      "grad_norm": 0.7289395332336426,
+      "learning_rate": 0.0009937607910846021,
+      "loss": 6.4108,
+      "step": 6000
+    },
+    {
+      "epoch": 0.034626776620000424,
+      "grad_norm": 0.6313225626945496,
+      "learning_rate": 0.0009928689659108746,
+      "loss": 6.3711,
+      "step": 6500
+    },
+    {
+      "epoch": 0.03729037482153892,
+      "grad_norm": 0.6914771795272827,
+      "learning_rate": 0.000991977140737147,
+      "loss": 6.3561,
+      "step": 7000
+    },
+    {
+      "epoch": 0.03995397302307741,
+      "grad_norm": 0.5898305773735046,
+      "learning_rate": 0.0009910870992137668,
+      "loss": 6.3347,
+      "step": 7500
+    },
+    {
+      "epoch": 0.04261757122461591,
+      "grad_norm": 0.7071697115898132,
+      "learning_rate": 0.0009901952740400395,
+      "loss": 6.3035,
+      "step": 8000
+    },
+    {
+      "epoch": 0.045281169426154406,
+      "grad_norm": 0.6536933183670044,
+      "learning_rate": 0.000989303448866312,
+      "loss": 6.2981,
+      "step": 8500
+    },
+    {
+      "epoch": 0.0479447676276929,
+      "grad_norm": 0.5664694309234619,
+      "learning_rate": 0.0009884116236925844,
+      "loss": 6.2829,
+      "step": 9000
+    },
+    {
+      "epoch": 0.050608365829231394,
+      "grad_norm": 0.6218783259391785,
+      "learning_rate": 0.0009875197985188568,
+      "loss": 6.2588,
+      "step": 9500
+    },
+    {
+      "epoch": 0.053271964030769885,
+      "grad_norm": 0.753494918346405,
+      "learning_rate": 0.0009866279733451292,
+      "loss": 6.2158,
+      "step": 10000
+    },
+    {
+      "epoch": 0.05593556223230838,
+      "grad_norm": 0.7004017233848572,
+      "learning_rate": 0.000985737931821749,
+      "loss": 6.1671,
+      "step": 10500
+    },
+    {
+      "epoch": 0.05859916043384687,
+      "grad_norm": 0.823192298412323,
+      "learning_rate": 0.0009848461066480215,
+      "loss": 6.1248,
+      "step": 11000
+    },
+    {
+      "epoch": 0.06126275863538537,
+      "grad_norm": 0.9359510540962219,
+      "learning_rate": 0.000983954281474294,
+      "loss": 6.0659,
+      "step": 11500
+    },
+    {
+      "epoch": 0.06392635683692387,
+      "grad_norm": 1.012602686882019,
+      "learning_rate": 0.0009830624563005664,
+      "loss": 6.0177,
+      "step": 12000
+    },
+    {
+      "epoch": 0.06658995503846236,
+      "grad_norm": 1.175893783569336,
+      "learning_rate": 0.0009821724147771865,
+      "loss": 5.9664,
+      "step": 12500
+    },
+    {
+      "epoch": 0.06925355324000085,
+      "grad_norm": 1.2990820407867432,
+      "learning_rate": 0.000981280589603459,
+      "loss": 5.9002,
+      "step": 13000
+    },
+    {
+      "epoch": 0.07191715144153935,
+      "grad_norm": 1.5142649412155151,
+      "learning_rate": 0.0009803887644297313,
+      "loss": 5.8395,
+      "step": 13500
+    },
+    {
+      "epoch": 0.07458074964307784,
+      "grad_norm": 1.1063514947891235,
+      "learning_rate": 0.0009794969392560038,
+      "loss": 5.7873,
+      "step": 14000
+    },
+    {
+      "epoch": 0.07724434784461634,
+      "grad_norm": 1.1882684230804443,
+      "learning_rate": 0.0009786068977326236,
+      "loss": 5.7462,
+      "step": 14500
+    },
+    {
+      "epoch": 0.07990794604615482,
+      "grad_norm": 1.1717172861099243,
+      "learning_rate": 0.000977715072558896,
+      "loss": 5.7139,
+      "step": 15000
+    },
+    {
+      "epoch": 0.08257154424769332,
+      "grad_norm": 1.0602678060531616,
+      "learning_rate": 0.0009768232473851685,
+      "loss": 5.6954,
+      "step": 15500
+    },
+    {
+      "epoch": 0.08523514244923182,
+      "grad_norm": 1.3342951536178589,
+      "learning_rate": 0.000975931422211441,
+      "loss": 5.6708,
+      "step": 16000
+    },
+    {
+      "epoch": 0.08789874065077032,
+      "grad_norm": 1.1896706819534302,
+      "learning_rate": 0.0009750395970377135,
+      "loss": 5.6496,
+      "step": 16500
+    },
+    {
+      "epoch": 0.09056233885230881,
+      "grad_norm": 1.2416741847991943,
+      "learning_rate": 0.0009741495555143335,
+      "loss": 5.6257,
+      "step": 17000
+    },
+    {
+      "epoch": 0.0932259370538473,
+      "grad_norm": 1.1610051393508911,
+      "learning_rate": 0.000973257730340606,
+      "loss": 5.6045,
+      "step": 17500
+    },
+    {
+      "epoch": 0.0958895352553858,
+      "grad_norm": 1.377124309539795,
+      "learning_rate": 0.0009723659051668784,
+      "loss": 5.5915,
+      "step": 18000
+    },
+    {
+      "epoch": 0.09855313345692429,
+      "grad_norm": 1.2425626516342163,
+      "learning_rate": 0.0009714740799931508,
+      "loss": 5.5665,
+      "step": 18500
+    },
+    {
+      "epoch": 0.10121673165846279,
+      "grad_norm": 1.3102902173995972,
+      "learning_rate": 0.0009705840384697707,
+      "loss": 5.5016,
+      "step": 19000
+    },
+    {
+      "epoch": 0.10388032986000127,
+      "grad_norm": 1.1269280910491943,
+      "learning_rate": 0.0009696922132960431,
+      "loss": 5.3915,
+      "step": 19500
+    },
+    {
+      "epoch": 0.10654392806153977,
+      "grad_norm": 1.0961062908172607,
+      "learning_rate": 0.0009688003881223157,
+      "loss": 5.3182,
+      "step": 20000
+    },
+    {
+      "epoch": 0.10920752626307827,
+      "grad_norm": 1.030776023864746,
+      "learning_rate": 0.0009679085629485881,
+      "loss": 5.2717,
+      "step": 20500
+    },
+    {
+      "epoch": 0.11187112446461676,
+      "grad_norm": 1.1483319997787476,
+      "learning_rate": 0.000967018521425208,
+      "loss": 5.2376,
+      "step": 21000
+    },
+    {
+      "epoch": 0.11453472266615526,
+      "grad_norm": 1.011655330657959,
+      "learning_rate": 0.0009661266962514804,
+      "loss": 5.2126,
+      "step": 21500
+    },
+    {
+      "epoch": 0.11719832086769374,
+      "grad_norm": 1.0027350187301636,
+      "learning_rate": 0.0009652348710777528,
+      "loss": 5.1876,
+      "step": 22000
+    },
+    {
+      "epoch": 0.11986191906923224,
+      "grad_norm": 0.9846087694168091,
+      "learning_rate": 0.0009643430459040254,
+      "loss": 5.1627,
+      "step": 22500
+    },
+    {
+      "epoch": 0.12252551727077074,
+      "grad_norm": 1.0151575803756714,
+      "learning_rate": 0.0009634512207302978,
+      "loss": 5.1478,
+      "step": 23000
+    },
+    {
+      "epoch": 0.12518911547230924,
+      "grad_norm": 0.9792256355285645,
+      "learning_rate": 0.0009625611792069178,
+      "loss": 5.124,
+      "step": 23500
+    },
+    {
+      "epoch": 0.12785271367384773,
+      "grad_norm": 0.9928046464920044,
+      "learning_rate": 0.0009616693540331902,
+      "loss": 5.1147,
+      "step": 24000
+    },
+    {
+      "epoch": 0.13051631187538623,
+      "grad_norm": 1.0437482595443726,
+      "learning_rate": 0.0009607775288594626,
+      "loss": 5.0957,
+      "step": 24500
+    },
+    {
+      "epoch": 0.13317991007692473,
+      "grad_norm": 0.9204614162445068,
+      "learning_rate": 0.0009598857036857352,
+      "loss": 5.0938,
+      "step": 25000
+    },
+    {
+      "epoch": 0.1358435082784632,
+      "grad_norm": 0.9496144652366638,
+      "learning_rate": 0.0009589938785120076,
+      "loss": 5.0705,
+      "step": 25500
+    },
+    {
+      "epoch": 0.1385071064800017,
+      "grad_norm": 0.8606376647949219,
+      "learning_rate": 0.00095810205333828,
+      "loss": 5.0659,
+      "step": 26000
+    },
+    {
+      "epoch": 0.1411707046815402,
+      "grad_norm": 0.8681563138961792,
+      "learning_rate": 0.0009572120118148999,
+      "loss": 5.0512,
+      "step": 26500
+    },
+    {
+      "epoch": 0.1438343028830787,
+      "grad_norm": 0.9975363612174988,
+      "learning_rate": 0.0009563201866411723,
+      "loss": 5.0447,
+      "step": 27000
+    },
+    {
+      "epoch": 0.1464979010846172,
+      "grad_norm": 1.0180999040603638,
+      "learning_rate": 0.0009554283614674449,
+      "loss": 5.025,
+      "step": 27500
+    },
+    {
+      "epoch": 0.14916149928615569,
+      "grad_norm": 0.8448518514633179,
+      "learning_rate": 0.0009545365362937173,
+      "loss": 5.0264,
+      "step": 28000
+    },
+    {
+      "epoch": 0.15182509748769418,
+      "grad_norm": 0.8591077327728271,
+      "learning_rate": 0.0009536447111199897,
+      "loss": 5.0093,
+      "step": 28500
+    },
+    {
+      "epoch": 0.15448869568923268,
+      "grad_norm": 0.8553301095962524,
+      "learning_rate": 0.0009527528859462622,
+      "loss": 5.0058,
+      "step": 29000
+    },
+    {
+      "epoch": 0.15715229389077118,
+      "grad_norm": 0.8588173389434814,
+      "learning_rate": 0.0009518610607725346,
+      "loss": 4.9948,
+      "step": 29500
+    },
+    {
+      "epoch": 0.15981589209230965,
+      "grad_norm": 0.9031510949134827,
+      "learning_rate": 0.0009509692355988071,
+      "loss": 4.9913,
+      "step": 30000
+    },
+    {
+      "epoch": 0.16247949029384814,
+      "grad_norm": 0.8640721440315247,
+      "learning_rate": 0.000950079194075427,
+      "loss": 4.9867,
+      "step": 30500
+    },
+    {
+      "epoch": 0.16514308849538664,
+      "grad_norm": 0.8439059853553772,
+      "learning_rate": 0.0009491873689016994,
+      "loss": 4.9821,
+      "step": 31000
+    },
+    {
+      "epoch": 0.16780668669692514,
+      "grad_norm": 0.8463523983955383,
+      "learning_rate": 0.0009482955437279719,
+      "loss": 4.9663,
+      "step": 31500
+    },
+    {
+      "epoch": 0.17047028489846364,
+      "grad_norm": 0.8896917104721069,
+      "learning_rate": 0.0009474037185542443,
+      "loss": 4.9684,
+      "step": 32000
+    },
+    {
+      "epoch": 0.17313388310000213,
+      "grad_norm": 0.8256401419639587,
+      "learning_rate": 0.0009465136770308644,
+      "loss": 4.9629,
+      "step": 32500
+    },
+    {
+      "epoch": 0.17579748130154063,
+      "grad_norm": 0.9592456221580505,
+      "learning_rate": 0.0009456236355074842,
+      "loss": 4.9561,
+      "step": 33000
+    },
+    {
+      "epoch": 0.17846107950307913,
+      "grad_norm": 0.9278562068939209,
+      "learning_rate": 0.0009447318103337567,
+      "loss": 4.9461,
+      "step": 33500
+    },
+    {
+      "epoch": 0.18112467770461763,
+      "grad_norm": 0.8999398946762085,
+      "learning_rate": 0.0009438399851600291,
+      "loss": 4.9543,
+      "step": 34000
+    },
+    {
+      "epoch": 0.1837882759061561,
+      "grad_norm": 0.9635962843894958,
+      "learning_rate": 0.0009429481599863015,
+      "loss": 4.9381,
+      "step": 34500
+    },
+    {
+      "epoch": 0.1864518741076946,
+      "grad_norm": 0.9490432143211365,
+      "learning_rate": 0.0009420563348125741,
+      "loss": 4.9307,
+      "step": 35000
+    },
+    {
+      "epoch": 0.1891154723092331,
+      "grad_norm": 0.9016521573066711,
+      "learning_rate": 0.0009411645096388465,
+      "loss": 4.9327,
+      "step": 35500
+    },
+    {
+      "epoch": 0.1917790705107716,
+      "grad_norm": 0.8551514744758606,
+      "learning_rate": 0.0009402726844651189,
+      "loss": 4.9249,
+      "step": 36000
+    },
+    {
+      "epoch": 0.19444266871231008,
+      "grad_norm": 0.8358152508735657,
+      "learning_rate": 0.0009393808592913914,
+      "loss": 4.9229,
+      "step": 36500
+    },
+    {
+      "epoch": 0.19710626691384858,
+      "grad_norm": 0.8498304486274719,
+      "learning_rate": 0.0009384908177680113,
+      "loss": 4.9215,
+      "step": 37000
+    },
+    {
+      "epoch": 0.19976986511538708,
+      "grad_norm": 0.8565486073493958,
+      "learning_rate": 0.0009375989925942838,
+      "loss": 4.9184,
+      "step": 37500
+    },
+    {
+      "epoch": 0.20243346331692558,
+      "grad_norm": 0.8608818650245667,
+      "learning_rate": 0.0009367071674205563,
+      "loss": 4.9151,
+      "step": 38000
+    },
+    {
+      "epoch": 0.20509706151846407,
+      "grad_norm": 0.9130340218544006,
+      "learning_rate": 0.0009358153422468287,
+      "loss": 4.917,
+      "step": 38500
+    },
+    {
+      "epoch": 0.20776065972000254,
+      "grad_norm": 0.9141052961349487,
+      "learning_rate": 0.0009349253007234486,
+      "loss": 4.9087,
+      "step": 39000
+    },
+    {
+      "epoch": 0.21042425792154104,
+      "grad_norm": 0.828787624835968,
+      "learning_rate": 0.000934033475549721,
+      "loss": 4.9072,
+      "step": 39500
+    },
+    {
+      "epoch": 0.21308785612307954,
+      "grad_norm": 0.8560599088668823,
+      "learning_rate": 0.0009331416503759935,
+      "loss": 4.8994,
+      "step": 40000
+    },
+    {
+      "epoch": 0.21575145432461804,
+      "grad_norm": 0.8474921584129333,
+      "learning_rate": 0.000932249825202266,
+      "loss": 4.9011,
+      "step": 40500
+    },
+    {
+      "epoch": 0.21841505252615653,
+      "grad_norm": 0.7966994047164917,
+      "learning_rate": 0.0009313597836788859,
+      "loss": 4.8958,
+      "step": 41000
+    },
+    {
+      "epoch": 0.22107865072769503,
+      "grad_norm": 0.8658061623573303,
+      "learning_rate": 0.0009304679585051583,
+      "loss": 4.8971,
+      "step": 41500
+    },
+    {
+      "epoch": 0.22374224892923353,
+      "grad_norm": 0.8644976019859314,
+      "learning_rate": 0.0009295761333314307,
+      "loss": 4.8881,
+      "step": 42000
+    },
+    {
+      "epoch": 0.22640584713077203,
+      "grad_norm": 0.8099656105041504,
+      "learning_rate": 0.0009286843081577032,
+      "loss": 4.8828,
+      "step": 42500
+    },
+    {
+      "epoch": 0.22906944533231052,
+      "grad_norm": 0.8898533582687378,
+      "learning_rate": 0.0009277942666343233,
+      "loss": 4.8901,
+      "step": 43000
+    },
+    {
+      "epoch": 0.231733043533849,
+      "grad_norm": 0.8771415948867798,
+      "learning_rate": 0.0009269024414605957,
+      "loss": 4.8766,
+      "step": 43500
+    },
+    {
+      "epoch": 0.2343966417353875,
+      "grad_norm": 0.8513174653053284,
+      "learning_rate": 0.0009260106162868681,
+      "loss": 4.8767,
+      "step": 44000
+    },
+    {
+      "epoch": 0.237060239936926,
+      "grad_norm": 0.8937121629714966,
+      "learning_rate": 0.0009251187911131405,
+      "loss": 4.8754,
+      "step": 44500
+    },
+    {
+      "epoch": 0.23972383813846448,
+      "grad_norm": 0.981132447719574,
+      "learning_rate": 0.0009242287495897604,
+      "loss": 4.8788,
+      "step": 45000
+    },
+    {
+      "epoch": 0.24238743634000298,
+      "grad_norm": 0.784052312374115,
+      "learning_rate": 0.000923336924416033,
+      "loss": 4.8671,
+      "step": 45500
+    },
+    {
+      "epoch": 0.24505103454154148,
+      "grad_norm": 0.8733552694320679,
+      "learning_rate": 0.0009224450992423054,
+      "loss": 4.8681,
+      "step": 46000
+    },
+    {
+      "epoch": 0.24771463274307998,
+      "grad_norm": 0.8183045983314514,
+      "learning_rate": 0.0009215532740685778,
+      "loss": 4.8696,
+      "step": 46500
+    },
+    {
+      "epoch": 0.2503782309446185,
+      "grad_norm": 0.8594405651092529,
+      "learning_rate": 0.0009206632325451977,
+      "loss": 4.8637,
+      "step": 47000
+    },
+    {
+      "epoch": 0.25304182914615697,
+      "grad_norm": 0.8543962240219116,
+      "learning_rate": 0.0009197714073714701,
+      "loss": 4.864,
+      "step": 47500
+    },
+    {
+      "epoch": 0.25570542734769547,
+      "grad_norm": 0.7812336683273315,
+      "learning_rate": 0.0009188795821977425,
+      "loss": 4.8583,
+      "step": 48000
+    },
+    {
+      "epoch": 0.25836902554923397,
+      "grad_norm": 0.8478542566299438,
+      "learning_rate": 0.0009179877570240151,
+      "loss": 4.8507,
+      "step": 48500
+    },
+    {
+      "epoch": 0.26103262375077246,
+      "grad_norm": 0.8426432013511658,
+      "learning_rate": 0.0009170959318502875,
+      "loss": 4.8512,
+      "step": 49000
+    },
+    {
+      "epoch": 0.26369622195231096,
+      "grad_norm": 0.8853486180305481,
+      "learning_rate": 0.0009162058903269075,
+      "loss": 4.8563,
+      "step": 49500
+    },
+    {
+      "epoch": 0.26635982015384946,
+      "grad_norm": 0.883021891117096,
+      "learning_rate": 0.0009153140651531799,
+      "loss": 4.8507,
+      "step": 50000
+    },
+    {
+      "epoch": 0.2690234183553879,
+      "grad_norm": 0.8407544493675232,
+      "learning_rate": 0.0009144222399794523,
+      "loss": 4.8554,
+      "step": 50500
+    },
+    {
+      "epoch": 0.2716870165569264,
+      "grad_norm": 0.8120921850204468,
+      "learning_rate": 0.0009135304148057249,
+      "loss": 4.8485,
+      "step": 51000
+    },
+    {
+      "epoch": 0.2743506147584649,
+      "grad_norm": 0.8241139054298401,
+      "learning_rate": 0.0009126403732823447,
+      "loss": 4.8508,
+      "step": 51500
+    },
+    {
+      "epoch": 0.2770142129600034,
+      "grad_norm": 0.7940220236778259,
+      "learning_rate": 0.0009117485481086172,
+      "loss": 4.8465,
+      "step": 52000
+    },
+    {
+      "epoch": 0.2796778111615419,
+      "grad_norm": 0.7913591265678406,
+      "learning_rate": 0.0009108567229348896,
+      "loss": 4.8467,
+      "step": 52500
+    },
+    {
+      "epoch": 0.2823414093630804,
+      "grad_norm": 0.7899219393730164,
+      "learning_rate": 0.000909964897761162,
+      "loss": 4.8379,
+      "step": 53000
+    },
+    {
+      "epoch": 0.2850050075646189,
+      "grad_norm": 0.7952625751495361,
+      "learning_rate": 0.000909074856237782,
+      "loss": 4.847,
+      "step": 53500
+    },
+    {
+      "epoch": 0.2876686057661574,
+      "grad_norm": 0.8424190878868103,
+      "learning_rate": 0.0009081830310640544,
+      "loss": 4.8368,
+      "step": 54000
+    },
+    {
+      "epoch": 0.2903322039676959,
+      "grad_norm": 0.8853405714035034,
+      "learning_rate": 0.0009072912058903269,
+      "loss": 4.8442,
+      "step": 54500
+    },
+    {
+      "epoch": 0.2929958021692344,
+      "grad_norm": 0.8321651220321655,
+      "learning_rate": 0.0009063993807165993,
+      "loss": 4.8369,
+      "step": 55000
+    },
+    {
+      "epoch": 0.2956594003707729,
+      "grad_norm": 0.7945202589035034,
+      "learning_rate": 0.0009055093391932193,
+      "loss": 4.8276,
+      "step": 55500
+    },
+    {
+      "epoch": 0.29832299857231137,
+      "grad_norm": 0.8524190187454224,
+      "learning_rate": 0.0009046175140194918,
+      "loss": 4.8284,
+      "step": 56000
+    },
+    {
+      "epoch": 0.30098659677384987,
+      "grad_norm": 0.7767360210418701,
+      "learning_rate": 0.0009037256888457643,
+      "loss": 4.8282,
+      "step": 56500
+    },
+    {
+      "epoch": 0.30365019497538837,
+      "grad_norm": 0.7614521980285645,
+      "learning_rate": 0.0009028338636720367,
+      "loss": 4.8309,
+      "step": 57000
+    },
+    {
+      "epoch": 0.30631379317692686,
+      "grad_norm": 0.8267444372177124,
+      "learning_rate": 0.0009019438221486565,
+      "loss": 4.8297,
+      "step": 57500
+    },
+    {
+      "epoch": 0.30897739137846536,
+      "grad_norm": 0.8024168014526367,
+      "learning_rate": 0.000901051996974929,
+      "loss": 4.828,
+      "step": 58000
+    },
+    {
+      "epoch": 0.31164098958000386,
+      "grad_norm": 0.8234706521034241,
+      "learning_rate": 0.0009001601718012014,
+      "loss": 4.8239,
+      "step": 58500
+    },
+    {
+      "epoch": 0.31430458778154235,
+      "grad_norm": 0.7709868550300598,
+      "learning_rate": 0.000899268346627474,
+      "loss": 4.8287,
+      "step": 59000
+    },
+    {
+      "epoch": 0.3169681859830808,
+      "grad_norm": 0.8301928043365479,
+      "learning_rate": 0.0008983783051040939,
+      "loss": 4.8283,
+      "step": 59500
+    },
+    {
+      "epoch": 0.3196317841846193,
+      "grad_norm": 0.8093799948692322,
+      "learning_rate": 0.0008974864799303664,
+      "loss": 4.82,
+      "step": 60000
+    },
+    {
+      "epoch": 0.3222953823861578,
+      "grad_norm": 0.866875410079956,
+      "learning_rate": 0.0008965946547566388,
+      "loss": 4.8259,
+      "step": 60500
+    },
+    {
+      "epoch": 0.3249589805876963,
+      "grad_norm": 0.9615957140922546,
+      "learning_rate": 0.0008957028295829112,
+      "loss": 4.8148,
+      "step": 61000
+    },
+    {
+      "epoch": 0.3276225787892348,
+      "grad_norm": 0.8391242623329163,
+      "learning_rate": 0.0008948127880595312,
+      "loss": 4.8227,
+      "step": 61500
+    },
+    {
+      "epoch": 0.3302861769907733,
+      "grad_norm": 0.8788413405418396,
+      "learning_rate": 0.0008939209628858036,
+      "loss": 4.8087,
+      "step": 62000
+    },
+    {
+      "epoch": 0.3329497751923118,
+      "grad_norm": 0.8100627660751343,
+      "learning_rate": 0.000893029137712076,
+      "loss": 4.8097,
+      "step": 62500
+    },
+    {
+      "epoch": 0.3356133733938503,
+      "grad_norm": 0.8943531513214111,
+      "learning_rate": 0.0008921373125383485,
+      "loss": 4.819,
+      "step": 63000
+    },
+    {
+      "epoch": 0.3382769715953888,
+      "grad_norm": 0.8895285725593567,
+      "learning_rate": 0.0008912472710149683,
+      "loss": 4.8069,
+      "step": 63500
+    },
+    {
+      "epoch": 0.3409405697969273,
+      "grad_norm": 0.777802586555481,
+      "learning_rate": 0.0008903554458412409,
+      "loss": 4.8116,
+      "step": 64000
+    },
+    {
+      "epoch": 0.34360416799846577,
+      "grad_norm": 1.0155904293060303,
+      "learning_rate": 0.0008894636206675133,
+      "loss": 4.8112,
+      "step": 64500
+    },
+    {
+      "epoch": 0.34626776620000427,
+      "grad_norm": 0.7913381457328796,
+      "learning_rate": 0.0008885717954937858,
+      "loss": 4.8138,
+      "step": 65000
+    },
+    {
+      "epoch": 0.34893136440154277,
+      "grad_norm": 0.8168381452560425,
+      "learning_rate": 0.0008876799703200582,
+      "loss": 4.8065,
+      "step": 65500
+    },
+    {
+      "epoch": 0.35159496260308126,
+      "grad_norm": 0.8038260340690613,
+      "learning_rate": 0.0008867899287966782,
+      "loss": 4.8006,
+      "step": 66000
+    },
+    {
+      "epoch": 0.35425856080461976,
+      "grad_norm": 0.781873881816864,
+      "learning_rate": 0.0008858981036229507,
+      "loss": 4.809,
+      "step": 66500
+    },
+    {
+      "epoch": 0.35692215900615826,
+      "grad_norm": 0.9184179306030273,
+      "learning_rate": 0.0008850062784492231,
+      "loss": 4.8035,
+      "step": 67000
+    },
+    {
+      "epoch": 0.35958575720769675,
+      "grad_norm": 0.7746654748916626,
+      "learning_rate": 0.0008841144532754956,
+      "loss": 4.8099,
+      "step": 67500
+    },
+    {
+      "epoch": 0.36224935540923525,
+      "grad_norm": 0.8979808688163757,
+      "learning_rate": 0.0008832244117521154,
+      "loss": 4.7976,
+      "step": 68000
+    },
+    {
+      "epoch": 0.36491295361077375,
+      "grad_norm": 0.8198953866958618,
+      "learning_rate": 0.0008823325865783879,
+      "loss": 4.8001,
+      "step": 68500
+    },
+    {
+      "epoch": 0.3675765518123122,
+      "grad_norm": 0.8266115784645081,
+      "learning_rate": 0.0008814407614046604,
+      "loss": 4.8043,
+      "step": 69000
+    },
+    {
+      "epoch": 0.3702401500138507,
+      "grad_norm": 0.8325560688972473,
+      "learning_rate": 0.0008805489362309328,
+      "loss": 4.7989,
+      "step": 69500
+    },
+    {
+      "epoch": 0.3729037482153892,
+      "grad_norm": 0.7824032306671143,
+      "learning_rate": 0.0008796588947075527,
+      "loss": 4.7945,
+      "step": 70000
+    },
+    {
+      "epoch": 0.3755673464169277,
+      "grad_norm": 0.7960947155952454,
+      "learning_rate": 0.0008787670695338251,
+      "loss": 4.7975,
+      "step": 70500
+    },
+    {
+      "epoch": 0.3782309446184662,
+      "grad_norm": 0.8350421190261841,
+      "learning_rate": 0.0008778752443600976,
+      "loss": 4.7977,
+      "step": 71000
+    },
+    {
+      "epoch": 0.3808945428200047,
+      "grad_norm": 0.7750975489616394,
+      "learning_rate": 0.00087698341918637,
+      "loss": 4.7944,
+      "step": 71500
+    },
+    {
+      "epoch": 0.3835581410215432,
+      "grad_norm": 0.8240845799446106,
+      "learning_rate": 0.0008760933776629901,
+      "loss": 4.7908,
+      "step": 72000
+    },
+    {
+      "epoch": 0.3862217392230817,
+      "grad_norm": 0.8309052586555481,
+      "learning_rate": 0.0008752015524892625,
+      "loss": 4.8001,
+      "step": 72500
+    },
+    {
+      "epoch": 0.38888533742462017,
+      "grad_norm": 0.8170336484909058,
+      "learning_rate": 0.0008743097273155349,
+      "loss": 4.7964,
+      "step": 73000
+    },
+    {
+      "epoch": 0.39154893562615867,
+      "grad_norm": 0.8648092746734619,
+      "learning_rate": 0.0008734179021418074,
+      "loss": 4.7826,
+      "step": 73500
+    },
+    {
+      "epoch": 0.39421253382769716,
+      "grad_norm": 0.8146944046020508,
+      "learning_rate": 0.0008725278606184272,
+      "loss": 4.7906,
+      "step": 74000
+    },
+    {
+      "epoch": 0.39687613202923566,
+      "grad_norm": 0.792269229888916,
+      "learning_rate": 0.0008716360354446998,
+      "loss": 4.7852,
+      "step": 74500
+    },
+    {
+      "epoch": 0.39953973023077416,
+      "grad_norm": 0.7599817514419556,
+      "learning_rate": 0.0008707442102709722,
+      "loss": 4.7917,
+      "step": 75000
+    },
+    {
+      "epoch": 0.40220332843231266,
+      "grad_norm": 0.799649178981781,
+      "learning_rate": 0.0008698523850972446,
+      "loss": 4.7846,
+      "step": 75500
+    },
+    {
+      "epoch": 0.40486692663385115,
+      "grad_norm": 0.7801626324653625,
+      "learning_rate": 0.0008689623435738645,
+      "loss": 4.7879,
+      "step": 76000
+    },
+    {
+      "epoch": 0.40753052483538965,
+      "grad_norm": 0.8832575082778931,
+      "learning_rate": 0.0008680705184001369,
+      "loss": 4.7881,
+      "step": 76500
+    },
+    {
+      "epoch": 0.41019412303692815,
+      "grad_norm": 0.848629355430603,
+      "learning_rate": 0.0008671786932264095,
+      "loss": 4.7933,
+      "step": 77000
+    },
+    {
+      "epoch": 0.41285772123846665,
+      "grad_norm": 0.8427609205245972,
+      "learning_rate": 0.0008662868680526819,
+      "loss": 4.7912,
+      "step": 77500
+    },
+    {
+      "epoch": 0.4155213194400051,
+      "grad_norm": 0.767152726650238,
+      "learning_rate": 0.0008653968265293019,
+      "loss": 4.7904,
+      "step": 78000
+    },
+    {
+      "epoch": 0.4181849176415436,
+      "grad_norm": 0.8218587636947632,
+      "learning_rate": 0.0008645050013555743,
+      "loss": 4.7793,
+      "step": 78500
+    },
+    {
+      "epoch": 0.4208485158430821,
+      "grad_norm": 0.8102436065673828,
+      "learning_rate": 0.0008636131761818467,
+      "loss": 4.7775,
+      "step": 79000
+    },
+    {
+      "epoch": 0.4235121140446206,
+      "grad_norm": 0.7857397198677063,
+      "learning_rate": 0.0008627213510081193,
+      "loss": 4.7742,
+      "step": 79500
+    },
+    {
+      "epoch": 0.4261757122461591,
+      "grad_norm": 0.8044630885124207,
+      "learning_rate": 0.0008618313094847391,
+      "loss": 4.7851,
+      "step": 80000
+    },
+    {
+      "epoch": 0.4288393104476976,
+      "grad_norm": 0.7105129957199097,
+      "learning_rate": 0.0008609394843110116,
+      "loss": 4.7796,
+      "step": 80500
+    },
+    {
+      "epoch": 0.43150290864923607,
+      "grad_norm": 0.7851101160049438,
+      "learning_rate": 0.000860047659137284,
+      "loss": 4.7825,
+      "step": 81000
+    },
+    {
+      "epoch": 0.43416650685077457,
+      "grad_norm": 0.7503988742828369,
+      "learning_rate": 0.0008591558339635564,
+      "loss": 4.7825,
+      "step": 81500
+    },
+    {
+      "epoch": 0.43683010505231307,
+      "grad_norm": 0.7521843314170837,
+      "learning_rate": 0.0008582657924401764,
+      "loss": 4.7818,
+      "step": 82000
+    },
+    {
+      "epoch": 0.43949370325385156,
+      "grad_norm": 0.8569875955581665,
+      "learning_rate": 0.0008573739672664489,
+      "loss": 4.7768,
+      "step": 82500
+    },
+    {
+      "epoch": 0.44215730145539006,
+      "grad_norm": 0.7394946813583374,
+      "learning_rate": 0.0008564821420927214,
+      "loss": 4.7793,
+      "step": 83000
+    },
+    {
+      "epoch": 0.44482089965692856,
+      "grad_norm": 0.8162407279014587,
+      "learning_rate": 0.0008555903169189938,
+      "loss": 4.7695,
+      "step": 83500
+    },
+    {
+      "epoch": 0.44748449785846706,
+      "grad_norm": 0.829507052898407,
+      "learning_rate": 0.0008547002753956137,
+      "loss": 4.7726,
+      "step": 84000
+    },
+    {
+      "epoch": 0.45014809606000555,
+      "grad_norm": 0.8483043313026428,
+      "learning_rate": 0.0008538084502218861,
+      "loss": 4.7801,
+      "step": 84500
+    },
+    {
+      "epoch": 0.45281169426154405,
+      "grad_norm": 0.8971179127693176,
+      "learning_rate": 0.0008529166250481586,
+      "loss": 4.7693,
+      "step": 85000
+    },
+    {
+      "epoch": 0.45547529246308255,
+      "grad_norm": 0.8432018160820007,
+      "learning_rate": 0.0008520247998744311,
+      "loss": 4.7769,
+      "step": 85500
+    },
+    {
+      "epoch": 0.45813889066462105,
+      "grad_norm": 0.8253493905067444,
+      "learning_rate": 0.0008511347583510509,
+      "loss": 4.7711,
+      "step": 86000
+    },
+    {
+      "epoch": 0.46080248886615954,
+      "grad_norm": 0.7823784351348877,
+      "learning_rate": 0.0008502429331773234,
+      "loss": 4.7755,
+      "step": 86500
+    },
+    {
+      "epoch": 0.463466087067698,
+      "grad_norm": 0.9113832712173462,
+      "learning_rate": 0.0008493511080035958,
+      "loss": 4.7733,
+      "step": 87000
+    },
+    {
+      "epoch": 0.4661296852692365,
+      "grad_norm": 0.7511106729507446,
+      "learning_rate": 0.0008484592828298683,
+      "loss": 4.7628,
+      "step": 87500
+    },
+    {
+      "epoch": 0.468793283470775,
+      "grad_norm": 0.821972668170929,
+      "learning_rate": 0.0008475674576561408,
+      "loss": 4.7639,
+      "step": 88000
+    },
+    {
+      "epoch": 0.4714568816723135,
+      "grad_norm": 0.8578181862831116,
+      "learning_rate": 0.0008466774161327607,
+      "loss": 4.7632,
+      "step": 88500
+    },
+    {
+      "epoch": 0.474120479873852,
+      "grad_norm": 0.7680496573448181,
+      "learning_rate": 0.0008457855909590332,
+      "loss": 4.7715,
+      "step": 89000
+    },
+    {
+      "epoch": 0.47678407807539047,
+      "grad_norm": 0.7780221104621887,
+      "learning_rate": 0.0008448937657853056,
+      "loss": 4.7644,
+      "step": 89500
+    },
+    {
+      "epoch": 0.47944767627692897,
+      "grad_norm": 0.7615424394607544,
+      "learning_rate": 0.0008440019406115781,
+      "loss": 4.7647,
+      "step": 90000
+    },
+    {
+      "epoch": 0.48211127447846747,
+      "grad_norm": 0.8719656467437744,
+      "learning_rate": 0.000843111899088198,
+      "loss": 4.7745,
+      "step": 90500
+    },
+    {
+      "epoch": 0.48477487268000596,
+      "grad_norm": 0.7793582677841187,
+      "learning_rate": 0.0008422200739144704,
+      "loss": 4.7668,
+      "step": 91000
+    },
+    {
+      "epoch": 0.48743847088154446,
+      "grad_norm": 0.7653023600578308,
+      "learning_rate": 0.0008413282487407429,
+      "loss": 4.7646,
+      "step": 91500
+    },
+    {
+      "epoch": 0.49010206908308296,
+      "grad_norm": 0.8937133550643921,
+      "learning_rate": 0.0008404364235670153,
+      "loss": 4.7646,
+      "step": 92000
+    },
+    {
+      "epoch": 0.49276566728462146,
+      "grad_norm": 0.8305505514144897,
+      "learning_rate": 0.0008395463820436352,
+      "loss": 4.7647,
+      "step": 92500
+    },
+    {
+      "epoch": 0.49542926548615995,
+      "grad_norm": 0.8143522143363953,
+      "learning_rate": 0.0008386545568699077,
+      "loss": 4.7679,
+      "step": 93000
+    },
+    {
+      "epoch": 0.49809286368769845,
+      "grad_norm": 0.7998281121253967,
+      "learning_rate": 0.0008377627316961801,
+      "loss": 4.7655,
+      "step": 93500
+    },
+    {
+      "epoch": 0.500756461889237,
+      "grad_norm": 0.823118269443512,
+      "learning_rate": 0.0008368709065224526,
+      "loss": 4.7635,
+      "step": 94000
+    },
+    {
+      "epoch": 0.5034200600907754,
+      "grad_norm": 0.8964449167251587,
+      "learning_rate": 0.0008359808649990725,
+      "loss": 4.7575,
+      "step": 94500
+    },
+    {
+      "epoch": 0.5060836582923139,
+      "grad_norm": 0.7982577681541443,
+      "learning_rate": 0.000835089039825345,
+      "loss": 4.7669,
+      "step": 95000
+    },
+    {
+      "epoch": 0.5087472564938524,
+      "grad_norm": 0.8269961476325989,
+      "learning_rate": 0.0008341972146516175,
+      "loss": 4.7613,
+      "step": 95500
+    },
+    {
+      "epoch": 0.5114108546953909,
+      "grad_norm": 0.7937721610069275,
+      "learning_rate": 0.0008333053894778899,
+      "loss": 4.7591,
+      "step": 96000
+    },
+    {
+      "epoch": 0.5140744528969294,
+      "grad_norm": 0.867740273475647,
+      "learning_rate": 0.0008324153479545098,
+      "loss": 4.7596,
+      "step": 96500
+    },
+    {
+      "epoch": 0.5167380510984679,
+      "grad_norm": 0.8314835429191589,
+      "learning_rate": 0.0008315235227807822,
+      "loss": 4.7633,
+      "step": 97000
+    },
+    {
+      "epoch": 0.5194016493000064,
+      "grad_norm": 0.8014164566993713,
+      "learning_rate": 0.0008306316976070547,
+      "loss": 4.762,
+      "step": 97500
+    },
+    {
+      "epoch": 0.5220652475015449,
+      "grad_norm": 0.7812915444374084,
+      "learning_rate": 0.0008297398724333272,
+      "loss": 4.7634,
+      "step": 98000
+    },
+    {
+      "epoch": 0.5247288457030834,
+      "grad_norm": 0.7968524098396301,
+      "learning_rate": 0.0008288498309099471,
+      "loss": 4.7625,
+      "step": 98500
+    },
+    {
+      "epoch": 0.5273924439046219,
+      "grad_norm": 0.821968674659729,
+      "learning_rate": 0.0008279580057362195,
+      "loss": 4.752,
+      "step": 99000
+    },
+    {
+      "epoch": 0.5300560421061604,
+      "grad_norm": 0.7830886244773865,
+      "learning_rate": 0.0008270661805624919,
+      "loss": 4.7603,
+      "step": 99500
+    },
+    {
+      "epoch": 0.5327196403076989,
+      "grad_norm": 0.7848255634307861,
+      "learning_rate": 0.0008261743553887644,
+      "loss": 4.7571,
+      "step": 100000
+    },
+    {
+      "epoch": 0.5353832385092374,
+      "grad_norm": 0.7928926944732666,
+      "learning_rate": 0.0008252843138653843,
+      "loss": 4.7529,
+      "step": 100500
+    },
+    {
+      "epoch": 0.5380468367107758,
+      "grad_norm": 0.8001675605773926,
+      "learning_rate": 0.0008243924886916569,
+      "loss": 4.7616,
+      "step": 101000
+    },
+    {
+      "epoch": 0.5407104349123143,
+      "grad_norm": 0.8647136688232422,
+      "learning_rate": 0.0008235006635179293,
+      "loss": 4.7452,
+      "step": 101500
+    },
+    {
+      "epoch": 0.5433740331138528,
+      "grad_norm": 0.8823105692863464,
+      "learning_rate": 0.0008226088383442017,
+      "loss": 4.7549,
+      "step": 102000
+    },
+    {
+      "epoch": 0.5460376313153913,
+      "grad_norm": 0.8441142439842224,
+      "learning_rate": 0.0008217187968208216,
+      "loss": 4.7544,
+      "step": 102500
+    },
+    {
+      "epoch": 0.5487012295169298,
+      "grad_norm": 0.8819558620452881,
+      "learning_rate": 0.000820826971647094,
+      "loss": 4.7491,
+      "step": 103000
+    },
+    {
+      "epoch": 0.5513648277184683,
+      "grad_norm": 0.7855533361434937,
+      "learning_rate": 0.0008199351464733666,
+      "loss": 4.7591,
+      "step": 103500
+    },
+    {
+      "epoch": 0.5540284259200068,
+      "grad_norm": 0.8068869709968567,
+      "learning_rate": 0.000819043321299639,
+      "loss": 4.7534,
+      "step": 104000
+    },
+    {
+      "epoch": 0.5566920241215453,
+      "grad_norm": 0.8351749181747437,
+      "learning_rate": 0.0008181532797762589,
+      "loss": 4.7481,
+      "step": 104500
+    },
+    {
+      "epoch": 0.5593556223230838,
+      "grad_norm": 0.8479593992233276,
+      "learning_rate": 0.0008172614546025314,
+      "loss": 4.7575,
+      "step": 105000
+    },
+    {
+      "epoch": 0.5620192205246223,
+      "grad_norm": 0.8183143138885498,
+      "learning_rate": 0.0008163696294288038,
+      "loss": 4.7565,
+      "step": 105500
+    },
+    {
+      "epoch": 0.5646828187261608,
+      "grad_norm": 0.8138937950134277,
+      "learning_rate": 0.0008154778042550764,
+      "loss": 4.7482,
+      "step": 106000
+    },
+    {
+      "epoch": 0.5673464169276993,
+      "grad_norm": 0.8708425164222717,
+      "learning_rate": 0.0008145877627316962,
+      "loss": 4.7516,
+      "step": 106500
+    },
+    {
+      "epoch": 0.5700100151292378,
+      "grad_norm": 0.8439280986785889,
+      "learning_rate": 0.0008136959375579687,
+      "loss": 4.7523,
+      "step": 107000
+    },
+    {
+      "epoch": 0.5726736133307763,
+      "grad_norm": 0.8017052412033081,
+      "learning_rate": 0.0008128041123842411,
+      "loss": 4.745,
+      "step": 107500
+    },
+    {
+      "epoch": 0.5753372115323148,
+      "grad_norm": 0.846176266670227,
+      "learning_rate": 0.0008119122872105135,
+      "loss": 4.7539,
+      "step": 108000
+    },
+    {
+      "epoch": 0.5780008097338533,
+      "grad_norm": 0.8138134479522705,
+      "learning_rate": 0.0008110222456871334,
+      "loss": 4.7468,
+      "step": 108500
+    },
+    {
+      "epoch": 0.5806644079353918,
+      "grad_norm": 0.7649713754653931,
+      "learning_rate": 0.0008101304205134059,
+      "loss": 4.7467,
+      "step": 109000
+    },
+    {
+      "epoch": 0.5833280061369303,
+      "grad_norm": 0.8558058142662048,
+      "learning_rate": 0.0008092385953396784,
+      "loss": 4.7408,
+      "step": 109500
+    },
+    {
+      "epoch": 0.5859916043384688,
+      "grad_norm": 0.8179123401641846,
+      "learning_rate": 0.0008083467701659508,
+      "loss": 4.7526,
+      "step": 110000
+    },
+    {
+      "epoch": 0.5886552025400072,
+      "grad_norm": 0.8050591349601746,
+      "learning_rate": 0.0008074567286425708,
+      "loss": 4.7423,
+      "step": 110500
+    },
+    {
+      "epoch": 0.5913188007415457,
+      "grad_norm": 0.7940638661384583,
+      "learning_rate": 0.0008065649034688432,
+      "loss": 4.7467,
+      "step": 111000
+    },
+    {
+      "epoch": 0.5939823989430842,
+      "grad_norm": 0.7882602214813232,
+      "learning_rate": 0.0008056730782951157,
+      "loss": 4.7475,
+      "step": 111500
+    },
+    {
+      "epoch": 0.5966459971446227,
+      "grad_norm": 0.8111135959625244,
+      "learning_rate": 0.0008047812531213882,
+      "loss": 4.749,
+      "step": 112000
+    },
+    {
+      "epoch": 0.5993095953461612,
+      "grad_norm": 0.792116641998291,
+      "learning_rate": 0.000803891211598008,
+      "loss": 4.7507,
+      "step": 112500
+    },
+    {
+      "epoch": 0.6019731935476997,
+      "grad_norm": 0.8503523468971252,
+      "learning_rate": 0.0008029993864242805,
+      "loss": 4.7389,
+      "step": 113000
+    },
+    {
+      "epoch": 0.6046367917492382,
+      "grad_norm": 0.8201097846031189,
+      "learning_rate": 0.0008021075612505529,
+      "loss": 4.752,
+      "step": 113500
+    },
+    {
+      "epoch": 0.6073003899507767,
+      "grad_norm": 0.8428370952606201,
+      "learning_rate": 0.0008012157360768254,
+      "loss": 4.7435,
+      "step": 114000
+    },
+    {
+      "epoch": 0.6099639881523152,
+      "grad_norm": 0.7499297261238098,
+      "learning_rate": 0.0008003239109030979,
+      "loss": 4.7416,
+      "step": 114500
+    },
+    {
+      "epoch": 0.6126275863538537,
+      "grad_norm": 0.8179661631584167,
+      "learning_rate": 0.0007994338693797177,
+      "loss": 4.744,
+      "step": 115000
+    },
+    {
+      "epoch": 0.6152911845553922,
+      "grad_norm": 0.8121057152748108,
+      "learning_rate": 0.0007985420442059902,
+      "loss": 4.7434,
+      "step": 115500
+    },
+    {
+      "epoch": 0.6179547827569307,
+      "grad_norm": 0.7849302887916565,
+      "learning_rate": 0.0007976502190322626,
+      "loss": 4.7396,
+      "step": 116000
+    },
+    {
+      "epoch": 0.6206183809584692,
+      "grad_norm": 0.7559896111488342,
+      "learning_rate": 0.0007967583938585351,
+      "loss": 4.7444,
+      "step": 116500
+    },
+    {
+      "epoch": 0.6232819791600077,
+      "grad_norm": 0.801511824131012,
+      "learning_rate": 0.0007958683523351551,
+      "loss": 4.7391,
+      "step": 117000
+    },
+    {
+      "epoch": 0.6259455773615462,
+      "grad_norm": 0.752527117729187,
+      "learning_rate": 0.0007949765271614275,
+      "loss": 4.7398,
+      "step": 117500
+    },
+    {
+      "epoch": 0.6286091755630847,
+      "grad_norm": 0.8372392654418945,
+      "learning_rate": 0.0007940847019877,
+      "loss": 4.7401,
+      "step": 118000
+    },
+    {
+      "epoch": 0.6312727737646232,
+      "grad_norm": 0.842634379863739,
+      "learning_rate": 0.0007931928768139724,
+      "loss": 4.7437,
+      "step": 118500
+    },
+    {
+      "epoch": 0.6339363719661616,
+      "grad_norm": 0.7479714751243591,
+      "learning_rate": 0.0007923028352905924,
+      "loss": 4.7428,
+      "step": 119000
+    },
+    {
+      "epoch": 0.6365999701677001,
+      "grad_norm": 0.800308108329773,
+      "learning_rate": 0.0007914110101168648,
+      "loss": 4.7378,
+      "step": 119500
+    },
+    {
+      "epoch": 0.6392635683692386,
+      "grad_norm": 0.8084207773208618,
+      "learning_rate": 0.0007905191849431372,
+      "loss": 4.7334,
+      "step": 120000
+    },
+    {
+      "epoch": 0.6419271665707771,
+      "grad_norm": 0.8754898309707642,
+      "learning_rate": 0.0007896273597694097,
+      "loss": 4.7391,
+      "step": 120500
+    },
+    {
+      "epoch": 0.6445907647723156,
+      "grad_norm": 0.8357532620429993,
+      "learning_rate": 0.0007887373182460295,
+      "loss": 4.7394,
+      "step": 121000
+    },
+    {
+      "epoch": 0.6472543629738541,
+      "grad_norm": 0.7808672189712524,
+      "learning_rate": 0.000787845493072302,
+      "loss": 4.7422,
+      "step": 121500
+    },
+    {
+      "epoch": 0.6499179611753926,
+      "grad_norm": 0.8768132328987122,
+      "learning_rate": 0.0007869536678985745,
+      "loss": 4.7412,
+      "step": 122000
+    },
+    {
+      "epoch": 0.6525815593769311,
+      "grad_norm": 0.795536994934082,
+      "learning_rate": 0.0007860618427248469,
+      "loss": 4.7424,
+      "step": 122500
+    },
+    {
+      "epoch": 0.6552451575784696,
+      "grad_norm": 0.8333203792572021,
+      "learning_rate": 0.0007851718012014669,
+      "loss": 4.7363,
+      "step": 123000
+    },
+    {
+      "epoch": 0.6579087557800081,
+      "grad_norm": 0.8043723106384277,
+      "learning_rate": 0.0007842799760277393,
+      "loss": 4.7395,
+      "step": 123500
+    },
+    {
+      "epoch": 0.6605723539815466,
+      "grad_norm": 0.7881098985671997,
+      "learning_rate": 0.0007833881508540118,
+      "loss": 4.7373,
+      "step": 124000
+    },
+    {
+      "epoch": 0.6632359521830851,
+      "grad_norm": 0.8250852823257446,
+      "learning_rate": 0.0007824981093306317,
+      "loss": 4.7361,
+      "step": 124500
+    },
+    {
+      "epoch": 0.6658995503846236,
+      "grad_norm": 0.791354775428772,
+      "learning_rate": 0.0007816062841569042,
+      "loss": 4.7401,
+      "step": 125000
+    },
+    {
+      "epoch": 0.6685631485861621,
+      "grad_norm": 0.833494246006012,
+      "learning_rate": 0.0007807144589831766,
+      "loss": 4.7368,
+      "step": 125500
+    },
+    {
+      "epoch": 0.6712267467877006,
+      "grad_norm": 0.8371044993400574,
+      "learning_rate": 0.000779822633809449,
+      "loss": 4.7363,
+      "step": 126000
+    },
+    {
+      "epoch": 0.673890344989239,
+      "grad_norm": 0.9218412041664124,
+      "learning_rate": 0.0007789308086357215,
+      "loss": 4.7373,
+      "step": 126500
+    },
+    {
+      "epoch": 0.6765539431907776,
+      "grad_norm": 0.8172479867935181,
+      "learning_rate": 0.000778038983461994,
+      "loss": 4.7347,
+      "step": 127000
+    },
+    {
+      "epoch": 0.679217541392316,
+      "grad_norm": 0.8264776468276978,
+      "learning_rate": 0.0007771471582882665,
+      "loss": 4.7332,
+      "step": 127500
+    },
+    {
+      "epoch": 0.6818811395938545,
+      "grad_norm": 0.780692458152771,
+      "learning_rate": 0.0007762553331145389,
+      "loss": 4.7348,
+      "step": 128000
+    },
+    {
+      "epoch": 0.684544737795393,
+      "grad_norm": 0.794199526309967,
+      "learning_rate": 0.0007753652915911589,
+      "loss": 4.737,
+      "step": 128500
+    },
+    {
+      "epoch": 0.6872083359969315,
+      "grad_norm": 0.8050469756126404,
+      "learning_rate": 0.0007744734664174313,
+      "loss": 4.7261,
+      "step": 129000
+    },
+    {
+      "epoch": 0.68987193419847,
+      "grad_norm": 0.8591654300689697,
+      "learning_rate": 0.0007735816412437038,
+      "loss": 4.7311,
+      "step": 129500
+    },
+    {
+      "epoch": 0.6925355324000085,
+      "grad_norm": 0.8073732256889343,
+      "learning_rate": 0.0007726898160699763,
+      "loss": 4.7328,
+      "step": 130000
+    },
+    {
+      "epoch": 0.695199130601547,
+      "grad_norm": 0.8377549052238464,
+      "learning_rate": 0.0007717997745465961,
+      "loss": 4.7324,
+      "step": 130500
+    },
+    {
+      "epoch": 0.6978627288030855,
+      "grad_norm": 0.7627879977226257,
+      "learning_rate": 0.0007709079493728686,
+      "loss": 4.7304,
+      "step": 131000
+    },
+    {
+      "epoch": 0.700526327004624,
+      "grad_norm": 0.7747420072555542,
+      "learning_rate": 0.000770016124199141,
+      "loss": 4.7337,
+      "step": 131500
+    },
+    {
+      "epoch": 0.7031899252061625,
+      "grad_norm": 0.833777129650116,
+      "learning_rate": 0.0007691242990254135,
+      "loss": 4.7362,
+      "step": 132000
+    },
+    {
+      "epoch": 0.705853523407701,
+      "grad_norm": 0.8297452330589294,
+      "learning_rate": 0.0007682342575020334,
+      "loss": 4.7331,
+      "step": 132500
+    },
+    {
+      "epoch": 0.7085171216092395,
+      "grad_norm": 0.8135260343551636,
+      "learning_rate": 0.0007673424323283058,
+      "loss": 4.7317,
+      "step": 133000
+    },
+    {
+      "epoch": 0.711180719810778,
+      "grad_norm": 0.82469242811203,
+      "learning_rate": 0.0007664506071545783,
+      "loss": 4.7289,
+      "step": 133500
+    },
+    {
+      "epoch": 0.7138443180123165,
+      "grad_norm": 0.7857999801635742,
+      "learning_rate": 0.0007655587819808507,
+      "loss": 4.7316,
+      "step": 134000
+    },
+    {
+      "epoch": 0.716507916213855,
+      "grad_norm": 0.8272935748100281,
+      "learning_rate": 0.0007646687404574707,
+      "loss": 4.7298,
+      "step": 134500
+    },
+    {
+      "epoch": 0.7191715144153935,
+      "grad_norm": 0.8309085965156555,
+      "learning_rate": 0.0007637769152837432,
+      "loss": 4.7333,
+      "step": 135000
+    },
+    {
+      "epoch": 0.721835112616932,
+      "grad_norm": 0.9184426665306091,
+      "learning_rate": 0.0007628850901100156,
+      "loss": 4.7223,
+      "step": 135500
+    },
+    {
+      "epoch": 0.7244987108184705,
+      "grad_norm": 0.8144403100013733,
+      "learning_rate": 0.0007619932649362881,
+      "loss": 4.725,
+      "step": 136000
+    },
+    {
+      "epoch": 0.727162309020009,
+      "grad_norm": 0.8021435737609863,
+      "learning_rate": 0.0007611032234129079,
+      "loss": 4.7328,
+      "step": 136500
+    },
+    {
+      "epoch": 0.7298259072215475,
+      "grad_norm": 0.8207322955131531,
+      "learning_rate": 0.0007602113982391804,
+      "loss": 4.73,
+      "step": 137000
+    },
+    {
+      "epoch": 0.7324895054230859,
+      "grad_norm": 0.904644787311554,
+      "learning_rate": 0.0007593195730654529,
+      "loss": 4.7276,
+      "step": 137500
+    },
+    {
+      "epoch": 0.7351531036246244,
+      "grad_norm": 0.7794029712677002,
+      "learning_rate": 0.0007584277478917253,
+      "loss": 4.7274,
+      "step": 138000
+    },
+    {
+      "epoch": 0.7378167018261629,
+      "grad_norm": 0.7878913879394531,
+      "learning_rate": 0.0007575377063683452,
+      "loss": 4.7238,
+      "step": 138500
+    },
+    {
+      "epoch": 0.7404803000277014,
+      "grad_norm": 0.8159613013267517,
+      "learning_rate": 0.0007566458811946176,
+      "loss": 4.7259,
+      "step": 139000
+    },
+    {
+      "epoch": 0.7431438982292399,
+      "grad_norm": 0.7896559834480286,
+      "learning_rate": 0.00075575405602089,
+      "loss": 4.7223,
+      "step": 139500
+    },
+    {
+      "epoch": 0.7458074964307784,
+      "grad_norm": 0.8425673246383667,
+      "learning_rate": 0.0007548622308471626,
+      "loss": 4.7229,
+      "step": 140000
+    },
+    {
+      "epoch": 0.7484710946323169,
+      "grad_norm": 0.8656537532806396,
+      "learning_rate": 0.0007539721893237826,
+      "loss": 4.7319,
+      "step": 140500
+    },
+    {
+      "epoch": 0.7511346928338554,
+      "grad_norm": 0.792007327079773,
+      "learning_rate": 0.000753080364150055,
+      "loss": 4.7241,
+      "step": 141000
+    },
+    {
+      "epoch": 0.7537982910353939,
+      "grad_norm": 0.8079518675804138,
+      "learning_rate": 0.0007521885389763274,
+      "loss": 4.726,
+      "step": 141500
+    },
+    {
+      "epoch": 0.7564618892369324,
+      "grad_norm": 0.8287070393562317,
+      "learning_rate": 0.0007512967138025999,
+      "loss": 4.7248,
+      "step": 142000
+    },
+    {
+      "epoch": 0.7591254874384709,
+      "grad_norm": Infinity,
+      "learning_rate": 0.0007504048886288724,
+      "loss": 4.7208,
+      "step": 142500
+    },
+    {
+      "epoch": 0.7617890856400094,
+      "grad_norm": 0.8842335343360901,
+      "learning_rate": 0.0007495148471054923,
+      "loss": 4.7186,
+      "step": 143000
+    },
+    {
+      "epoch": 0.7644526838415479,
+      "grad_norm": 0.8471961617469788,
+      "learning_rate": 0.0007486230219317647,
+      "loss": 4.7238,
+      "step": 143500
+    },
+    {
+      "epoch": 0.7671162820430864,
+      "grad_norm": 0.8359102010726929,
+      "learning_rate": 0.0007477311967580371,
+      "loss": 4.7236,
+      "step": 144000
+    },
+    {
+      "epoch": 0.7697798802446248,
+      "grad_norm": 0.8359571099281311,
+      "learning_rate": 0.0007468393715843096,
+      "loss": 4.7208,
+      "step": 144500
+    },
+    {
+      "epoch": 0.7724434784461633,
+      "grad_norm": 0.9100736379623413,
+      "learning_rate": 0.0007459493300609294,
+      "loss": 4.7247,
+      "step": 145000
+    },
+    {
+      "epoch": 0.7751070766477018,
+      "grad_norm": 0.8252699375152588,
+      "learning_rate": 0.000745057504887202,
+      "loss": 4.7192,
+      "step": 145500
+    },
+    {
+      "epoch": 0.7777706748492403,
+      "grad_norm": 0.7999640107154846,
+      "learning_rate": 0.0007441656797134744,
+      "loss": 4.7266,
+      "step": 146000
+    },
+    {
+      "epoch": 0.7804342730507788,
+      "grad_norm": 0.7765536308288574,
+      "learning_rate": 0.0007432738545397468,
+      "loss": 4.7193,
+      "step": 146500
+    },
+    {
+      "epoch": 0.7830978712523173,
+      "grad_norm": 0.8114664554595947,
+      "learning_rate": 0.0007423838130163668,
+      "loss": 4.7184,
+      "step": 147000
+    },
+    {
+      "epoch": 0.7857614694538558,
+      "grad_norm": 0.8485323786735535,
+      "learning_rate": 0.0007414919878426392,
+      "loss": 4.7246,
+      "step": 147500
+    },
+    {
+      "epoch": 0.7884250676553943,
+      "grad_norm": 0.828681468963623,
+      "learning_rate": 0.0007406001626689118,
+      "loss": 4.7197,
+      "step": 148000
+    },
+    {
+      "epoch": 0.7910886658569328,
+      "grad_norm": 0.8879855871200562,
+      "learning_rate": 0.0007397083374951842,
+      "loss": 4.7242,
+      "step": 148500
+    },
+    {
+      "epoch": 0.7937522640584713,
+      "grad_norm": 1.0155161619186401,
+      "learning_rate": 0.0007388182959718041,
+      "loss": 4.7196,
+      "step": 149000
+    },
+    {
+      "epoch": 0.7964158622600098,
+      "grad_norm": 0.7817535996437073,
+      "learning_rate": 0.0007379264707980765,
+      "loss": 4.7276,
+      "step": 149500
+    },
+    {
+      "epoch": 0.7990794604615483,
+      "grad_norm": 0.8676290512084961,
+      "learning_rate": 0.0007370346456243489,
+      "loss": 4.7235,
+      "step": 150000
+    },
+    {
+      "epoch": 0.8017430586630868,
+      "grad_norm": 0.8710722923278809,
+      "learning_rate": 0.0007361428204506215,
+      "loss": 4.7238,
+      "step": 150500
+    },
+    {
+      "epoch": 0.8044066568646253,
+      "grad_norm": 0.9807900786399841,
+      "learning_rate": 0.0007352527789272414,
+      "loss": 4.7265,
+      "step": 151000
+    },
+    {
+      "epoch": 0.8070702550661638,
+      "grad_norm": 0.8639500737190247,
+      "learning_rate": 0.0007343609537535139,
+      "loss": 4.7256,
+      "step": 151500
+    },
+    {
+      "epoch": 0.8097338532677023,
+      "grad_norm": 0.8448176383972168,
+      "learning_rate": 0.0007334691285797863,
+      "loss": 4.719,
+      "step": 152000
+    },
+    {
+      "epoch": 0.8123974514692408,
+      "grad_norm": 0.8320333361625671,
+      "learning_rate": 0.0007325773034060587,
+      "loss": 4.7209,
+      "step": 152500
+    },
+    {
+      "epoch": 0.8150610496707793,
+      "grad_norm": 0.7863089442253113,
+      "learning_rate": 0.0007316872618826787,
+      "loss": 4.7198,
+      "step": 153000
+    },
+    {
+      "epoch": 0.8177246478723178,
+      "grad_norm": 0.9616714715957642,
+      "learning_rate": 0.0007307954367089511,
+      "loss": 4.7184,
+      "step": 153500
+    },
+    {
+      "epoch": 0.8203882460738563,
+      "grad_norm": 0.8382904529571533,
+      "learning_rate": 0.0007299036115352236,
+      "loss": 4.7171,
+      "step": 154000
+    },
+    {
+      "epoch": 0.8230518442753948,
+      "grad_norm": 0.8196877837181091,
+      "learning_rate": 0.000729011786361496,
+      "loss": 4.7157,
+      "step": 154500
+    },
+    {
+      "epoch": 0.8257154424769333,
+      "grad_norm": 0.8712915182113647,
+      "learning_rate": 0.0007281217448381159,
+      "loss": 4.7224,
+      "step": 155000
+    },
+    {
+      "epoch": 0.8283790406784717,
+      "grad_norm": 0.776938259601593,
+      "learning_rate": 0.0007272299196643883,
+      "loss": 4.7196,
+      "step": 155500
+    },
+    {
+      "epoch": 0.8310426388800102,
+      "grad_norm": 0.8299930095672607,
+      "learning_rate": 0.0007263380944906608,
+      "loss": 4.7199,
+      "step": 156000
+    },
+    {
+      "epoch": 0.8337062370815487,
+      "grad_norm": 0.8253493905067444,
+      "learning_rate": 0.0007254462693169333,
+      "loss": 4.7143,
+      "step": 156500
+    },
+    {
+      "epoch": 0.8363698352830872,
+      "grad_norm": 0.8310771584510803,
+      "learning_rate": 0.0007245562277935532,
+      "loss": 4.7205,
+      "step": 157000
+    },
+    {
+      "epoch": 0.8390334334846257,
+      "grad_norm": 0.7761854529380798,
+      "learning_rate": 0.0007236644026198257,
+      "loss": 4.7225,
+      "step": 157500
+    },
+    {
+      "epoch": 0.8416970316861642,
+      "grad_norm": 0.8773240447044373,
+      "learning_rate": 0.0007227725774460981,
+      "loss": 4.7101,
+      "step": 158000
+    },
+    {
+      "epoch": 0.8443606298877027,
+      "grad_norm": 0.8560092449188232,
+      "learning_rate": 0.0007218807522723706,
+      "loss": 4.7119,
+      "step": 158500
+    },
+    {
+      "epoch": 0.8470242280892412,
+      "grad_norm": 0.8768864870071411,
+      "learning_rate": 0.0007209907107489905,
+      "loss": 4.7211,
+      "step": 159000
+    },
+    {
+      "epoch": 0.8496878262907797,
+      "grad_norm": 0.7614040970802307,
+      "learning_rate": 0.0007200988855752629,
+      "loss": 4.716,
+      "step": 159500
+    },
+    {
+      "epoch": 0.8523514244923182,
+      "grad_norm": 0.8368701934814453,
+      "learning_rate": 0.0007192070604015354,
+      "loss": 4.719,
+      "step": 160000
+    },
+    {
+      "epoch": 0.8550150226938567,
+      "grad_norm": 0.8005092144012451,
+      "learning_rate": 0.0007183152352278078,
+      "loss": 4.7179,
+      "step": 160500
+    },
+    {
+      "epoch": 0.8576786208953951,
+      "grad_norm": 0.7758657932281494,
+      "learning_rate": 0.0007174251937044278,
+      "loss": 4.7194,
+      "step": 161000
+    },
+    {
+      "epoch": 0.8603422190969336,
+      "grad_norm": 0.7939693927764893,
+      "learning_rate": 0.0007165333685307002,
+      "loss": 4.7197,
+      "step": 161500
+    },
+    {
+      "epoch": 0.8630058172984721,
+      "grad_norm": 0.8776530623435974,
+      "learning_rate": 0.0007156415433569726,
+      "loss": 4.7114,
+      "step": 162000
+    },
+    {
+      "epoch": 0.8656694155000106,
+      "grad_norm": 0.8468111753463745,
+      "learning_rate": 0.0007147497181832451,
+      "loss": 4.719,
+      "step": 162500
+    },
+    {
+      "epoch": 0.8683330137015491,
+      "grad_norm": 0.8999060988426208,
+      "learning_rate": 0.000713859676659865,
+      "loss": 4.7151,
+      "step": 163000
+    },
+    {
+      "epoch": 0.8709966119030876,
+      "grad_norm": 0.8831384181976318,
+      "learning_rate": 0.0007129678514861376,
+      "loss": 4.7127,
+      "step": 163500
+    },
+    {
+      "epoch": 0.8736602101046261,
+      "grad_norm": 0.9266347885131836,
+      "learning_rate": 0.00071207602631241,
+      "loss": 4.7082,
+      "step": 164000
+    },
+    {
+      "epoch": 0.8763238083061646,
+      "grad_norm": 0.8402259945869446,
+      "learning_rate": 0.0007111842011386824,
+      "loss": 4.7121,
+      "step": 164500
+    },
+    {
+      "epoch": 0.8789874065077031,
+      "grad_norm": 0.8024085760116577,
+      "learning_rate": 0.0007102923759649549,
+      "loss": 4.7128,
+      "step": 165000
+    },
+    {
+      "epoch": 0.8816510047092416,
+      "grad_norm": 0.80262690782547,
+      "learning_rate": 0.0007094023344415747,
+      "loss": 4.7165,
+      "step": 165500
+    },
+    {
+      "epoch": 0.8843146029107801,
+      "grad_norm": 0.8166842460632324,
+      "learning_rate": 0.0007085105092678472,
+      "loss": 4.7177,
+      "step": 166000
+    },
+    {
+      "epoch": 0.8869782011123186,
+      "grad_norm": 0.8241666555404663,
+      "learning_rate": 0.0007076186840941197,
+      "loss": 4.7107,
+      "step": 166500
+    },
+    {
+      "epoch": 0.8896417993138571,
+      "grad_norm": 0.792934775352478,
+      "learning_rate": 0.0007067268589203921,
+      "loss": 4.7065,
+      "step": 167000
+    },
+    {
+      "epoch": 0.8923053975153956,
+      "grad_norm": 0.8425348401069641,
+      "learning_rate": 0.000705836817397012,
+      "loss": 4.7119,
+      "step": 167500
+    },
+    {
+      "epoch": 0.8949689957169341,
+      "grad_norm": 0.8911672830581665,
+      "learning_rate": 0.0007049449922232844,
+      "loss": 4.7044,
+      "step": 168000
+    },
+    {
+      "epoch": 0.8976325939184726,
+      "grad_norm": 1.0209442377090454,
+      "learning_rate": 0.0007040531670495569,
+      "loss": 4.7031,
+      "step": 168500
+    },
+    {
+      "epoch": 0.9002961921200111,
+      "grad_norm": 0.8289418816566467,
+      "learning_rate": 0.0007031613418758294,
+      "loss": 4.7156,
+      "step": 169000
+    },
+    {
+      "epoch": 0.9029597903215496,
+      "grad_norm": 0.8719263672828674,
+      "learning_rate": 0.0007022713003524494,
+      "loss": 4.7063,
+      "step": 169500
+    },
+    {
+      "epoch": 0.9056233885230881,
+      "grad_norm": 0.797635555267334,
+      "learning_rate": 0.0007013794751787218,
+      "loss": 4.7081,
+      "step": 170000
+    },
+    {
+      "epoch": 0.9082869867246266,
+      "grad_norm": 0.7837144732475281,
+      "learning_rate": 0.0007004876500049942,
+      "loss": 4.7079,
+      "step": 170500
+    },
+    {
+      "epoch": 0.9109505849261651,
+      "grad_norm": 0.8080796003341675,
+      "learning_rate": 0.0006995958248312667,
+      "loss": 4.7126,
+      "step": 171000
+    },
+    {
+      "epoch": 0.9136141831277036,
+      "grad_norm": 0.8928093910217285,
+      "learning_rate": 0.0006987057833078866,
+      "loss": 4.7084,
+      "step": 171500
+    },
+    {
+      "epoch": 0.9162777813292421,
+      "grad_norm": 0.8997321724891663,
+      "learning_rate": 0.0006978139581341591,
+      "loss": 4.7081,
+      "step": 172000
+    },
+    {
+      "epoch": 0.9189413795307806,
+      "grad_norm": 0.8603807687759399,
+      "learning_rate": 0.0006969221329604315,
+      "loss": 4.7066,
+      "step": 172500
+    },
+    {
+      "epoch": 0.9216049777323191,
+      "grad_norm": 0.7852337956428528,
+      "learning_rate": 0.0006960303077867039,
+      "loss": 4.7099,
+      "step": 173000
+    },
+    {
+      "epoch": 0.9242685759338576,
+      "grad_norm": 0.8550631999969482,
+      "learning_rate": 0.0006951402662633238,
+      "loss": 4.711,
+      "step": 173500
+    },
+    {
+      "epoch": 0.926932174135396,
+      "grad_norm": 0.809356689453125,
+      "learning_rate": 0.0006942484410895963,
+      "loss": 4.7085,
+      "step": 174000
+    },
+    {
+      "epoch": 0.9295957723369345,
+      "grad_norm": 0.9777870774269104,
+      "learning_rate": 0.0006933566159158689,
+      "loss": 4.7119,
+      "step": 174500
+    },
+    {
+      "epoch": 0.932259370538473,
+      "grad_norm": 0.8507824540138245,
+      "learning_rate": 0.0006924647907421413,
+      "loss": 4.7129,
+      "step": 175000
+    },
+    {
+      "epoch": 0.9349229687400115,
+      "grad_norm": 0.831298291683197,
+      "learning_rate": 0.0006915747492187612,
+      "loss": 4.7159,
+      "step": 175500
+    },
+    {
+      "epoch": 0.93758656694155,
+      "grad_norm": 0.8560031056404114,
+      "learning_rate": 0.0006906829240450336,
+      "loss": 4.706,
+      "step": 176000
+    },
+    {
+      "epoch": 0.9402501651430885,
+      "grad_norm": 0.8773949146270752,
+      "learning_rate": 0.000689791098871306,
+      "loss": 4.698,
+      "step": 176500
+    },
+    {
+      "epoch": 0.942913763344627,
+      "grad_norm": 0.845332145690918,
+      "learning_rate": 0.0006888992736975786,
+      "loss": 4.7075,
+      "step": 177000
+    },
+    {
+      "epoch": 0.9455773615461655,
+      "grad_norm": 0.8635679483413696,
+      "learning_rate": 0.0006880092321741984,
+      "loss": 4.7049,
+      "step": 177500
+    },
+    {
+      "epoch": 0.948240959747704,
+      "grad_norm": 0.8583770990371704,
+      "learning_rate": 0.0006871174070004709,
+      "loss": 4.7019,
+      "step": 178000
+    },
+    {
+      "epoch": 0.9509045579492424,
+      "grad_norm": 0.8701212406158447,
+      "learning_rate": 0.0006862255818267433,
+      "loss": 4.7036,
+      "step": 178500
+    },
+    {
+      "epoch": 0.9535681561507809,
+      "grad_norm": 0.872428297996521,
+      "learning_rate": 0.0006853337566530157,
+      "loss": 4.7085,
+      "step": 179000
+    },
+    {
+      "epoch": 0.9562317543523194,
+      "grad_norm": 0.9032150506973267,
+      "learning_rate": 0.0006844437151296358,
+      "loss": 4.7116,
+      "step": 179500
+    },
+    {
+      "epoch": 0.9588953525538579,
+      "grad_norm": 0.886792004108429,
+      "learning_rate": 0.0006835518899559082,
+      "loss": 4.7074,
+      "step": 180000
+    },
+    {
+      "epoch": 0.9615589507553964,
+      "grad_norm": 0.8857409358024597,
+      "learning_rate": 0.0006826600647821807,
+      "loss": 4.6975,
+      "step": 180500
+    },
+    {
+      "epoch": 0.9642225489569349,
+      "grad_norm": 0.8472080826759338,
+      "learning_rate": 0.0006817682396084531,
+      "loss": 4.7102,
+      "step": 181000
+    },
+    {
+      "epoch": 0.9668861471584734,
+      "grad_norm": 0.8281969428062439,
+      "learning_rate": 0.000680878198085073,
+      "loss": 4.71,
+      "step": 181500
+    },
+    {
+      "epoch": 0.9695497453600119,
+      "grad_norm": 0.9546143412590027,
+      "learning_rate": 0.0006799863729113455,
+      "loss": 4.7032,
+      "step": 182000
+    },
+    {
+      "epoch": 0.9722133435615504,
+      "grad_norm": 0.8378576040267944,
+      "learning_rate": 0.0006790945477376179,
+      "loss": 4.7027,
+      "step": 182500
+    },
+    {
+      "epoch": 0.9748769417630889,
+      "grad_norm": 0.8105673789978027,
+      "learning_rate": 0.0006782027225638904,
+      "loss": 4.7096,
+      "step": 183000
+    },
+    {
+      "epoch": 0.9775405399646274,
+      "grad_norm": 0.8699236512184143,
+      "learning_rate": 0.0006773126810405102,
+      "loss": 4.7058,
+      "step": 183500
+    },
+    {
+      "epoch": 0.9802041381661659,
+      "grad_norm": 0.8328757286071777,
+      "learning_rate": 0.0006764208558667827,
+      "loss": 4.7049,
+      "step": 184000
+    },
+    {
+      "epoch": 0.9828677363677044,
+      "grad_norm": 0.8056396245956421,
+      "learning_rate": 0.0006755290306930552,
+      "loss": 4.7022,
+      "step": 184500
+    },
+    {
+      "epoch": 0.9855313345692429,
+      "grad_norm": 0.7749541997909546,
+      "learning_rate": 0.0006746372055193276,
+      "loss": 4.7069,
+      "step": 185000
+    },
+    {
+      "epoch": 0.9881949327707814,
+      "grad_norm": 0.9022479057312012,
+      "learning_rate": 0.0006737453803456001,
+      "loss": 4.713,
+      "step": 185500
+    },
+    {
+      "epoch": 0.9908585309723199,
+      "grad_norm": 0.8944171667098999,
+      "learning_rate": 0.00067285533882222,
+      "loss": 4.7115,
+      "step": 186000
+    },
+    {
+      "epoch": 0.9935221291738584,
+      "grad_norm": 0.9268381595611572,
+      "learning_rate": 0.0006719635136484925,
+      "loss": 4.7068,
+      "step": 186500
+    },
+    {
+      "epoch": 0.9961857273753969,
+      "grad_norm": 0.8826886415481567,
+      "learning_rate": 0.0006710716884747649,
+      "loss": 4.7071,
+      "step": 187000
+    },
+    {
+      "epoch": 0.9988493255769354,
+      "grad_norm": 0.8730109333992004,
+      "learning_rate": 0.0006701798633010374,
+      "loss": 4.7062,
+      "step": 187500
+    },
+    {
+      "epoch": 1.001512923778474,
+      "grad_norm": 0.8082478046417236,
+      "learning_rate": 0.0006692898217776573,
+      "loss": 4.7079,
+      "step": 188000
+    },
+    {
+      "epoch": 1.0041765219800123,
+      "grad_norm": 0.840177059173584,
+      "learning_rate": 0.0006683979966039297,
+      "loss": 4.7087,
+      "step": 188500
+    },
+    {
+      "epoch": 1.006840120181551,
+      "grad_norm": 0.9156913161277771,
+      "learning_rate": 0.0006675061714302022,
+      "loss": 4.6991,
+      "step": 189000
+    },
+    {
+      "epoch": 1.0095037183830893,
+      "grad_norm": 0.909474790096283,
+      "learning_rate": 0.0006666143462564746,
+      "loss": 4.7024,
+      "step": 189500
+    },
+    {
+      "epoch": 1.0121673165846279,
+      "grad_norm": 0.8102747201919556,
+      "learning_rate": 0.0006657243047330946,
+      "loss": 4.7031,
+      "step": 190000
+    },
+    {
+      "epoch": 1.0148309147861663,
+      "grad_norm": 0.8523674011230469,
+      "learning_rate": 0.000664832479559367,
+      "loss": 4.7095,
+      "step": 190500
+    },
+    {
+      "epoch": 1.0174945129877049,
+      "grad_norm": 0.7822126746177673,
+      "learning_rate": 0.0006639406543856394,
+      "loss": 4.7074,
+      "step": 191000
+    },
+    {
+      "epoch": 1.0201581111892433,
+      "grad_norm": 0.8011199831962585,
+      "learning_rate": 0.0006630488292119119,
+      "loss": 4.706,
+      "step": 191500
+    },
+    {
+      "epoch": 1.0228217093907819,
+      "grad_norm": 1.0168174505233765,
+      "learning_rate": 0.0006621605713388793,
+      "loss": 4.7017,
+      "step": 192000
+    },
+    {
+      "epoch": 1.0254853075923203,
+      "grad_norm": 1.1239140033721924,
+      "learning_rate": 0.0006612687461651517,
+      "loss": 4.6948,
+      "step": 192500
+    },
+    {
+      "epoch": 1.0281489057938589,
+      "grad_norm": 0.8248723745346069,
+      "learning_rate": 0.0006603769209914242,
+      "loss": 4.7031,
+      "step": 193000
+    },
+    {
+      "epoch": 1.0308125039953973,
+      "grad_norm": 0.8193596601486206,
+      "learning_rate": 0.0006594850958176967,
+      "loss": 4.697,
+      "step": 193500
+    },
+    {
+      "epoch": 1.0334761021969359,
+      "grad_norm": 0.8454943895339966,
+      "learning_rate": 0.0006585932706439691,
+      "loss": 4.7019,
+      "step": 194000
+    },
+    {
+      "epoch": 1.0361397003984743,
+      "grad_norm": 0.9663782715797424,
+      "learning_rate": 0.0006577014454702415,
+      "loss": 4.7044,
+      "step": 194500
+    },
+    {
+      "epoch": 1.0388032986000129,
+      "grad_norm": 0.8691510558128357,
+      "learning_rate": 0.0006568096202965141,
+      "loss": 4.6986,
+      "step": 195000
+    },
+    {
+      "epoch": 1.0414668968015512,
+      "grad_norm": 0.9316896200180054,
+      "learning_rate": 0.0006559177951227865,
+      "loss": 4.6983,
+      "step": 195500
+    },
+    {
+      "epoch": 1.0441304950030899,
+      "grad_norm": 0.8153261542320251,
+      "learning_rate": 0.000655025969949059,
+      "loss": 4.7022,
+      "step": 196000
+    },
+    {
+      "epoch": 1.0467940932046282,
+      "grad_norm": 0.8377756476402283,
+      "learning_rate": 0.0006541359284256788,
+      "loss": 4.6976,
+      "step": 196500
+    },
+    {
+      "epoch": 1.0494576914061668,
+      "grad_norm": 0.87883460521698,
+      "learning_rate": 0.0006532441032519512,
+      "loss": 4.6979,
+      "step": 197000
+    },
+    {
+      "epoch": 1.0521212896077052,
+      "grad_norm": 0.8691816926002502,
+      "learning_rate": 0.0006523522780782239,
+      "loss": 4.7022,
+      "step": 197500
+    },
+    {
+      "epoch": 1.0547848878092438,
+      "grad_norm": 0.8359382748603821,
+      "learning_rate": 0.0006514604529044963,
+      "loss": 4.6971,
+      "step": 198000
+    },
+    {
+      "epoch": 1.0574484860107822,
+      "grad_norm": 1.1580724716186523,
+      "learning_rate": 0.0006505704113811162,
+      "loss": 4.7096,
+      "step": 198500
+    },
+    {
+      "epoch": 1.0601120842123208,
+      "grad_norm": 0.9052878618240356,
+      "learning_rate": 0.0006496785862073886,
+      "loss": 4.6942,
+      "step": 199000
+    },
+    {
+      "epoch": 1.0627756824138592,
+      "grad_norm": 0.8252458572387695,
+      "learning_rate": 0.000648786761033661,
+      "loss": 4.6964,
+      "step": 199500
+    },
+    {
+      "epoch": 1.0654392806153978,
+      "grad_norm": 0.8529119491577148,
+      "learning_rate": 0.0006478949358599335,
+      "loss": 4.6997,
+      "step": 200000
+    },
+    {
+      "epoch": 1.0681028788169362,
+      "grad_norm": 0.7900977730751038,
+      "learning_rate": 0.0006470048943365535,
+      "loss": 4.6909,
+      "step": 200500
+    },
+    {
+      "epoch": 1.0707664770184748,
+      "grad_norm": 0.8846203088760376,
+      "learning_rate": 0.0006461130691628259,
+      "loss": 4.6958,
+      "step": 201000
+    },
+    {
+      "epoch": 1.0734300752200132,
+      "grad_norm": 0.9266300797462463,
+      "learning_rate": 0.0006452212439890983,
+      "loss": 4.699,
+      "step": 201500
+    },
+    {
+      "epoch": 1.0760936734215516,
+      "grad_norm": 0.8480575680732727,
+      "learning_rate": 0.0006443294188153707,
+      "loss": 4.702,
+      "step": 202000
+    },
+    {
+      "epoch": 1.0787572716230902,
+      "grad_norm": 0.8029345870018005,
+      "learning_rate": 0.0006434393772919907,
+      "loss": 4.698,
+      "step": 202500
+    },
+    {
+      "epoch": 1.0814208698246286,
+      "grad_norm": 0.9869500398635864,
+      "learning_rate": 0.0006425475521182633,
+      "loss": 4.6991,
+      "step": 203000
+    },
+    {
+      "epoch": 1.0840844680261672,
+      "grad_norm": 0.8242001533508301,
+      "learning_rate": 0.0006416557269445357,
+      "loss": 4.6968,
+      "step": 203500
+    },
+    {
+      "epoch": 1.0867480662277056,
+      "grad_norm": 0.9248818755149841,
+      "learning_rate": 0.0006407639017708081,
+      "loss": 4.6906,
+      "step": 204000
+    },
+    {
+      "epoch": 1.0894116644292442,
+      "grad_norm": 0.8327652812004089,
+      "learning_rate": 0.000639873860247428,
+      "loss": 4.6968,
+      "step": 204500
+    },
+    {
+      "epoch": 1.0920752626307826,
+      "grad_norm": 0.898684024810791,
+      "learning_rate": 0.0006389820350737004,
+      "loss": 4.693,
+      "step": 205000
+    },
+    {
+      "epoch": 1.0947388608323212,
+      "grad_norm": 0.826521635055542,
+      "learning_rate": 0.000638090209899973,
+      "loss": 4.7004,
+      "step": 205500
+    },
+    {
+      "epoch": 1.0974024590338596,
+      "grad_norm": 0.8696659803390503,
+      "learning_rate": 0.0006371983847262454,
+      "loss": 4.6999,
+      "step": 206000
+    },
+    {
+      "epoch": 1.1000660572353982,
+      "grad_norm": 0.8574073910713196,
+      "learning_rate": 0.0006363083432028652,
+      "loss": 4.6929,
+      "step": 206500
+    },
+    {
+      "epoch": 1.1027296554369366,
+      "grad_norm": 0.857872486114502,
+      "learning_rate": 0.0006354165180291377,
+      "loss": 4.6982,
+      "step": 207000
+    },
+    {
+      "epoch": 1.1053932536384752,
+      "grad_norm": 0.9049299359321594,
+      "learning_rate": 0.0006345246928554101,
+      "loss": 4.6986,
+      "step": 207500
+    },
+    {
+      "epoch": 1.1080568518400136,
+      "grad_norm": 0.8885313868522644,
+      "learning_rate": 0.0006336328676816825,
+      "loss": 4.6961,
+      "step": 208000
+    },
+    {
+      "epoch": 1.1107204500415522,
+      "grad_norm": 0.885249674320221,
+      "learning_rate": 0.0006327428261583026,
+      "loss": 4.6914,
+      "step": 208500
+    },
+    {
+      "epoch": 1.1133840482430906,
+      "grad_norm": 0.8557220101356506,
+      "learning_rate": 0.0006318510009845751,
+      "loss": 4.6979,
+      "step": 209000
+    },
+    {
+      "epoch": 1.1160476464446292,
+      "grad_norm": 0.850692868232727,
+      "learning_rate": 0.0006309591758108475,
+      "loss": 4.6923,
+      "step": 209500
+    },
+    {
+      "epoch": 1.1187112446461676,
+      "grad_norm": 0.8561812043190002,
+      "learning_rate": 0.0006300673506371199,
+      "loss": 4.696,
+      "step": 210000
+    },
+    {
+      "epoch": 1.1213748428477062,
+      "grad_norm": 0.8279117941856384,
+      "learning_rate": 0.0006291773091137398,
+      "loss": 4.6948,
+      "step": 210500
+    },
+    {
+      "epoch": 1.1240384410492446,
+      "grad_norm": 1.035873293876648,
+      "learning_rate": 0.0006282854839400123,
+      "loss": 4.6941,
+      "step": 211000
+    },
+    {
+      "epoch": 1.1267020392507832,
+      "grad_norm": 0.9458531141281128,
+      "learning_rate": 0.0006273936587662848,
+      "loss": 4.7018,
+      "step": 211500
+    },
+    {
+      "epoch": 1.1293656374523215,
+      "grad_norm": 0.9144226908683777,
+      "learning_rate": 0.0006265018335925572,
+      "loss": 4.694,
+      "step": 212000
+    },
+    {
+      "epoch": 1.1320292356538602,
+      "grad_norm": 0.8634624481201172,
+      "learning_rate": 0.000625611792069177,
+      "loss": 4.6982,
+      "step": 212500
+    },
+    {
+      "epoch": 1.1346928338553985,
+      "grad_norm": 0.8695416450500488,
+      "learning_rate": 0.0006247199668954495,
+      "loss": 4.6966,
+      "step": 213000
+    },
+    {
+      "epoch": 1.1373564320569371,
+      "grad_norm": 0.8389537930488586,
+      "learning_rate": 0.000623828141721722,
+      "loss": 4.6981,
+      "step": 213500
+    },
+    {
+      "epoch": 1.1400200302584755,
+      "grad_norm": 0.8467423915863037,
+      "learning_rate": 0.0006229363165479945,
+      "loss": 4.6901,
+      "step": 214000
+    },
+    {
+      "epoch": 1.1426836284600141,
+      "grad_norm": 0.8449163436889648,
+      "learning_rate": 0.0006220462750246144,
+      "loss": 4.6951,
+      "step": 214500
+    },
+    {
+      "epoch": 1.1453472266615525,
+      "grad_norm": 0.866750180721283,
+      "learning_rate": 0.0006211544498508869,
+      "loss": 4.6976,
+      "step": 215000
+    },
+    {
+      "epoch": 1.1480108248630911,
+      "grad_norm": 0.8245420455932617,
+      "learning_rate": 0.0006202626246771593,
+      "loss": 4.6912,
+      "step": 215500
+    },
+    {
+      "epoch": 1.1506744230646295,
+      "grad_norm": 0.8339635133743286,
+      "learning_rate": 0.0006193707995034318,
+      "loss": 4.6915,
+      "step": 216000
+    },
+    {
+      "epoch": 1.1533380212661681,
+      "grad_norm": 0.8900044560432434,
+      "learning_rate": 0.0006184807579800517,
+      "loss": 4.6919,
+      "step": 216500
+    },
+    {
+      "epoch": 1.1560016194677065,
+      "grad_norm": 0.9079304337501526,
+      "learning_rate": 0.0006175889328063241,
+      "loss": 4.6921,
+      "step": 217000
+    },
+    {
+      "epoch": 1.158665217669245,
+      "grad_norm": 0.8826993107795715,
+      "learning_rate": 0.0006166971076325966,
+      "loss": 4.6901,
+      "step": 217500
+    },
+    {
+      "epoch": 1.1613288158707835,
+      "grad_norm": 0.8574143052101135,
+      "learning_rate": 0.000615805282458869,
+      "loss": 4.6906,
+      "step": 218000
+    },
+    {
+      "epoch": 1.1639924140723221,
+      "grad_norm": 0.8516850471496582,
+      "learning_rate": 0.0006149152409354888,
+      "loss": 4.6973,
+      "step": 218500
+    },
+    {
+      "epoch": 1.1666560122738605,
+      "grad_norm": 1.0509278774261475,
+      "learning_rate": 0.0006140234157617614,
+      "loss": 4.6885,
+      "step": 219000
+    },
+    {
+      "epoch": 1.169319610475399,
+      "grad_norm": 0.8757687211036682,
+      "learning_rate": 0.0006131315905880338,
+      "loss": 4.6948,
+      "step": 219500
+    },
+    {
+      "epoch": 1.1719832086769375,
+      "grad_norm": 0.8677180409431458,
+      "learning_rate": 0.0006122397654143063,
+      "loss": 4.6943,
+      "step": 220000
+    },
+    {
+      "epoch": 1.1746468068784761,
+      "grad_norm": 0.8716105222702026,
+      "learning_rate": 0.0006113497238909262,
+      "loss": 4.695,
+      "step": 220500
+    },
+    {
+      "epoch": 1.1773104050800145,
+      "grad_norm": 0.8486727476119995,
+      "learning_rate": 0.0006104578987171987,
+      "loss": 4.6947,
+      "step": 221000
+    },
+    {
+      "epoch": 1.1799740032815529,
+      "grad_norm": 0.9231880307197571,
+      "learning_rate": 0.0006095660735434712,
+      "loss": 4.6864,
+      "step": 221500
+    },
+    {
+      "epoch": 1.1826376014830915,
+      "grad_norm": 0.9859126210212708,
+      "learning_rate": 0.0006086742483697436,
+      "loss": 4.6876,
+      "step": 222000
+    },
+    {
+      "epoch": 1.1853011996846299,
+      "grad_norm": 0.846367359161377,
+      "learning_rate": 0.0006077824231960161,
+      "loss": 4.6871,
+      "step": 222500
+    },
+    {
+      "epoch": 1.1879647978861685,
+      "grad_norm": 0.8780665397644043,
+      "learning_rate": 0.0006068923816726359,
+      "loss": 4.7,
+      "step": 223000
+    },
+    {
+      "epoch": 1.1906283960877069,
+      "grad_norm": 0.8220515847206116,
+      "learning_rate": 0.0006060005564989084,
+      "loss": 4.6904,
+      "step": 223500
+    },
+    {
+      "epoch": 1.1932919942892455,
+      "grad_norm": 0.8694311380386353,
+      "learning_rate": 0.0006051087313251809,
+      "loss": 4.6971,
+      "step": 224000
+    },
+    {
+      "epoch": 1.1959555924907839,
+      "grad_norm": 0.858805775642395,
+      "learning_rate": 0.0006042169061514533,
+      "loss": 4.6811,
+      "step": 224500
+    },
+    {
+      "epoch": 1.1986191906923225,
+      "grad_norm": 0.976883053779602,
+      "learning_rate": 0.0006033268646280733,
+      "loss": 4.7028,
+      "step": 225000
+    },
+    {
+      "epoch": 1.2012827888938609,
+      "grad_norm": 0.8692370653152466,
+      "learning_rate": 0.0006024350394543457,
+      "loss": 4.6903,
+      "step": 225500
+    },
+    {
+      "epoch": 1.2039463870953995,
+      "grad_norm": 0.929794192314148,
+      "learning_rate": 0.0006015432142806182,
+      "loss": 4.6896,
+      "step": 226000
+    },
+    {
+      "epoch": 1.2066099852969379,
+      "grad_norm": 0.8333790898323059,
+      "learning_rate": 0.0006006513891068907,
+      "loss": 4.691,
+      "step": 226500
+    },
+    {
+      "epoch": 1.2092735834984765,
+      "grad_norm": 0.8712317943572998,
+      "learning_rate": 0.0005997613475835106,
+      "loss": 4.6832,
+      "step": 227000
+    },
+    {
+      "epoch": 1.2119371817000149,
+      "grad_norm": 0.9365465641021729,
+      "learning_rate": 0.000598869522409783,
+      "loss": 4.6856,
+      "step": 227500
+    },
+    {
+      "epoch": 1.2146007799015535,
+      "grad_norm": 0.8496169447898865,
+      "learning_rate": 0.0005979776972360554,
+      "loss": 4.6923,
+      "step": 228000
+    },
+    {
+      "epoch": 1.2172643781030918,
+      "grad_norm": 0.9000328779220581,
+      "learning_rate": 0.0005970858720623279,
+      "loss": 4.6847,
+      "step": 228500
+    },
+    {
+      "epoch": 1.2199279763046305,
+      "grad_norm": 0.8945797681808472,
+      "learning_rate": 0.0005961958305389477,
+      "loss": 4.6902,
+      "step": 229000
+    },
+    {
+      "epoch": 1.2225915745061688,
+      "grad_norm": 0.8715533018112183,
+      "learning_rate": 0.0005953040053652203,
+      "loss": 4.6879,
+      "step": 229500
+    },
+    {
+      "epoch": 1.2252551727077075,
+      "grad_norm": 0.9229386448860168,
+      "learning_rate": 0.0005944121801914927,
+      "loss": 4.6897,
+      "step": 230000
+    },
+    {
+      "epoch": 1.2279187709092458,
+      "grad_norm": 0.8467351198196411,
+      "learning_rate": 0.0005935203550177651,
+      "loss": 4.6879,
+      "step": 230500
+    },
+    {
+      "epoch": 1.2305823691107844,
+      "grad_norm": 0.823901891708374,
+      "learning_rate": 0.0005926303134943851,
+      "loss": 4.6885,
+      "step": 231000
+    },
+    {
+      "epoch": 1.2332459673123228,
+      "grad_norm": 0.8735817074775696,
+      "learning_rate": 0.0005917384883206575,
+      "loss": 4.691,
+      "step": 231500
+    },
+    {
+      "epoch": 1.2359095655138614,
+      "grad_norm": 0.8728153109550476,
+      "learning_rate": 0.0005908466631469301,
+      "loss": 4.6857,
+      "step": 232000
+    },
+    {
+      "epoch": 1.2385731637153998,
+      "grad_norm": 0.8330144882202148,
+      "learning_rate": 0.0005899548379732025,
+      "loss": 4.6904,
+      "step": 232500
+    },
+    {
+      "epoch": 1.2412367619169384,
+      "grad_norm": 0.973419725894928,
+      "learning_rate": 0.0005890647964498224,
+      "loss": 4.6829,
+      "step": 233000
+    },
+    {
+      "epoch": 1.2439003601184768,
+      "grad_norm": 0.8705942034721375,
+      "learning_rate": 0.0005881729712760948,
+      "loss": 4.6946,
+      "step": 233500
+    },
+    {
+      "epoch": 1.2465639583200154,
+      "grad_norm": 0.8772411942481995,
+      "learning_rate": 0.0005872829297527147,
+      "loss": 4.6866,
+      "step": 234000
+    },
+    {
+      "epoch": 1.2492275565215538,
+      "grad_norm": 0.8856297135353088,
+      "learning_rate": 0.0005863911045789872,
+      "loss": 4.6899,
+      "step": 234500
+    },
+    {
+      "epoch": 1.2518911547230922,
+      "grad_norm": 0.9031875133514404,
+      "learning_rate": 0.0005854992794052596,
+      "loss": 4.6917,
+      "step": 235000
+    },
+    {
+      "epoch": 1.2545547529246308,
+      "grad_norm": 1.0048778057098389,
+      "learning_rate": 0.0005846074542315321,
+      "loss": 4.6837,
+      "step": 235500
+    },
+    {
+      "epoch": 1.2572183511261694,
+      "grad_norm": 0.8882681727409363,
+      "learning_rate": 0.0005837156290578045,
+      "loss": 4.6813,
+      "step": 236000
+    },
+    {
+      "epoch": 1.2598819493277078,
+      "grad_norm": 0.8688454627990723,
+      "learning_rate": 0.0005828238038840769,
+      "loss": 4.6774,
+      "step": 236500
+    },
+    {
+      "epoch": 1.2625455475292462,
+      "grad_norm": 0.9453760385513306,
+      "learning_rate": 0.0005819319787103495,
+      "loss": 4.6889,
+      "step": 237000
+    },
+    {
+      "epoch": 1.2652091457307848,
+      "grad_norm": 0.8512344360351562,
+      "learning_rate": 0.0005810401535366219,
+      "loss": 4.6835,
+      "step": 237500
+    },
+    {
+      "epoch": 1.2678727439323234,
+      "grad_norm": 0.858352541923523,
+      "learning_rate": 0.0005801501120132419,
+      "loss": 4.6872,
+      "step": 238000
+    },
+    {
+      "epoch": 1.2705363421338618,
+      "grad_norm": 0.8966683149337769,
+      "learning_rate": 0.0005792582868395143,
+      "loss": 4.6812,
+      "step": 238500
+    },
+    {
+      "epoch": 1.2731999403354002,
+      "grad_norm": 1.3160330057144165,
+      "learning_rate": 0.0005783664616657867,
+      "loss": 4.6867,
+      "step": 239000
+    },
+    {
+      "epoch": 1.2758635385369388,
+      "grad_norm": 0.8476753830909729,
+      "learning_rate": 0.0005774746364920593,
+      "loss": 4.6908,
+      "step": 239500
+    },
+    {
+      "epoch": 1.2785271367384774,
+      "grad_norm": 0.9309910535812378,
+      "learning_rate": 0.0005765845949686791,
+      "loss": 4.6763,
+      "step": 240000
+    },
+    {
+      "epoch": 1.2811907349400158,
+      "grad_norm": 0.8932083249092102,
+      "learning_rate": 0.0005756927697949516,
+      "loss": 4.6868,
+      "step": 240500
+    },
+    {
+      "epoch": 1.2838543331415542,
+      "grad_norm": 0.8718700408935547,
+      "learning_rate": 0.000574800944621224,
+      "loss": 4.6848,
+      "step": 241000
+    },
+    {
+      "epoch": 1.2865179313430928,
+      "grad_norm": 0.8954005837440491,
+      "learning_rate": 0.0005739091194474964,
+      "loss": 4.6781,
+      "step": 241500
+    },
+    {
+      "epoch": 1.2891815295446314,
+      "grad_norm": 0.8750497698783875,
+      "learning_rate": 0.0005730190779241163,
+      "loss": 4.6871,
+      "step": 242000
+    },
+    {
+      "epoch": 1.2918451277461698,
+      "grad_norm": 0.8953655362129211,
+      "learning_rate": 0.0005721272527503888,
+      "loss": 4.688,
+      "step": 242500
+    },
+    {
+      "epoch": 1.2945087259477082,
+      "grad_norm": 0.8756445050239563,
+      "learning_rate": 0.0005712354275766613,
+      "loss": 4.6832,
+      "step": 243000
+    },
+    {
+      "epoch": 1.2971723241492468,
+      "grad_norm": 0.842310905456543,
+      "learning_rate": 0.0005703436024029338,
+      "loss": 4.683,
+      "step": 243500
+    },
+    {
+      "epoch": 1.2998359223507852,
+      "grad_norm": 0.8197309970855713,
+      "learning_rate": 0.0005694535608795537,
+      "loss": 4.6839,
+      "step": 244000
+    },
+    {
+      "epoch": 1.3024995205523238,
+      "grad_norm": 0.8899139165878296,
+      "learning_rate": 0.0005685617357058261,
+      "loss": 4.6937,
+      "step": 244500
+    },
+    {
+      "epoch": 1.3051631187538622,
+      "grad_norm": 0.9787269830703735,
+      "learning_rate": 0.0005676699105320986,
+      "loss": 4.686,
+      "step": 245000
+    },
+    {
+      "epoch": 1.3078267169554008,
+      "grad_norm": 0.936326265335083,
+      "learning_rate": 0.0005667780853583711,
+      "loss": 4.6823,
+      "step": 245500
+    },
+    {
+      "epoch": 1.3104903151569391,
+      "grad_norm": 0.8806182742118835,
+      "learning_rate": 0.0005658880438349909,
+      "loss": 4.6825,
+      "step": 246000
+    },
+    {
+      "epoch": 1.3131539133584778,
+      "grad_norm": 0.9858034253120422,
+      "learning_rate": 0.0005649962186612634,
+      "loss": 4.6833,
+      "step": 246500
+    },
+    {
+      "epoch": 1.3158175115600161,
+      "grad_norm": 1.0451375246047974,
+      "learning_rate": 0.0005641043934875358,
+      "loss": 4.6828,
+      "step": 247000
+    },
+    {
+      "epoch": 1.3184811097615547,
+      "grad_norm": 0.9298591613769531,
+      "learning_rate": 0.0005632125683138083,
+      "loss": 4.6864,
+      "step": 247500
+    },
+    {
+      "epoch": 1.3211447079630931,
+      "grad_norm": 0.8836386799812317,
+      "learning_rate": 0.0005623207431400808,
+      "loss": 4.6793,
+      "step": 248000
+    },
+    {
+      "epoch": 1.3238083061646317,
+      "grad_norm": 0.820397138595581,
+      "learning_rate": 0.0005614307016167007,
+      "loss": 4.6895,
+      "step": 248500
+    },
+    {
+      "epoch": 1.3264719043661701,
+      "grad_norm": 0.9033796787261963,
+      "learning_rate": 0.0005605388764429732,
+      "loss": 4.6778,
+      "step": 249000
+    },
+    {
+      "epoch": 1.3291355025677087,
+      "grad_norm": 0.9165984988212585,
+      "learning_rate": 0.0005596470512692456,
+      "loss": 4.6823,
+      "step": 249500
+    },
+    {
+      "epoch": 1.3317991007692471,
+      "grad_norm": 0.8427574038505554,
+      "learning_rate": 0.0005587552260955181,
+      "loss": 4.6832,
+      "step": 250000
+    },
+    {
+      "epoch": 1.3344626989707857,
+      "grad_norm": 0.8803266286849976,
+      "learning_rate": 0.0005578634009217906,
+      "loss": 4.685,
+      "step": 250500
+    },
+    {
+      "epoch": 1.3371262971723241,
+      "grad_norm": 0.8542140126228333,
+      "learning_rate": 0.0005569733593984104,
+      "loss": 4.6851,
+      "step": 251000
+    },
+    {
+      "epoch": 1.3397898953738627,
+      "grad_norm": 0.9896337389945984,
+      "learning_rate": 0.0005560815342246829,
+      "loss": 4.6839,
+      "step": 251500
+    },
+    {
+      "epoch": 1.3424534935754011,
+      "grad_norm": 0.8505421876907349,
+      "learning_rate": 0.0005551897090509553,
+      "loss": 4.6808,
+      "step": 252000
+    },
+    {
+      "epoch": 1.3451170917769395,
+      "grad_norm": 0.9560419917106628,
+      "learning_rate": 0.0005542978838772278,
+      "loss": 4.6787,
+      "step": 252500
+    },
+    {
+      "epoch": 1.347780689978478,
+      "grad_norm": 0.9107364416122437,
+      "learning_rate": 0.0005534078423538477,
+      "loss": 4.6808,
+      "step": 253000
+    },
+    {
+      "epoch": 1.3504442881800167,
+      "grad_norm": 0.8997001647949219,
+      "learning_rate": 0.0005525160171801201,
+      "loss": 4.6799,
+      "step": 253500
+    },
+    {
+      "epoch": 1.353107886381555,
+      "grad_norm": 0.9192373752593994,
+      "learning_rate": 0.0005516241920063926,
+      "loss": 4.6837,
+      "step": 254000
+    },
+    {
+      "epoch": 1.3557714845830935,
+      "grad_norm": 1.058236837387085,
+      "learning_rate": 0.000550732366832665,
+      "loss": 4.6773,
+      "step": 254500
+    },
+    {
+      "epoch": 1.358435082784632,
+      "grad_norm": 0.949219286441803,
+      "learning_rate": 0.000549842325309285,
+      "loss": 4.6826,
+      "step": 255000
+    },
+    {
+      "epoch": 1.3610986809861707,
+      "grad_norm": 0.8578962087631226,
+      "learning_rate": 0.0005489505001355575,
+      "loss": 4.6865,
+      "step": 255500
+    },
+    {
+      "epoch": 1.363762279187709,
+      "grad_norm": 0.9393055438995361,
+      "learning_rate": 0.00054805867496183,
+      "loss": 4.6774,
+      "step": 256000
+    },
+    {
+      "epoch": 1.3664258773892475,
+      "grad_norm": 1.0173401832580566,
+      "learning_rate": 0.0005471668497881024,
+      "loss": 4.6783,
+      "step": 256500
+    },
+    {
+      "epoch": 1.369089475590786,
+      "grad_norm": 0.8577682971954346,
+      "learning_rate": 0.0005462768082647222,
+      "loss": 4.6789,
+      "step": 257000
+    },
+    {
+      "epoch": 1.3717530737923247,
+      "grad_norm": 0.9181286692619324,
+      "learning_rate": 0.0005453849830909947,
+      "loss": 4.6754,
+      "step": 257500
+    },
+    {
+      "epoch": 1.374416671993863,
+      "grad_norm": 0.8773962259292603,
+      "learning_rate": 0.0005444931579172672,
+      "loss": 4.6882,
+      "step": 258000
+    },
+    {
+      "epoch": 1.3770802701954015,
+      "grad_norm": 0.9522156715393066,
+      "learning_rate": 0.0005436013327435396,
+      "loss": 4.684,
+      "step": 258500
+    },
+    {
+      "epoch": 1.37974386839694,
+      "grad_norm": 0.8997749090194702,
+      "learning_rate": 0.0005427112912201595,
+      "loss": 4.6814,
+      "step": 259000
+    },
+    {
+      "epoch": 1.3824074665984787,
+      "grad_norm": 0.8679563403129578,
+      "learning_rate": 0.0005418194660464319,
+      "loss": 4.6849,
+      "step": 259500
+    },
+    {
+      "epoch": 1.385071064800017,
+      "grad_norm": 0.8472937345504761,
+      "learning_rate": 0.0005409276408727044,
+      "loss": 4.6765,
+      "step": 260000
+    },
+    {
+      "epoch": 1.3877346630015555,
+      "grad_norm": 0.9184697270393372,
+      "learning_rate": 0.0005400358156989769,
+      "loss": 4.6795,
+      "step": 260500
+    },
+    {
+      "epoch": 1.390398261203094,
+      "grad_norm": 0.8961514234542847,
+      "learning_rate": 0.0005391457741755969,
+      "loss": 4.6798,
+      "step": 261000
+    },
+    {
+      "epoch": 1.3930618594046325,
+      "grad_norm": 0.9035218954086304,
+      "learning_rate": 0.0005382539490018693,
+      "loss": 4.6804,
+      "step": 261500
+    },
+    {
+      "epoch": 1.395725457606171,
+      "grad_norm": 0.9542981386184692,
+      "learning_rate": 0.0005373621238281417,
+      "loss": 4.6763,
+      "step": 262000
+    },
+    {
+      "epoch": 1.3983890558077094,
+      "grad_norm": 0.8902364373207092,
+      "learning_rate": 0.0005364702986544142,
+      "loss": 4.679,
+      "step": 262500
+    },
+    {
+      "epoch": 1.401052654009248,
+      "grad_norm": 0.8759614825248718,
+      "learning_rate": 0.000535580257131034,
+      "loss": 4.6819,
+      "step": 263000
+    },
+    {
+      "epoch": 1.4037162522107864,
+      "grad_norm": 0.9290657043457031,
+      "learning_rate": 0.0005346884319573066,
+      "loss": 4.6787,
+      "step": 263500
+    },
+    {
+      "epoch": 1.406379850412325,
+      "grad_norm": 1.0657765865325928,
+      "learning_rate": 0.000533796606783579,
+      "loss": 4.6805,
+      "step": 264000
+    },
+    {
+      "epoch": 1.4090434486138634,
+      "grad_norm": 0.9341272711753845,
+      "learning_rate": 0.0005329047816098514,
+      "loss": 4.6824,
+      "step": 264500
+    },
+    {
+      "epoch": 1.411707046815402,
+      "grad_norm": 0.8521129488945007,
+      "learning_rate": 0.0005320147400864713,
+      "loss": 4.6773,
+      "step": 265000
+    },
+    {
+      "epoch": 1.4143706450169404,
+      "grad_norm": 0.9178290367126465,
+      "learning_rate": 0.0005311229149127437,
+      "loss": 4.6795,
+      "step": 265500
+    },
+    {
+      "epoch": 1.417034243218479,
+      "grad_norm": 0.9215536713600159,
+      "learning_rate": 0.0005302310897390163,
+      "loss": 4.6743,
+      "step": 266000
+    },
+    {
+      "epoch": 1.4196978414200174,
+      "grad_norm": 0.9139541387557983,
+      "learning_rate": 0.0005293392645652888,
+      "loss": 4.6732,
+      "step": 266500
+    },
+    {
+      "epoch": 1.422361439621556,
+      "grad_norm": 0.9697835445404053,
+      "learning_rate": 0.0005284492230419087,
+      "loss": 4.6846,
+      "step": 267000
+    },
+    {
+      "epoch": 1.4250250378230944,
+      "grad_norm": 0.9869498610496521,
+      "learning_rate": 0.0005275573978681811,
+      "loss": 4.6732,
+      "step": 267500
+    },
+    {
+      "epoch": 1.427688636024633,
+      "grad_norm": 0.8871563673019409,
+      "learning_rate": 0.0005266655726944535,
+      "loss": 4.6838,
+      "step": 268000
+    },
+    {
+      "epoch": 1.4303522342261714,
+      "grad_norm": 0.9272149205207825,
+      "learning_rate": 0.0005257737475207261,
+      "loss": 4.6749,
+      "step": 268500
+    },
+    {
+      "epoch": 1.43301583242771,
+      "grad_norm": 0.8581441640853882,
+      "learning_rate": 0.000524883705997346,
+      "loss": 4.677,
+      "step": 269000
+    },
+    {
+      "epoch": 1.4356794306292484,
+      "grad_norm": 0.9157629609107971,
+      "learning_rate": 0.0005239918808236184,
+      "loss": 4.6756,
+      "step": 269500
+    },
+    {
+      "epoch": 1.4383430288307868,
+      "grad_norm": 0.9694315195083618,
+      "learning_rate": 0.0005231000556498908,
+      "loss": 4.6721,
+      "step": 270000
+    },
+    {
+      "epoch": 1.4410066270323254,
+      "grad_norm": 0.905512809753418,
+      "learning_rate": 0.0005222082304761632,
+      "loss": 4.6782,
+      "step": 270500
+    },
+    {
+      "epoch": 1.443670225233864,
+      "grad_norm": 0.8765866160392761,
+      "learning_rate": 0.0005213181889527832,
+      "loss": 4.6692,
+      "step": 271000
+    },
+    {
+      "epoch": 1.4463338234354024,
+      "grad_norm": 0.9838495850563049,
+      "learning_rate": 0.0005204263637790558,
+      "loss": 4.6801,
+      "step": 271500
+    },
+    {
+      "epoch": 1.4489974216369408,
+      "grad_norm": 0.9424646496772766,
+      "learning_rate": 0.0005195345386053282,
+      "loss": 4.6701,
+      "step": 272000
+    },
+    {
+      "epoch": 1.4516610198384794,
+      "grad_norm": 0.8656395077705383,
+      "learning_rate": 0.0005186427134316006,
+      "loss": 4.6775,
+      "step": 272500
+    },
+    {
+      "epoch": 1.454324618040018,
+      "grad_norm": 0.9255796074867249,
+      "learning_rate": 0.0005177526719082205,
+      "loss": 4.6753,
+      "step": 273000
+    },
+    {
+      "epoch": 1.4569882162415564,
+      "grad_norm": 0.9551495313644409,
+      "learning_rate": 0.0005168608467344929,
+      "loss": 4.6697,
+      "step": 273500
+    },
+    {
+      "epoch": 1.4596518144430948,
+      "grad_norm": 0.9294918179512024,
+      "learning_rate": 0.0005159690215607655,
+      "loss": 4.678,
+      "step": 274000
+    },
+    {
+      "epoch": 1.4623154126446334,
+      "grad_norm": 0.8683546185493469,
+      "learning_rate": 0.0005150771963870379,
+      "loss": 4.6716,
+      "step": 274500
+    },
+    {
+      "epoch": 1.464979010846172,
+      "grad_norm": 0.9196661710739136,
+      "learning_rate": 0.0005141871548636577,
+      "loss": 4.6811,
+      "step": 275000
+    },
+    {
+      "epoch": 1.4676426090477104,
+      "grad_norm": 0.8748793005943298,
+      "learning_rate": 0.0005132953296899302,
+      "loss": 4.6676,
+      "step": 275500
+    },
+    {
+      "epoch": 1.4703062072492488,
+      "grad_norm": 0.941302478313446,
+      "learning_rate": 0.0005124035045162026,
+      "loss": 4.677,
+      "step": 276000
+    },
+    {
+      "epoch": 1.4729698054507874,
+      "grad_norm": 0.8474987149238586,
+      "learning_rate": 0.0005115116793424752,
+      "loss": 4.6656,
+      "step": 276500
+    },
+    {
+      "epoch": 1.475633403652326,
+      "grad_norm": 0.9448440074920654,
+      "learning_rate": 0.0005106216378190951,
+      "loss": 4.6719,
+      "step": 277000
+    },
+    {
+      "epoch": 1.4782970018538644,
+      "grad_norm": 0.8382176160812378,
+      "learning_rate": 0.0005097298126453676,
+      "loss": 4.6685,
+      "step": 277500
+    },
+    {
+      "epoch": 1.4809606000554028,
+      "grad_norm": 0.8633021116256714,
+      "learning_rate": 0.00050883798747164,
+      "loss": 4.6732,
+      "step": 278000
+    },
+    {
+      "epoch": 1.4836241982569414,
+      "grad_norm": 0.9060601592063904,
+      "learning_rate": 0.0005079461622979124,
+      "loss": 4.6709,
+      "step": 278500
+    },
+    {
+      "epoch": 1.4862877964584797,
+      "grad_norm": 0.8984940648078918,
+      "learning_rate": 0.0005070561207745324,
+      "loss": 4.675,
+      "step": 279000
+    },
+    {
+      "epoch": 1.4889513946600184,
+      "grad_norm": 0.8761520385742188,
+      "learning_rate": 0.0005061642956008048,
+      "loss": 4.6719,
+      "step": 279500
+    },
+    {
+      "epoch": 1.4916149928615567,
+      "grad_norm": 0.934901773929596,
+      "learning_rate": 0.0005052724704270773,
+      "loss": 4.6705,
+      "step": 280000
+    },
+    {
+      "epoch": 1.4942785910630954,
+      "grad_norm": 0.927005410194397,
+      "learning_rate": 0.0005043806452533497,
+      "loss": 4.6736,
+      "step": 280500
+    },
+    {
+      "epoch": 1.4969421892646337,
+      "grad_norm": 0.9266989827156067,
+      "learning_rate": 0.0005034888200796221,
+      "loss": 4.6773,
+      "step": 281000
+    },
+    {
+      "epoch": 1.4996057874661723,
+      "grad_norm": 0.8785182237625122,
+      "learning_rate": 0.000502598778556242,
+      "loss": 4.6678,
+      "step": 281500
+    },
+    {
+      "epoch": 1.5022693856677107,
+      "grad_norm": 1.0172791481018066,
+      "learning_rate": 0.0005017069533825145,
+      "loss": 4.6752,
+      "step": 282000
+    },
+    {
+      "epoch": 1.5049329838692493,
+      "grad_norm": 0.9704791307449341,
+      "learning_rate": 0.000500815128208787,
+      "loss": 4.6635,
+      "step": 282500
+    },
+    {
+      "epoch": 1.507596582070788,
+      "grad_norm": 0.9124333262443542,
+      "learning_rate": 0.0004999233030350595,
+      "loss": 4.6748,
+      "step": 283000
+    },
+    {
+      "epoch": 1.5102601802723261,
+      "grad_norm": 0.9736974835395813,
+      "learning_rate": 0.0004990332615116794,
+      "loss": 4.6747,
+      "step": 283500
+    },
+    {
+      "epoch": 1.5129237784738647,
+      "grad_norm": 0.9330904483795166,
+      "learning_rate": 0.0004981414363379518,
+      "loss": 4.6695,
+      "step": 284000
+    },
+    {
+      "epoch": 1.5155873766754033,
+      "grad_norm": 1.0524863004684448,
+      "learning_rate": 0.0004972496111642243,
+      "loss": 4.6699,
+      "step": 284500
+    },
+    {
+      "epoch": 1.5182509748769417,
+      "grad_norm": 0.8803556561470032,
+      "learning_rate": 0.0004963577859904968,
+      "loss": 4.6711,
+      "step": 285000
+    },
+    {
+      "epoch": 1.52091457307848,
+      "grad_norm": 0.9705889225006104,
+      "learning_rate": 0.0004954677444671166,
+      "loss": 4.6705,
+      "step": 285500
+    },
+    {
+      "epoch": 1.5235781712800187,
+      "grad_norm": 0.928056001663208,
+      "learning_rate": 0.000494575919293389,
+      "loss": 4.6728,
+      "step": 286000
+    },
+    {
+      "epoch": 1.5262417694815573,
+      "grad_norm": 0.9061446785926819,
+      "learning_rate": 0.0004936840941196615,
+      "loss": 4.6669,
+      "step": 286500
+    },
+    {
+      "epoch": 1.5289053676830957,
+      "grad_norm": 0.9161142706871033,
+      "learning_rate": 0.000492792268945934,
+      "loss": 4.6661,
+      "step": 287000
+    },
+    {
+      "epoch": 1.531568965884634,
+      "grad_norm": 0.89336097240448,
+      "learning_rate": 0.0004919022274225539,
+      "loss": 4.678,
+      "step": 287500
+    },
+    {
+      "epoch": 1.5342325640861727,
+      "grad_norm": 0.886858344078064,
+      "learning_rate": 0.0004910104022488263,
+      "loss": 4.6693,
+      "step": 288000
+    },
+    {
+      "epoch": 1.5368961622877113,
+      "grad_norm": 0.8612877130508423,
+      "learning_rate": 0.0004901185770750989,
+      "loss": 4.6631,
+      "step": 288500
+    },
+    {
+      "epoch": 1.5395597604892497,
+      "grad_norm": 1.027990460395813,
+      "learning_rate": 0.0004892267519013713,
+      "loss": 4.6738,
+      "step": 289000
+    },
+    {
+      "epoch": 1.542223358690788,
+      "grad_norm": 0.8808753490447998,
+      "learning_rate": 0.0004883367103779913,
+      "loss": 4.6794,
+      "step": 289500
+    },
+    {
+      "epoch": 1.5448869568923267,
+      "grad_norm": 0.9345124363899231,
+      "learning_rate": 0.00048744488520426364,
+      "loss": 4.6642,
+      "step": 290000
+    },
+    {
+      "epoch": 1.5475505550938653,
+      "grad_norm": 0.8728553652763367,
+      "learning_rate": 0.0004865530600305361,
+      "loss": 4.6744,
+      "step": 290500
+    },
+    {
+      "epoch": 1.5502141532954037,
+      "grad_norm": 1.0889195203781128,
+      "learning_rate": 0.00048566123485680856,
+      "loss": 4.669,
+      "step": 291000
+    },
+    {
+      "epoch": 1.552877751496942,
+      "grad_norm": 0.9284191727638245,
+      "learning_rate": 0.0004847711933334285,
+      "loss": 4.669,
+      "step": 291500
+    },
+    {
+      "epoch": 1.5555413496984807,
+      "grad_norm": 0.8793983459472656,
+      "learning_rate": 0.0004838793681597009,
+      "loss": 4.675,
+      "step": 292000
+    },
+    {
+      "epoch": 1.5582049479000193,
+      "grad_norm": 0.8682650327682495,
+      "learning_rate": 0.00048298754298597334,
+      "loss": 4.6614,
+      "step": 292500
+    },
+    {
+      "epoch": 1.5608685461015577,
+      "grad_norm": 0.9232677817344666,
+      "learning_rate": 0.0004820957178122458,
+      "loss": 4.668,
+      "step": 293000
+    },
+    {
+      "epoch": 1.563532144303096,
+      "grad_norm": 1.0062049627304077,
+      "learning_rate": 0.00048120567628886574,
+      "loss": 4.6651,
+      "step": 293500
+    },
+    {
+      "epoch": 1.5661957425046347,
+      "grad_norm": 0.9670103192329407,
+      "learning_rate": 0.0004803138511151382,
+      "loss": 4.6597,
+      "step": 294000
+    },
+    {
+      "epoch": 1.5688593407061733,
+      "grad_norm": 0.9307424426078796,
+      "learning_rate": 0.00047942202594141066,
+      "loss": 4.6697,
+      "step": 294500
+    },
+    {
+      "epoch": 1.5715229389077117,
+      "grad_norm": 0.8999619483947754,
+      "learning_rate": 0.0004785302007676831,
+      "loss": 4.6739,
+      "step": 295000
+    },
+    {
+      "epoch": 1.57418653710925,
+      "grad_norm": 0.927873432636261,
+      "learning_rate": 0.00047764015924430306,
+      "loss": 4.6701,
+      "step": 295500
+    },
+    {
+      "epoch": 1.5768501353107887,
+      "grad_norm": 0.966769814491272,
+      "learning_rate": 0.0004767483340705755,
+      "loss": 4.6635,
+      "step": 296000
+    },
+    {
+      "epoch": 1.5795137335123273,
+      "grad_norm": 0.9777745604515076,
+      "learning_rate": 0.0004758565088968479,
+      "loss": 4.6755,
+      "step": 296500
+    },
+    {
+      "epoch": 1.5821773317138657,
+      "grad_norm": 0.8396321535110474,
+      "learning_rate": 0.0004749646837231204,
+      "loss": 4.6731,
+      "step": 297000
+    },
+    {
+      "epoch": 1.584840929915404,
+      "grad_norm": 0.9812055826187134,
+      "learning_rate": 0.00047407464219974033,
+      "loss": 4.6723,
+      "step": 297500
+    },
+    {
+      "epoch": 1.5875045281169426,
+      "grad_norm": 0.8983718156814575,
+      "learning_rate": 0.0004731828170260128,
+      "loss": 4.6742,
+      "step": 298000
+    },
+    {
+      "epoch": 1.5901681263184813,
+      "grad_norm": 0.89915931224823,
+      "learning_rate": 0.00047229099185228525,
+      "loss": 4.6661,
+      "step": 298500
+    },
+    {
+      "epoch": 1.5928317245200196,
+      "grad_norm": 0.9202754497528076,
+      "learning_rate": 0.0004713991666785577,
+      "loss": 4.6679,
+      "step": 299000
+    },
+    {
+      "epoch": 1.595495322721558,
+      "grad_norm": 0.9377342462539673,
+      "learning_rate": 0.0004705091251551776,
+      "loss": 4.6664,
+      "step": 299500
+    },
+    {
+      "epoch": 1.5981589209230966,
+      "grad_norm": 0.8714098930358887,
+      "learning_rate": 0.00046961729998145,
+      "loss": 4.6668,
+      "step": 300000
+    },
+    {
+      "epoch": 1.6008225191246352,
+      "grad_norm": 0.884672224521637,
+      "learning_rate": 0.00046872547480772246,
+      "loss": 4.6648,
+      "step": 300500
+    },
+    {
+      "epoch": 1.6034861173261736,
+      "grad_norm": 0.9066005945205688,
+      "learning_rate": 0.00046783364963399495,
+      "loss": 4.6589,
+      "step": 301000
+    },
+    {
+      "epoch": 1.606149715527712,
+      "grad_norm": 1.024951457977295,
+      "learning_rate": 0.00046694360811061486,
+      "loss": 4.6645,
+      "step": 301500
+    },
+    {
+      "epoch": 1.6088133137292506,
+      "grad_norm": 0.9072735905647278,
+      "learning_rate": 0.00046605178293688735,
+      "loss": 4.665,
+      "step": 302000
+    },
+    {
+      "epoch": 1.611476911930789,
+      "grad_norm": 0.8979782462120056,
+      "learning_rate": 0.0004651599577631598,
+      "loss": 4.6667,
+      "step": 302500
+    },
+    {
+      "epoch": 1.6141405101323274,
+      "grad_norm": 0.8960680961608887,
+      "learning_rate": 0.0004642681325894322,
+      "loss": 4.6698,
+      "step": 303000
+    },
+    {
+      "epoch": 1.616804108333866,
+      "grad_norm": 0.9768756031990051,
+      "learning_rate": 0.0004633780910660522,
+      "loss": 4.6678,
+      "step": 303500
+    },
+    {
+      "epoch": 1.6194677065354046,
+      "grad_norm": 0.941615879535675,
+      "learning_rate": 0.0004624862658923246,
+      "loss": 4.6689,
+      "step": 304000
+    },
+    {
+      "epoch": 1.622131304736943,
+      "grad_norm": 0.9563820362091064,
+      "learning_rate": 0.0004615944407185971,
+      "loss": 4.6621,
+      "step": 304500
+    },
+    {
+      "epoch": 1.6247949029384814,
+      "grad_norm": 0.9180545806884766,
+      "learning_rate": 0.00046070261554486953,
+      "loss": 4.6667,
+      "step": 305000
+    },
+    {
+      "epoch": 1.62745850114002,
+      "grad_norm": 0.8739668726921082,
+      "learning_rate": 0.0004598125740214894,
+      "loss": 4.6711,
+      "step": 305500
+    },
+    {
+      "epoch": 1.6301220993415586,
+      "grad_norm": 1.0049022436141968,
+      "learning_rate": 0.0004589207488477619,
+      "loss": 4.6662,
+      "step": 306000
+    },
+    {
+      "epoch": 1.632785697543097,
+      "grad_norm": 0.8607634902000427,
+      "learning_rate": 0.0004580289236740343,
+      "loss": 4.6635,
+      "step": 306500
+    },
+    {
+      "epoch": 1.6354492957446354,
+      "grad_norm": 0.9192615747451782,
+      "learning_rate": 0.0004571370985003068,
+      "loss": 4.6634,
+      "step": 307000
+    },
+    {
+      "epoch": 1.638112893946174,
+      "grad_norm": 0.8758520483970642,
+      "learning_rate": 0.0004562470569769267,
+      "loss": 4.6532,
+      "step": 307500
+    },
+    {
+      "epoch": 1.6407764921477126,
+      "grad_norm": 0.8956847190856934,
+      "learning_rate": 0.00045535523180319915,
+      "loss": 4.6694,
+      "step": 308000
+    },
+    {
+      "epoch": 1.643440090349251,
+      "grad_norm": 0.8848024010658264,
+      "learning_rate": 0.00045446340662947163,
+      "loss": 4.6637,
+      "step": 308500
+    },
+    {
+      "epoch": 1.6461036885507894,
+      "grad_norm": 0.9178889989852905,
+      "learning_rate": 0.00045357158145574407,
+      "loss": 4.6706,
+      "step": 309000
+    },
+    {
+      "epoch": 1.648767286752328,
+      "grad_norm": 1.0721620321273804,
+      "learning_rate": 0.0004526797562820165,
+      "loss": 4.6548,
+      "step": 309500
+    },
+    {
+      "epoch": 1.6514308849538666,
+      "grad_norm": 0.8807909488677979,
+      "learning_rate": 0.000451787931108289,
+      "loss": 4.6666,
+      "step": 310000
+    },
+    {
+      "epoch": 1.654094483155405,
+      "grad_norm": 0.839378297328949,
+      "learning_rate": 0.0004508961059345614,
+      "loss": 4.6671,
+      "step": 310500
+    },
+    {
+      "epoch": 1.6567580813569434,
+      "grad_norm": 0.9249696135520935,
+      "learning_rate": 0.0004500042807608339,
+      "loss": 4.6696,
+      "step": 311000
+    },
+    {
+      "epoch": 1.659421679558482,
+      "grad_norm": 0.891743540763855,
+      "learning_rate": 0.0004491142392374538,
+      "loss": 4.6658,
+      "step": 311500
+    },
+    {
+      "epoch": 1.6620852777600206,
+      "grad_norm": 0.9119758009910583,
+      "learning_rate": 0.00044822241406372625,
+      "loss": 4.6668,
+      "step": 312000
+    },
+    {
+      "epoch": 1.664748875961559,
+      "grad_norm": 0.9169191122055054,
+      "learning_rate": 0.00044733058888999874,
+      "loss": 4.668,
+      "step": 312500
+    },
+    {
+      "epoch": 1.6674124741630973,
+      "grad_norm": 0.8988668918609619,
+      "learning_rate": 0.00044643876371627117,
+      "loss": 4.6602,
+      "step": 313000
+    },
+    {
+      "epoch": 1.670076072364636,
+      "grad_norm": 0.8959922194480896,
+      "learning_rate": 0.0004455487221928911,
+      "loss": 4.6628,
+      "step": 313500
+    },
+    {
+      "epoch": 1.6727396705661746,
+      "grad_norm": 0.8865877389907837,
+      "learning_rate": 0.00044465689701916357,
+      "loss": 4.6645,
+      "step": 314000
+    },
+    {
+      "epoch": 1.675403268767713,
+      "grad_norm": 0.9459559321403503,
+      "learning_rate": 0.000443765071845436,
+      "loss": 4.658,
+      "step": 314500
+    },
+    {
+      "epoch": 1.6780668669692513,
+      "grad_norm": 0.9914552569389343,
+      "learning_rate": 0.0004428732466717085,
+      "loss": 4.6663,
+      "step": 315000
+    },
+    {
+      "epoch": 1.68073046517079,
+      "grad_norm": 0.9912951588630676,
+      "learning_rate": 0.00044198320514832835,
+      "loss": 4.6658,
+      "step": 315500
+    },
+    {
+      "epoch": 1.6833940633723286,
+      "grad_norm": 0.9673342108726501,
+      "learning_rate": 0.0004410913799746008,
+      "loss": 4.6612,
+      "step": 316000
+    },
+    {
+      "epoch": 1.686057661573867,
+      "grad_norm": 0.9501271843910217,
+      "learning_rate": 0.00044019955480087327,
+      "loss": 4.6641,
+      "step": 316500
+    },
+    {
+      "epoch": 1.6887212597754053,
+      "grad_norm": 0.9438074827194214,
+      "learning_rate": 0.0004393077296271457,
+      "loss": 4.6557,
+      "step": 317000
+    },
+    {
+      "epoch": 1.691384857976944,
+      "grad_norm": 0.9112457633018494,
+      "learning_rate": 0.0004384176881037656,
+      "loss": 4.6665,
+      "step": 317500
+    },
+    {
+      "epoch": 1.6940484561784825,
+      "grad_norm": 0.9219810962677002,
+      "learning_rate": 0.0004375258629300381,
+      "loss": 4.6625,
+      "step": 318000
+    },
+    {
+      "epoch": 1.696712054380021,
+      "grad_norm": 0.8877586126327515,
+      "learning_rate": 0.00043663403775631054,
+      "loss": 4.6655,
+      "step": 318500
+    },
+    {
+      "epoch": 1.6993756525815593,
+      "grad_norm": 1.021628499031067,
+      "learning_rate": 0.000435742212582583,
+      "loss": 4.6615,
+      "step": 319000
+    },
+    {
+      "epoch": 1.702039250783098,
+      "grad_norm": 0.9514620900154114,
+      "learning_rate": 0.00043485217105920294,
+      "loss": 4.659,
+      "step": 319500
+    },
+    {
+      "epoch": 1.7047028489846365,
+      "grad_norm": 0.8997855186462402,
+      "learning_rate": 0.00043396034588547537,
+      "loss": 4.6608,
+      "step": 320000
+    },
+    {
+      "epoch": 1.7073664471861747,
+      "grad_norm": 0.897196352481842,
+      "learning_rate": 0.00043306852071174786,
+      "loss": 4.6633,
+      "step": 320500
+    },
+    {
+      "epoch": 1.7100300453877133,
+      "grad_norm": 0.8859612941741943,
+      "learning_rate": 0.0004321766955380203,
+      "loss": 4.663,
+      "step": 321000
+    },
+    {
+      "epoch": 1.712693643589252,
+      "grad_norm": 0.9287886023521423,
+      "learning_rate": 0.00043128665401464026,
+      "loss": 4.6616,
+      "step": 321500
+    },
+    {
+      "epoch": 1.7153572417907903,
+      "grad_norm": 0.9006426334381104,
+      "learning_rate": 0.0004303948288409127,
+      "loss": 4.6677,
+      "step": 322000
+    },
+    {
+      "epoch": 1.7180208399923287,
+      "grad_norm": 0.9155673384666443,
+      "learning_rate": 0.0004295030036671851,
+      "loss": 4.6557,
+      "step": 322500
+    },
+    {
+      "epoch": 1.7206844381938673,
+      "grad_norm": 0.909574568271637,
+      "learning_rate": 0.0004286111784934576,
+      "loss": 4.6645,
+      "step": 323000
+    },
+    {
+      "epoch": 1.723348036395406,
+      "grad_norm": 0.9735229015350342,
+      "learning_rate": 0.00042772113697007747,
+      "loss": 4.6578,
+      "step": 323500
+    },
+    {
+      "epoch": 1.7260116345969443,
+      "grad_norm": 0.9536921381950378,
+      "learning_rate": 0.0004268293117963499,
+      "loss": 4.6607,
+      "step": 324000
+    },
+    {
+      "epoch": 1.7286752327984827,
+      "grad_norm": 1.100502610206604,
+      "learning_rate": 0.0004259374866226224,
+      "loss": 4.6598,
+      "step": 324500
+    },
+    {
+      "epoch": 1.7313388310000213,
+      "grad_norm": 0.9191217422485352,
+      "learning_rate": 0.0004250456614488948,
+      "loss": 4.6607,
+      "step": 325000
+    },
+    {
+      "epoch": 1.73400242920156,
+      "grad_norm": 0.921844482421875,
+      "learning_rate": 0.0004241556199255148,
+      "loss": 4.6567,
+      "step": 325500
+    },
+    {
+      "epoch": 1.7366660274030983,
+      "grad_norm": 0.9752650856971741,
+      "learning_rate": 0.0004232637947517872,
+      "loss": 4.6625,
+      "step": 326000
+    },
+    {
+      "epoch": 1.7393296256046367,
+      "grad_norm": 0.9209687113761902,
+      "learning_rate": 0.00042237196957805966,
+      "loss": 4.6632,
+      "step": 326500
+    },
+    {
+      "epoch": 1.7419932238061753,
+      "grad_norm": 0.9033056497573853,
+      "learning_rate": 0.00042148014440433214,
+      "loss": 4.658,
+      "step": 327000
+    },
+    {
+      "epoch": 1.7446568220077139,
+      "grad_norm": 0.9369528889656067,
+      "learning_rate": 0.00042059010288095206,
+      "loss": 4.6624,
+      "step": 327500
+    },
+    {
+      "epoch": 1.7473204202092523,
+      "grad_norm": 0.9487442374229431,
+      "learning_rate": 0.00041969827770722455,
+      "loss": 4.6571,
+      "step": 328000
+    },
+    {
+      "epoch": 1.7499840184107907,
+      "grad_norm": 0.9886392951011658,
+      "learning_rate": 0.000418806452533497,
+      "loss": 4.6644,
+      "step": 328500
+    },
+    {
+      "epoch": 1.7526476166123293,
+      "grad_norm": 0.9492540955543518,
+      "learning_rate": 0.00041791641101011684,
+      "loss": 4.6602,
+      "step": 329000
+    },
+    {
+      "epoch": 1.7553112148138679,
+      "grad_norm": 1.0011500120162964,
+      "learning_rate": 0.0004170245858363893,
+      "loss": 4.654,
+      "step": 329500
+    },
+    {
+      "epoch": 1.7579748130154063,
+      "grad_norm": 0.8877175450325012,
+      "learning_rate": 0.0004161327606626618,
+      "loss": 4.661,
+      "step": 330000
+    },
+    {
+      "epoch": 1.7606384112169446,
+      "grad_norm": 0.9424140453338623,
+      "learning_rate": 0.00041524093548893424,
+      "loss": 4.6626,
+      "step": 330500
+    },
+    {
+      "epoch": 1.7633020094184833,
+      "grad_norm": 0.9958423376083374,
+      "learning_rate": 0.00041434911031520673,
+      "loss": 4.664,
+      "step": 331000
+    },
+    {
+      "epoch": 1.7659656076200219,
+      "grad_norm": 0.9889068007469177,
+      "learning_rate": 0.00041345728514147916,
+      "loss": 4.6539,
+      "step": 331500
+    },
+    {
+      "epoch": 1.7686292058215602,
+      "grad_norm": 0.9919883608818054,
+      "learning_rate": 0.00041256545996775165,
+      "loss": 4.6619,
+      "step": 332000
+    },
+    {
+      "epoch": 1.7712928040230986,
+      "grad_norm": 0.9275678992271423,
+      "learning_rate": 0.0004116736347940241,
+      "loss": 4.6645,
+      "step": 332500
+    },
+    {
+      "epoch": 1.7739564022246372,
+      "grad_norm": 0.918587863445282,
+      "learning_rate": 0.00041078359327064394,
+      "loss": 4.6579,
+      "step": 333000
+    },
+    {
+      "epoch": 1.7766200004261758,
+      "grad_norm": 0.9589911699295044,
+      "learning_rate": 0.00040989176809691643,
+      "loss": 4.6569,
+      "step": 333500
+    },
+    {
+      "epoch": 1.7792835986277142,
+      "grad_norm": 0.9149937629699707,
+      "learning_rate": 0.00040899994292318886,
+      "loss": 4.669,
+      "step": 334000
+    },
+    {
+      "epoch": 1.7819471968292526,
+      "grad_norm": 1.0267397165298462,
+      "learning_rate": 0.00040810811774946135,
+      "loss": 4.6564,
+      "step": 334500
+    },
+    {
+      "epoch": 1.7846107950307912,
+      "grad_norm": 0.9392015933990479,
+      "learning_rate": 0.00040721807622608126,
+      "loss": 4.6553,
+      "step": 335000
+    },
+    {
+      "epoch": 1.7872743932323298,
+      "grad_norm": 1.0006318092346191,
+      "learning_rate": 0.0004063262510523537,
+      "loss": 4.6639,
+      "step": 335500
+    },
+    {
+      "epoch": 1.7899379914338682,
+      "grad_norm": 1.0681666135787964,
+      "learning_rate": 0.0004054344258786262,
+      "loss": 4.6599,
+      "step": 336000
+    },
+    {
+      "epoch": 1.7926015896354066,
+      "grad_norm": 0.9203771352767944,
+      "learning_rate": 0.0004045426007048986,
+      "loss": 4.6579,
+      "step": 336500
+    },
+    {
+      "epoch": 1.7952651878369452,
+      "grad_norm": 0.9925394058227539,
+      "learning_rate": 0.00040365255918151853,
+      "loss": 4.6666,
+      "step": 337000
+    },
+    {
+      "epoch": 1.7979287860384838,
+      "grad_norm": 1.0396158695220947,
+      "learning_rate": 0.000402760734007791,
+      "loss": 4.6609,
+      "step": 337500
+    },
+    {
+      "epoch": 1.8005923842400222,
+      "grad_norm": 1.0538824796676636,
+      "learning_rate": 0.00040186890883406345,
+      "loss": 4.6548,
+      "step": 338000
+    },
+    {
+      "epoch": 1.8032559824415606,
+      "grad_norm": 1.0223603248596191,
+      "learning_rate": 0.00040097708366033594,
+      "loss": 4.6578,
+      "step": 338500
+    },
+    {
+      "epoch": 1.8059195806430992,
+      "grad_norm": 0.904390811920166,
+      "learning_rate": 0.0004000870421369558,
+      "loss": 4.6594,
+      "step": 339000
+    },
+    {
+      "epoch": 1.8085831788446376,
+      "grad_norm": 0.950413167476654,
+      "learning_rate": 0.00039919521696322823,
+      "loss": 4.6518,
+      "step": 339500
+    },
+    {
+      "epoch": 1.811246777046176,
+      "grad_norm": 0.8616942167282104,
+      "learning_rate": 0.0003983033917895007,
+      "loss": 4.6549,
+      "step": 340000
+    },
+    {
+      "epoch": 1.8139103752477146,
+      "grad_norm": 0.9749570488929749,
+      "learning_rate": 0.0003974115666157732,
+      "loss": 4.6585,
+      "step": 340500
+    },
+    {
+      "epoch": 1.8165739734492532,
+      "grad_norm": 0.9949798583984375,
+      "learning_rate": 0.00039652152509239306,
+      "loss": 4.6555,
+      "step": 341000
+    },
+    {
+      "epoch": 1.8192375716507916,
+      "grad_norm": 1.0118317604064941,
+      "learning_rate": 0.00039562969991866555,
+      "loss": 4.6539,
+      "step": 341500
+    },
+    {
+      "epoch": 1.82190116985233,
+      "grad_norm": 0.9762909412384033,
+      "learning_rate": 0.000394737874744938,
+      "loss": 4.6609,
+      "step": 342000
+    },
+    {
+      "epoch": 1.8245647680538686,
+      "grad_norm": 0.9497443437576294,
+      "learning_rate": 0.00039384604957121047,
+      "loss": 4.6575,
+      "step": 342500
+    },
+    {
+      "epoch": 1.8272283662554072,
+      "grad_norm": 0.9685680270195007,
+      "learning_rate": 0.0003929560080478304,
+      "loss": 4.6511,
+      "step": 343000
+    },
+    {
+      "epoch": 1.8298919644569456,
+      "grad_norm": 1.0874184370040894,
+      "learning_rate": 0.0003920641828741028,
+      "loss": 4.6512,
+      "step": 343500
+    },
+    {
+      "epoch": 1.832555562658484,
+      "grad_norm": 0.9718310832977295,
+      "learning_rate": 0.0003911723577003753,
+      "loss": 4.6546,
+      "step": 344000
+    },
+    {
+      "epoch": 1.8352191608600226,
+      "grad_norm": 0.9649378657341003,
+      "learning_rate": 0.00039028053252664773,
+      "loss": 4.6615,
+      "step": 344500
+    },
+    {
+      "epoch": 1.8378827590615612,
+      "grad_norm": 1.0718717575073242,
+      "learning_rate": 0.0003893904910032677,
+      "loss": 4.6664,
+      "step": 345000
+    },
+    {
+      "epoch": 1.8405463572630996,
+      "grad_norm": 1.1101832389831543,
+      "learning_rate": 0.00038849866582954014,
+      "loss": 4.6559,
+      "step": 345500
+    },
+    {
+      "epoch": 1.843209955464638,
+      "grad_norm": 0.966593325138092,
+      "learning_rate": 0.00038760684065581257,
+      "loss": 4.6577,
+      "step": 346000
+    },
+    {
+      "epoch": 1.8458735536661766,
+      "grad_norm": 1.01513671875,
+      "learning_rate": 0.00038671501548208506,
+      "loss": 4.6569,
+      "step": 346500
+    },
+    {
+      "epoch": 1.8485371518677152,
+      "grad_norm": 0.9345992207527161,
+      "learning_rate": 0.0003858249739587049,
+      "loss": 4.652,
+      "step": 347000
+    },
+    {
+      "epoch": 1.8512007500692536,
+      "grad_norm": 0.9582251310348511,
+      "learning_rate": 0.00038493314878497735,
+      "loss": 4.6579,
+      "step": 347500
+    },
+    {
+      "epoch": 1.853864348270792,
+      "grad_norm": 0.9576370716094971,
+      "learning_rate": 0.00038404132361124984,
+      "loss": 4.6542,
+      "step": 348000
+    },
+    {
+      "epoch": 1.8565279464723305,
+      "grad_norm": 0.9874210953712463,
+      "learning_rate": 0.00038314949843752227,
+      "loss": 4.6531,
+      "step": 348500
+    },
+    {
+      "epoch": 1.8591915446738692,
+      "grad_norm": 1.075488805770874,
+      "learning_rate": 0.00038225945691414224,
+      "loss": 4.656,
+      "step": 349000
+    },
+    {
+      "epoch": 1.8618551428754075,
+      "grad_norm": 0.9993325471878052,
+      "learning_rate": 0.00038136763174041467,
+      "loss": 4.6615,
+      "step": 349500
+    },
+    {
+      "epoch": 1.864518741076946,
+      "grad_norm": 0.9594368934631348,
+      "learning_rate": 0.0003804758065666871,
+      "loss": 4.6533,
+      "step": 350000
+    },
+    {
+      "epoch": 1.8671823392784845,
+      "grad_norm": 0.9755575060844421,
+      "learning_rate": 0.0003795839813929596,
+      "loss": 4.6558,
+      "step": 350500
+    },
+    {
+      "epoch": 1.8698459374800231,
+      "grad_norm": 0.9865338802337646,
+      "learning_rate": 0.0003786939398695795,
+      "loss": 4.6559,
+      "step": 351000
+    },
+    {
+      "epoch": 1.8725095356815615,
+      "grad_norm": 1.071847677230835,
+      "learning_rate": 0.00037780211469585194,
+      "loss": 4.6648,
+      "step": 351500
+    },
+    {
+      "epoch": 1.8751731338831,
+      "grad_norm": 0.9860469102859497,
+      "learning_rate": 0.0003769102895221244,
+      "loss": 4.65,
+      "step": 352000
+    },
+    {
+      "epoch": 1.8778367320846385,
+      "grad_norm": 0.9507666826248169,
+      "learning_rate": 0.00037601846434839685,
+      "loss": 4.6511,
+      "step": 352500
+    },
+    {
+      "epoch": 1.8805003302861771,
+      "grad_norm": 1.0288827419281006,
+      "learning_rate": 0.0003751284228250168,
+      "loss": 4.6602,
+      "step": 353000
+    },
+    {
+      "epoch": 1.8831639284877155,
+      "grad_norm": 1.0583363771438599,
+      "learning_rate": 0.00037423659765128926,
+      "loss": 4.6553,
+      "step": 353500
+    },
+    {
+      "epoch": 1.885827526689254,
+      "grad_norm": 0.9062938094139099,
+      "learning_rate": 0.0003733447724775617,
+      "loss": 4.6633,
+      "step": 354000
+    },
+    {
+      "epoch": 1.8884911248907925,
+      "grad_norm": 0.9386794567108154,
+      "learning_rate": 0.0003724529473038342,
+      "loss": 4.6483,
+      "step": 354500
+    },
+    {
+      "epoch": 1.8911547230923311,
+      "grad_norm": 0.9764008522033691,
+      "learning_rate": 0.00037156290578045404,
+      "loss": 4.6594,
+      "step": 355000
+    },
+    {
+      "epoch": 1.8938183212938695,
+      "grad_norm": 0.9707098007202148,
+      "learning_rate": 0.0003706710806067265,
+      "loss": 4.645,
+      "step": 355500
+    },
+    {
+      "epoch": 1.896481919495408,
+      "grad_norm": 1.046889066696167,
+      "learning_rate": 0.00036977925543299896,
+      "loss": 4.6516,
+      "step": 356000
+    },
+    {
+      "epoch": 1.8991455176969465,
+      "grad_norm": 0.9305681586265564,
+      "learning_rate": 0.0003688874302592714,
+      "loss": 4.6461,
+      "step": 356500
+    },
+    {
+      "epoch": 1.901809115898485,
+      "grad_norm": 0.963812530040741,
+      "learning_rate": 0.00036799738873589136,
+      "loss": 4.651,
+      "step": 357000
+    },
+    {
+      "epoch": 1.9044727141000233,
+      "grad_norm": 1.0378142595291138,
+      "learning_rate": 0.0003671055635621638,
+      "loss": 4.6523,
+      "step": 357500
+    },
+    {
+      "epoch": 1.9071363123015619,
+      "grad_norm": 1.0353012084960938,
+      "learning_rate": 0.0003662137383884362,
+      "loss": 4.6497,
+      "step": 358000
+    },
+    {
+      "epoch": 1.9097999105031005,
+      "grad_norm": 0.93437659740448,
+      "learning_rate": 0.0003653219132147087,
+      "loss": 4.6494,
+      "step": 358500
+    },
+    {
+      "epoch": 1.9124635087046389,
+      "grad_norm": 0.9659603238105774,
+      "learning_rate": 0.0003644318716913286,
+      "loss": 4.6482,
+      "step": 359000
+    },
+    {
+      "epoch": 1.9151271069061773,
+      "grad_norm": 0.982214629650116,
+      "learning_rate": 0.0003635400465176011,
+      "loss": 4.6464,
+      "step": 359500
+    },
+    {
+      "epoch": 1.9177907051077159,
+      "grad_norm": 0.9894130229949951,
+      "learning_rate": 0.00036264822134387354,
+      "loss": 4.6483,
+      "step": 360000
+    },
+    {
+      "epoch": 1.9204543033092545,
+      "grad_norm": 1.0288091897964478,
+      "learning_rate": 0.000361756396170146,
+      "loss": 4.6571,
+      "step": 360500
+    },
+    {
+      "epoch": 1.9231179015107929,
+      "grad_norm": 1.0425199270248413,
+      "learning_rate": 0.0003608663546467659,
+      "loss": 4.6438,
+      "step": 361000
+    },
+    {
+      "epoch": 1.9257814997123313,
+      "grad_norm": 0.9725956320762634,
+      "learning_rate": 0.0003599745294730383,
+      "loss": 4.6524,
+      "step": 361500
+    },
+    {
+      "epoch": 1.9284450979138699,
+      "grad_norm": 0.9731396436691284,
+      "learning_rate": 0.00035908270429931075,
+      "loss": 4.6521,
+      "step": 362000
+    },
+    {
+      "epoch": 1.9311086961154085,
+      "grad_norm": 1.031201720237732,
+      "learning_rate": 0.00035819087912558324,
+      "loss": 4.6466,
+      "step": 362500
+    },
+    {
+      "epoch": 1.9337722943169469,
+      "grad_norm": 0.9079106450080872,
+      "learning_rate": 0.0003572990539518557,
+      "loss": 4.6546,
+      "step": 363000
+    },
+    {
+      "epoch": 1.9364358925184852,
+      "grad_norm": 1.0807876586914062,
+      "learning_rate": 0.00035640901242847564,
+      "loss": 4.6448,
+      "step": 363500
+    },
+    {
+      "epoch": 1.9390994907200239,
+      "grad_norm": 0.9206376075744629,
+      "learning_rate": 0.0003555171872547481,
+      "loss": 4.6485,
+      "step": 364000
+    },
+    {
+      "epoch": 1.9417630889215625,
+      "grad_norm": 1.031483769416809,
+      "learning_rate": 0.0003546253620810205,
+      "loss": 4.6495,
+      "step": 364500
+    },
+    {
+      "epoch": 1.9444266871231008,
+      "grad_norm": 0.9696449041366577,
+      "learning_rate": 0.000353733536907293,
+      "loss": 4.6533,
+      "step": 365000
+    },
+    {
+      "epoch": 1.9470902853246392,
+      "grad_norm": 0.9895356893539429,
+      "learning_rate": 0.0003528434953839129,
+      "loss": 4.6508,
+      "step": 365500
+    },
+    {
+      "epoch": 1.9497538835261778,
+      "grad_norm": 0.9535285234451294,
+      "learning_rate": 0.0003519516702101854,
+      "loss": 4.6442,
+      "step": 366000
+    },
+    {
+      "epoch": 1.9524174817277165,
+      "grad_norm": 1.0165653228759766,
+      "learning_rate": 0.00035105984503645783,
+      "loss": 4.6375,
+      "step": 366500
+    },
+    {
+      "epoch": 1.9550810799292548,
+      "grad_norm": 0.9320639967918396,
+      "learning_rate": 0.00035016801986273026,
+      "loss": 4.6473,
+      "step": 367000
+    },
+    {
+      "epoch": 1.9577446781307932,
+      "grad_norm": 1.1587982177734375,
+      "learning_rate": 0.00034927797833935023,
+      "loss": 4.649,
+      "step": 367500
+    },
+    {
+      "epoch": 1.9604082763323318,
+      "grad_norm": 0.9320794343948364,
+      "learning_rate": 0.00034838615316562266,
+      "loss": 4.6514,
+      "step": 368000
+    },
+    {
+      "epoch": 1.9630718745338704,
+      "grad_norm": 0.97315913438797,
+      "learning_rate": 0.0003474943279918951,
+      "loss": 4.6486,
+      "step": 368500
+    },
+    {
+      "epoch": 1.9657354727354088,
+      "grad_norm": 1.126283049583435,
+      "learning_rate": 0.0003466025028181676,
+      "loss": 4.6525,
+      "step": 369000
+    },
+    {
+      "epoch": 1.9683990709369472,
+      "grad_norm": 1.041257381439209,
+      "learning_rate": 0.00034571246129478744,
+      "loss": 4.6456,
+      "step": 369500
+    },
+    {
+      "epoch": 1.9710626691384858,
+      "grad_norm": 1.0350292921066284,
+      "learning_rate": 0.00034482063612105993,
+      "loss": 4.6488,
+      "step": 370000
+    },
+    {
+      "epoch": 1.9737262673400244,
+      "grad_norm": 0.9576050639152527,
+      "learning_rate": 0.00034392881094733236,
+      "loss": 4.648,
+      "step": 370500
+    },
+    {
+      "epoch": 1.9763898655415628,
+      "grad_norm": 0.9608176350593567,
+      "learning_rate": 0.0003430369857736048,
+      "loss": 4.658,
+      "step": 371000
+    },
+    {
+      "epoch": 1.9790534637431012,
+      "grad_norm": 1.023898959159851,
+      "learning_rate": 0.00034214694425022476,
+      "loss": 4.6533,
+      "step": 371500
+    },
+    {
+      "epoch": 1.9817170619446398,
+      "grad_norm": 1.0364673137664795,
+      "learning_rate": 0.0003412551190764972,
+      "loss": 4.6508,
+      "step": 372000
+    },
+    {
+      "epoch": 1.9843806601461784,
+      "grad_norm": 0.9874972105026245,
+      "learning_rate": 0.00034036329390276963,
+      "loss": 4.6441,
+      "step": 372500
+    },
+    {
+      "epoch": 1.9870442583477168,
+      "grad_norm": 0.9010471701622009,
+      "learning_rate": 0.0003394714687290421,
+      "loss": 4.6428,
+      "step": 373000
+    },
+    {
+      "epoch": 1.9897078565492552,
+      "grad_norm": 0.9260776042938232,
+      "learning_rate": 0.00033858142720566203,
+      "loss": 4.6504,
+      "step": 373500
+    },
+    {
+      "epoch": 1.9923714547507938,
+      "grad_norm": 1.042839527130127,
+      "learning_rate": 0.0003376896020319345,
+      "loss": 4.6514,
+      "step": 374000
+    },
+    {
+      "epoch": 1.9950350529523324,
+      "grad_norm": 0.9163122177124023,
+      "learning_rate": 0.00033679777685820695,
+      "loss": 4.6473,
+      "step": 374500
+    },
+    {
+      "epoch": 1.9976986511538706,
+      "grad_norm": 0.9647536277770996,
+      "learning_rate": 0.0003359059516844794,
+      "loss": 4.6428,
+      "step": 375000
+    },
+    {
+      "epoch": 2.000362249355409,
+      "grad_norm": 0.9727627038955688,
+      "learning_rate": 0.0003350159101610993,
+      "loss": 4.6521,
+      "step": 375500
+    },
+    {
+      "epoch": 2.003025847556948,
+      "grad_norm": 0.9527126550674438,
+      "learning_rate": 0.00033412408498737173,
+      "loss": 4.6466,
+      "step": 376000
+    },
+    {
+      "epoch": 2.0056894457584864,
+      "grad_norm": 1.060686707496643,
+      "learning_rate": 0.00033323225981364427,
+      "loss": 4.6473,
+      "step": 376500
+    },
+    {
+      "epoch": 2.0083530439600246,
+      "grad_norm": 1.0455362796783447,
+      "learning_rate": 0.0003323404346399167,
+      "loss": 4.6405,
+      "step": 377000
+    },
+    {
+      "epoch": 2.011016642161563,
+      "grad_norm": 0.984203577041626,
+      "learning_rate": 0.00033145039311653656,
+      "loss": 4.6529,
+      "step": 377500
+    },
+    {
+      "epoch": 2.013680240363102,
+      "grad_norm": 0.9882683753967285,
+      "learning_rate": 0.00033055856794280905,
+      "loss": 4.6388,
+      "step": 378000
+    },
+    {
+      "epoch": 2.0163438385646404,
+      "grad_norm": 1.0161495208740234,
+      "learning_rate": 0.0003296667427690815,
+      "loss": 4.6407,
+      "step": 378500
+    },
+    {
+      "epoch": 2.0190074367661786,
+      "grad_norm": 0.9816075563430786,
+      "learning_rate": 0.0003287749175953539,
+      "loss": 4.6384,
+      "step": 379000
+    },
+    {
+      "epoch": 2.021671034967717,
+      "grad_norm": 0.9842175841331482,
+      "learning_rate": 0.0003278848760719739,
+      "loss": 4.6458,
+      "step": 379500
+    },
+    {
+      "epoch": 2.0243346331692558,
+      "grad_norm": 0.965932309627533,
+      "learning_rate": 0.0003269930508982463,
+      "loss": 4.6496,
+      "step": 380000
+    },
+    {
+      "epoch": 2.0269982313707944,
+      "grad_norm": 1.0019505023956299,
+      "learning_rate": 0.0003261012257245188,
+      "loss": 4.6423,
+      "step": 380500
+    },
+    {
+      "epoch": 2.0296618295723325,
+      "grad_norm": 0.9756182432174683,
+      "learning_rate": 0.0003252111842011387,
+      "loss": 4.6491,
+      "step": 381000
+    },
+    {
+      "epoch": 2.032325427773871,
+      "grad_norm": 1.0072699785232544,
+      "learning_rate": 0.00032431935902741115,
+      "loss": 4.6371,
+      "step": 381500
+    },
+    {
+      "epoch": 2.0349890259754098,
+      "grad_norm": 0.9327691793441772,
+      "learning_rate": 0.00032342753385368364,
+      "loss": 4.6456,
+      "step": 382000
+    },
+    {
+      "epoch": 2.0376526241769484,
+      "grad_norm": 1.0072060823440552,
+      "learning_rate": 0.00032253570867995607,
+      "loss": 4.6451,
+      "step": 382500
+    },
+    {
+      "epoch": 2.0403162223784865,
+      "grad_norm": 0.9492465257644653,
+      "learning_rate": 0.00032164388350622856,
+      "loss": 4.6469,
+      "step": 383000
+    },
+    {
+      "epoch": 2.042979820580025,
+      "grad_norm": 1.00717294216156,
+      "learning_rate": 0.000320752058332501,
+      "loss": 4.6463,
+      "step": 383500
+    },
+    {
+      "epoch": 2.0456434187815637,
+      "grad_norm": 0.9812472462654114,
+      "learning_rate": 0.0003198602331587734,
+      "loss": 4.6427,
+      "step": 384000
+    },
+    {
+      "epoch": 2.0483070169831024,
+      "grad_norm": 0.9539963603019714,
+      "learning_rate": 0.0003189684079850459,
+      "loss": 4.652,
+      "step": 384500
+    },
+    {
+      "epoch": 2.0509706151846405,
+      "grad_norm": 0.9309804439544678,
+      "learning_rate": 0.00031807836646166577,
+      "loss": 4.6385,
+      "step": 385000
+    },
+    {
+      "epoch": 2.053634213386179,
+      "grad_norm": 0.9806848764419556,
+      "learning_rate": 0.0003171865412879382,
+      "loss": 4.6406,
+      "step": 385500
+    },
+    {
+      "epoch": 2.0562978115877177,
+      "grad_norm": 0.9556758999824524,
+      "learning_rate": 0.0003162947161142107,
+      "loss": 4.6438,
+      "step": 386000
+    },
+    {
+      "epoch": 2.058961409789256,
+      "grad_norm": 1.0577518939971924,
+      "learning_rate": 0.0003154028909404832,
+      "loss": 4.6422,
+      "step": 386500
+    },
+    {
+      "epoch": 2.0616250079907945,
+      "grad_norm": 0.9437615871429443,
+      "learning_rate": 0.0003145128494171031,
+      "loss": 4.6503,
+      "step": 387000
+    },
+    {
+      "epoch": 2.064288606192333,
+      "grad_norm": 1.0224053859710693,
+      "learning_rate": 0.0003136210242433755,
+      "loss": 4.6422,
+      "step": 387500
+    },
+    {
+      "epoch": 2.0669522043938717,
+      "grad_norm": 0.9545285105705261,
+      "learning_rate": 0.00031272919906964795,
+      "loss": 4.6414,
+      "step": 388000
+    },
+    {
+      "epoch": 2.06961580259541,
+      "grad_norm": 1.057246208190918,
+      "learning_rate": 0.00031183737389592044,
+      "loss": 4.6467,
+      "step": 388500
+    },
+    {
+      "epoch": 2.0722794007969485,
+      "grad_norm": 1.0381290912628174,
+      "learning_rate": 0.00031094733237254035,
+      "loss": 4.6454,
+      "step": 389000
+    },
+    {
+      "epoch": 2.074942998998487,
+      "grad_norm": 0.9364863634109497,
+      "learning_rate": 0.0003100555071988128,
+      "loss": 4.6505,
+      "step": 389500
+    },
+    {
+      "epoch": 2.0776065972000257,
+      "grad_norm": 1.014183759689331,
+      "learning_rate": 0.0003091636820250853,
+      "loss": 4.6442,
+      "step": 390000
+    },
+    {
+      "epoch": 2.080270195401564,
+      "grad_norm": 1.0127809047698975,
+      "learning_rate": 0.0003082718568513577,
+      "loss": 4.6461,
+      "step": 390500
+    },
+    {
+      "epoch": 2.0829337936031025,
+      "grad_norm": 1.0870954990386963,
+      "learning_rate": 0.0003073818153279777,
+      "loss": 4.6425,
+      "step": 391000
+    },
+    {
+      "epoch": 2.085597391804641,
+      "grad_norm": 0.9986569285392761,
+      "learning_rate": 0.0003064899901542501,
+      "loss": 4.6464,
+      "step": 391500
+    },
+    {
+      "epoch": 2.0882609900061797,
+      "grad_norm": 1.044019103050232,
+      "learning_rate": 0.00030559994863086997,
+      "loss": 4.6428,
+      "step": 392000
+    },
+    {
+      "epoch": 2.090924588207718,
+      "grad_norm": 0.9670615196228027,
+      "learning_rate": 0.00030470812345714246,
+      "loss": 4.6463,
+      "step": 392500
+    },
+    {
+      "epoch": 2.0935881864092565,
+      "grad_norm": 0.968877911567688,
+      "learning_rate": 0.0003038162982834149,
+      "loss": 4.6445,
+      "step": 393000
+    },
+    {
+      "epoch": 2.096251784610795,
+      "grad_norm": 1.0653293132781982,
+      "learning_rate": 0.0003029244731096874,
+      "loss": 4.642,
+      "step": 393500
+    },
+    {
+      "epoch": 2.0989153828123337,
+      "grad_norm": 0.9970125555992126,
+      "learning_rate": 0.0003020344315863073,
+      "loss": 4.6422,
+      "step": 394000
+    },
+    {
+      "epoch": 2.101578981013872,
+      "grad_norm": 1.2096583843231201,
+      "learning_rate": 0.0003011426064125797,
+      "loss": 4.6387,
+      "step": 394500
+    },
+    {
+      "epoch": 2.1042425792154105,
+      "grad_norm": 1.0580756664276123,
+      "learning_rate": 0.0003002507812388522,
+      "loss": 4.636,
+      "step": 395000
+    },
+    {
+      "epoch": 2.106906177416949,
+      "grad_norm": 0.9974854588508606,
+      "learning_rate": 0.00029935895606512464,
+      "loss": 4.6538,
+      "step": 395500
+    },
+    {
+      "epoch": 2.1095697756184877,
+      "grad_norm": 0.9898145198822021,
+      "learning_rate": 0.0002984671308913971,
+      "loss": 4.6408,
+      "step": 396000
+    },
+    {
+      "epoch": 2.112233373820026,
+      "grad_norm": 1.0114551782608032,
+      "learning_rate": 0.00029757530571766956,
+      "loss": 4.6403,
+      "step": 396500
+    },
+    {
+      "epoch": 2.1148969720215645,
+      "grad_norm": 1.015718936920166,
+      "learning_rate": 0.000296683480543942,
+      "loss": 4.6477,
+      "step": 397000
+    },
+    {
+      "epoch": 2.117560570223103,
+      "grad_norm": 1.0037897825241089,
+      "learning_rate": 0.0002957916553702145,
+      "loss": 4.6417,
+      "step": 397500
+    },
+    {
+      "epoch": 2.1202241684246417,
+      "grad_norm": 0.9558025002479553,
+      "learning_rate": 0.0002949016138468344,
+      "loss": 4.642,
+      "step": 398000
+    },
+    {
+      "epoch": 2.12288776662618,
+      "grad_norm": 0.9956161975860596,
+      "learning_rate": 0.0002940097886731068,
+      "loss": 4.6391,
+      "step": 398500
+    },
+    {
+      "epoch": 2.1255513648277184,
+      "grad_norm": 1.0069483518600464,
+      "learning_rate": 0.0002931179634993793,
+      "loss": 4.641,
+      "step": 399000
+    },
+    {
+      "epoch": 2.128214963029257,
+      "grad_norm": 0.9737485647201538,
+      "learning_rate": 0.00029222613832565175,
+      "loss": 4.6376,
+      "step": 399500
+    },
+    {
+      "epoch": 2.1308785612307957,
+      "grad_norm": 0.996033251285553,
+      "learning_rate": 0.0002913360968022717,
+      "loss": 4.6349,
+      "step": 400000
+    },
+    {
+      "epoch": 2.133542159432334,
+      "grad_norm": 1.1211779117584229,
+      "learning_rate": 0.00029044427162854415,
+      "loss": 4.6415,
+      "step": 400500
+    },
+    {
+      "epoch": 2.1362057576338724,
+      "grad_norm": 1.0139347314834595,
+      "learning_rate": 0.0002895524464548166,
+      "loss": 4.63,
+      "step": 401000
+    },
+    {
+      "epoch": 2.138869355835411,
+      "grad_norm": 1.051085352897644,
+      "learning_rate": 0.00028866062128108907,
+      "loss": 4.6351,
+      "step": 401500
+    },
+    {
+      "epoch": 2.1415329540369497,
+      "grad_norm": 1.0234901905059814,
+      "learning_rate": 0.0002877705797577089,
+      "loss": 4.6384,
+      "step": 402000
+    },
+    {
+      "epoch": 2.144196552238488,
+      "grad_norm": 1.0635606050491333,
+      "learning_rate": 0.00028687875458398136,
+      "loss": 4.6387,
+      "step": 402500
+    },
+    {
+      "epoch": 2.1468601504400264,
+      "grad_norm": 1.0161880254745483,
+      "learning_rate": 0.00028598692941025385,
+      "loss": 4.6428,
+      "step": 403000
+    },
+    {
+      "epoch": 2.149523748641565,
+      "grad_norm": 1.1507047414779663,
+      "learning_rate": 0.0002850951042365263,
+      "loss": 4.6336,
+      "step": 403500
+    },
+    {
+      "epoch": 2.152187346843103,
+      "grad_norm": 0.9682100415229797,
+      "learning_rate": 0.00028420506271314625,
+      "loss": 4.6411,
+      "step": 404000
+    },
+    {
+      "epoch": 2.154850945044642,
+      "grad_norm": 1.001862645149231,
+      "learning_rate": 0.0002833132375394187,
+      "loss": 4.6354,
+      "step": 404500
+    },
+    {
+      "epoch": 2.1575145432461804,
+      "grad_norm": 1.032013177871704,
+      "learning_rate": 0.0002824214123656911,
+      "loss": 4.6297,
+      "step": 405000
+    },
+    {
+      "epoch": 2.160178141447719,
+      "grad_norm": 1.1009008884429932,
+      "learning_rate": 0.0002815295871919636,
+      "loss": 4.6367,
+      "step": 405500
+    },
+    {
+      "epoch": 2.162841739649257,
+      "grad_norm": 1.0074682235717773,
+      "learning_rate": 0.00028063776201823603,
+      "loss": 4.6367,
+      "step": 406000
+    },
+    {
+      "epoch": 2.165505337850796,
+      "grad_norm": 0.9696961641311646,
+      "learning_rate": 0.00027974772049485595,
+      "loss": 4.6361,
+      "step": 406500
+    },
+    {
+      "epoch": 2.1681689360523344,
+      "grad_norm": 0.9666945934295654,
+      "learning_rate": 0.00027885589532112843,
+      "loss": 4.6389,
+      "step": 407000
+    },
+    {
+      "epoch": 2.170832534253873,
+      "grad_norm": 0.9683700799942017,
+      "learning_rate": 0.00027796407014740087,
+      "loss": 4.6401,
+      "step": 407500
+    },
+    {
+      "epoch": 2.173496132455411,
+      "grad_norm": 0.9791209101676941,
+      "learning_rate": 0.00027707224497367335,
+      "loss": 4.6361,
+      "step": 408000
+    },
+    {
+      "epoch": 2.17615973065695,
+      "grad_norm": 1.0101122856140137,
+      "learning_rate": 0.0002761822034502932,
+      "loss": 4.6393,
+      "step": 408500
+    },
+    {
+      "epoch": 2.1788233288584884,
+      "grad_norm": 0.9944539070129395,
+      "learning_rate": 0.00027529037827656565,
+      "loss": 4.6346,
+      "step": 409000
+    },
+    {
+      "epoch": 2.181486927060027,
+      "grad_norm": 0.9819368124008179,
+      "learning_rate": 0.0002743985531028382,
+      "loss": 4.6298,
+      "step": 409500
+    },
+    {
+      "epoch": 2.184150525261565,
+      "grad_norm": 1.0016804933547974,
+      "learning_rate": 0.00027350851157945805,
+      "loss": 4.6415,
+      "step": 410000
+    },
+    {
+      "epoch": 2.1868141234631038,
+      "grad_norm": 0.9300929307937622,
+      "learning_rate": 0.00027261668640573053,
+      "loss": 4.6324,
+      "step": 410500
+    },
+    {
+      "epoch": 2.1894777216646424,
+      "grad_norm": 0.9710443615913391,
+      "learning_rate": 0.00027172486123200297,
+      "loss": 4.6398,
+      "step": 411000
+    },
+    {
+      "epoch": 2.192141319866181,
+      "grad_norm": 1.0339746475219727,
+      "learning_rate": 0.0002708330360582754,
+      "loss": 4.6383,
+      "step": 411500
+    },
+    {
+      "epoch": 2.194804918067719,
+      "grad_norm": 1.178084373474121,
+      "learning_rate": 0.0002699412108845479,
+      "loss": 4.6408,
+      "step": 412000
+    },
+    {
+      "epoch": 2.1974685162692578,
+      "grad_norm": 1.00334632396698,
+      "learning_rate": 0.0002690493857108203,
+      "loss": 4.637,
+      "step": 412500
+    },
+    {
+      "epoch": 2.2001321144707964,
+      "grad_norm": 0.962380588054657,
+      "learning_rate": 0.0002681575605370928,
+      "loss": 4.643,
+      "step": 413000
+    },
+    {
+      "epoch": 2.202795712672335,
+      "grad_norm": 1.0694209337234497,
+      "learning_rate": 0.00026726573536336524,
+      "loss": 4.6382,
+      "step": 413500
+    },
+    {
+      "epoch": 2.205459310873873,
+      "grad_norm": 1.0394047498703003,
+      "learning_rate": 0.00026637569383998515,
+      "loss": 4.6324,
+      "step": 414000
+    },
+    {
+      "epoch": 2.2081229090754118,
+      "grad_norm": 1.0649442672729492,
+      "learning_rate": 0.00026548386866625764,
+      "loss": 4.6362,
+      "step": 414500
+    },
+    {
+      "epoch": 2.2107865072769504,
+      "grad_norm": 1.0115076303482056,
+      "learning_rate": 0.00026459382714287755,
+      "loss": 4.6314,
+      "step": 415000
+    },
+    {
+      "epoch": 2.213450105478489,
+      "grad_norm": 1.089772343635559,
+      "learning_rate": 0.00026370200196915,
+      "loss": 4.6422,
+      "step": 415500
+    },
+    {
+      "epoch": 2.216113703680027,
+      "grad_norm": 1.0219160318374634,
+      "learning_rate": 0.00026281017679542247,
+      "loss": 4.6368,
+      "step": 416000
+    },
+    {
+      "epoch": 2.2187773018815657,
+      "grad_norm": 1.2249672412872314,
+      "learning_rate": 0.0002619183516216949,
+      "loss": 4.6363,
+      "step": 416500
+    },
+    {
+      "epoch": 2.2214409000831044,
+      "grad_norm": 1.054093599319458,
+      "learning_rate": 0.0002610265264479674,
+      "loss": 4.6344,
+      "step": 417000
+    },
+    {
+      "epoch": 2.224104498284643,
+      "grad_norm": 1.0143494606018066,
+      "learning_rate": 0.0002601347012742398,
+      "loss": 4.6343,
+      "step": 417500
+    },
+    {
+      "epoch": 2.226768096486181,
+      "grad_norm": 1.083958625793457,
+      "learning_rate": 0.00025924287610051226,
+      "loss": 4.6375,
+      "step": 418000
+    },
+    {
+      "epoch": 2.2294316946877197,
+      "grad_norm": 1.0664527416229248,
+      "learning_rate": 0.00025835105092678474,
+      "loss": 4.6374,
+      "step": 418500
+    },
+    {
+      "epoch": 2.2320952928892583,
+      "grad_norm": 0.9486995935440063,
+      "learning_rate": 0.0002574610094034046,
+      "loss": 4.6361,
+      "step": 419000
+    },
+    {
+      "epoch": 2.234758891090797,
+      "grad_norm": 1.0301847457885742,
+      "learning_rate": 0.0002565691842296771,
+      "loss": 4.6363,
+      "step": 419500
+    },
+    {
+      "epoch": 2.237422489292335,
+      "grad_norm": 1.0453981161117554,
+      "learning_rate": 0.0002556773590559496,
+      "loss": 4.6315,
+      "step": 420000
+    },
+    {
+      "epoch": 2.2400860874938737,
+      "grad_norm": 1.0513032674789429,
+      "learning_rate": 0.000254785533882222,
+      "loss": 4.6225,
+      "step": 420500
+    },
+    {
+      "epoch": 2.2427496856954123,
+      "grad_norm": 1.0518192052841187,
+      "learning_rate": 0.0002538972760091894,
+      "loss": 4.6359,
+      "step": 421000
+    },
+    {
+      "epoch": 2.2454132838969505,
+      "grad_norm": 1.0620026588439941,
+      "learning_rate": 0.00025300545083546184,
+      "loss": 4.6363,
+      "step": 421500
+    },
+    {
+      "epoch": 2.248076882098489,
+      "grad_norm": 1.0133403539657593,
+      "learning_rate": 0.00025211362566173427,
+      "loss": 4.6356,
+      "step": 422000
+    },
+    {
+      "epoch": 2.2507404803000277,
+      "grad_norm": 1.0840908288955688,
+      "learning_rate": 0.00025122180048800676,
+      "loss": 4.632,
+      "step": 422500
+    },
+    {
+      "epoch": 2.2534040785015663,
+      "grad_norm": 1.020334005355835,
+      "learning_rate": 0.0002503299753142792,
+      "loss": 4.6422,
+      "step": 423000
+    },
+    {
+      "epoch": 2.256067676703105,
+      "grad_norm": 1.0460251569747925,
+      "learning_rate": 0.0002494381501405517,
+      "loss": 4.6341,
+      "step": 423500
+    },
+    {
+      "epoch": 2.258731274904643,
+      "grad_norm": 0.9714872241020203,
+      "learning_rate": 0.0002485463249668241,
+      "loss": 4.6323,
+      "step": 424000
+    },
+    {
+      "epoch": 2.2613948731061817,
+      "grad_norm": 1.111423373222351,
+      "learning_rate": 0.00024765449979309654,
+      "loss": 4.6308,
+      "step": 424500
+    },
+    {
+      "epoch": 2.2640584713077203,
+      "grad_norm": 1.063982367515564,
+      "learning_rate": 0.00024676445826971646,
+      "loss": 4.6341,
+      "step": 425000
+    },
+    {
+      "epoch": 2.2667220695092585,
+      "grad_norm": 1.131791114807129,
+      "learning_rate": 0.0002458744167463364,
+      "loss": 4.6318,
+      "step": 425500
+    },
+    {
+      "epoch": 2.269385667710797,
+      "grad_norm": 0.969653308391571,
+      "learning_rate": 0.00024498259157260886,
+      "loss": 4.6393,
+      "step": 426000
+    },
+    {
+      "epoch": 2.2720492659123357,
+      "grad_norm": 1.02666175365448,
+      "learning_rate": 0.00024409076639888132,
+      "loss": 4.6359,
+      "step": 426500
+    },
+    {
+      "epoch": 2.2747128641138743,
+      "grad_norm": 1.1176308393478394,
+      "learning_rate": 0.00024319894122515375,
+      "loss": 4.6339,
+      "step": 427000
+    },
+    {
+      "epoch": 2.2773764623154125,
+      "grad_norm": 1.00742506980896,
+      "learning_rate": 0.0002423071160514262,
+      "loss": 4.6231,
+      "step": 427500
+    },
+    {
+      "epoch": 2.280040060516951,
+      "grad_norm": 1.0069453716278076,
+      "learning_rate": 0.00024141529087769867,
+      "loss": 4.6345,
+      "step": 428000
+    },
+    {
+      "epoch": 2.2827036587184897,
+      "grad_norm": 1.0893571376800537,
+      "learning_rate": 0.00024052346570397113,
+      "loss": 4.6319,
+      "step": 428500
+    },
+    {
+      "epoch": 2.2853672569200283,
+      "grad_norm": 0.9942576885223389,
+      "learning_rate": 0.00023963342418059104,
+      "loss": 4.6305,
+      "step": 429000
+    },
+    {
+      "epoch": 2.2880308551215665,
+      "grad_norm": 1.0035908222198486,
+      "learning_rate": 0.00023874159900686348,
+      "loss": 4.633,
+      "step": 429500
+    },
+    {
+      "epoch": 2.290694453323105,
+      "grad_norm": 0.9980865716934204,
+      "learning_rate": 0.00023784977383313594,
+      "loss": 4.6276,
+      "step": 430000
+    },
+    {
+      "epoch": 2.2933580515246437,
+      "grad_norm": 1.0136911869049072,
+      "learning_rate": 0.0002369579486594084,
+      "loss": 4.6286,
+      "step": 430500
+    },
+    {
+      "epoch": 2.2960216497261823,
+      "grad_norm": 1.04083251953125,
+      "learning_rate": 0.00023606612348568085,
+      "loss": 4.6401,
+      "step": 431000
+    },
+    {
+      "epoch": 2.2986852479277204,
+      "grad_norm": 1.3262946605682373,
+      "learning_rate": 0.00023517429831195331,
+      "loss": 4.626,
+      "step": 431500
+    },
+    {
+      "epoch": 2.301348846129259,
+      "grad_norm": 1.0201387405395508,
+      "learning_rate": 0.00023428247313822577,
+      "loss": 4.6319,
+      "step": 432000
+    },
+    {
+      "epoch": 2.3040124443307977,
+      "grad_norm": 1.1112711429595947,
+      "learning_rate": 0.0002333906479644982,
+      "loss": 4.6356,
+      "step": 432500
+    },
+    {
+      "epoch": 2.3066760425323363,
+      "grad_norm": 1.037654995918274,
+      "learning_rate": 0.00023250060644111815,
+      "loss": 4.6275,
+      "step": 433000
+    },
+    {
+      "epoch": 2.3093396407338744,
+      "grad_norm": 1.1203975677490234,
+      "learning_rate": 0.0002316087812673906,
+      "loss": 4.6348,
+      "step": 433500
+    },
+    {
+      "epoch": 2.312003238935413,
+      "grad_norm": 1.1220102310180664,
+      "learning_rate": 0.00023071695609366307,
+      "loss": 4.6275,
+      "step": 434000
+    },
+    {
+      "epoch": 2.3146668371369516,
+      "grad_norm": 1.0318022966384888,
+      "learning_rate": 0.0002298251309199355,
+      "loss": 4.6279,
+      "step": 434500
+    },
+    {
+      "epoch": 2.31733043533849,
+      "grad_norm": 1.0012495517730713,
+      "learning_rate": 0.00022893508939655541,
+      "loss": 4.6305,
+      "step": 435000
+    },
+    {
+      "epoch": 2.3199940335400284,
+      "grad_norm": 1.0262128114700317,
+      "learning_rate": 0.00022804504787317533,
+      "loss": 4.631,
+      "step": 435500
+    },
+    {
+      "epoch": 2.322657631741567,
+      "grad_norm": 1.0249779224395752,
+      "learning_rate": 0.0002271532226994478,
+      "loss": 4.6313,
+      "step": 436000
+    },
+    {
+      "epoch": 2.3253212299431056,
+      "grad_norm": 1.0550204515457153,
+      "learning_rate": 0.00022626139752572025,
+      "loss": 4.6357,
+      "step": 436500
+    },
+    {
+      "epoch": 2.3279848281446442,
+      "grad_norm": 1.1302458047866821,
+      "learning_rate": 0.0002253695723519927,
+      "loss": 4.6256,
+      "step": 437000
+    },
+    {
+      "epoch": 2.3306484263461824,
+      "grad_norm": 1.0832403898239136,
+      "learning_rate": 0.00022447774717826517,
+      "loss": 4.6286,
+      "step": 437500
+    },
+    {
+      "epoch": 2.333312024547721,
+      "grad_norm": 1.0546700954437256,
+      "learning_rate": 0.00022358770565488506,
+      "loss": 4.6341,
+      "step": 438000
+    },
+    {
+      "epoch": 2.3359756227492596,
+      "grad_norm": 1.0069321393966675,
+      "learning_rate": 0.00022269588048115752,
+      "loss": 4.6311,
+      "step": 438500
+    },
+    {
+      "epoch": 2.338639220950798,
+      "grad_norm": 1.0736314058303833,
+      "learning_rate": 0.00022180405530742997,
+      "loss": 4.6162,
+      "step": 439000
+    },
+    {
+      "epoch": 2.3413028191523364,
+      "grad_norm": 1.0590038299560547,
+      "learning_rate": 0.00022091223013370243,
+      "loss": 4.6332,
+      "step": 439500
+    },
+    {
+      "epoch": 2.343966417353875,
+      "grad_norm": 1.022923231124878,
+      "learning_rate": 0.0002200204049599749,
+      "loss": 4.6248,
+      "step": 440000
+    },
+    {
+      "epoch": 2.3466300155554136,
+      "grad_norm": 1.0593072175979614,
+      "learning_rate": 0.00021912857978624735,
+      "loss": 4.6273,
+      "step": 440500
+    },
+    {
+      "epoch": 2.3492936137569522,
+      "grad_norm": 1.0082392692565918,
+      "learning_rate": 0.00021823675461251979,
+      "loss": 4.6275,
+      "step": 441000
+    },
+    {
+      "epoch": 2.3519572119584904,
+      "grad_norm": 0.9842462539672852,
+      "learning_rate": 0.00021734492943879224,
+      "loss": 4.6346,
+      "step": 441500
+    },
+    {
+      "epoch": 2.354620810160029,
+      "grad_norm": 1.0930989980697632,
+      "learning_rate": 0.0002164548879154122,
+      "loss": 4.6299,
+      "step": 442000
+    },
+    {
+      "epoch": 2.3572844083615676,
+      "grad_norm": 1.1248174905776978,
+      "learning_rate": 0.00021556306274168465,
+      "loss": 4.6353,
+      "step": 442500
+    },
+    {
+      "epoch": 2.3599480065631058,
+      "grad_norm": 1.0502623319625854,
+      "learning_rate": 0.00021467123756795708,
+      "loss": 4.6322,
+      "step": 443000
+    },
+    {
+      "epoch": 2.3626116047646444,
+      "grad_norm": 1.045857548713684,
+      "learning_rate": 0.00021377941239422954,
+      "loss": 4.6232,
+      "step": 443500
+    },
+    {
+      "epoch": 2.365275202966183,
+      "grad_norm": 1.151315450668335,
+      "learning_rate": 0.00021288937087084945,
+      "loss": 4.6333,
+      "step": 444000
+    },
+    {
+      "epoch": 2.3679388011677216,
+      "grad_norm": 1.0542734861373901,
+      "learning_rate": 0.0002119975456971219,
+      "loss": 4.6246,
+      "step": 444500
+    },
+    {
+      "epoch": 2.3706023993692598,
+      "grad_norm": 1.1092387437820435,
+      "learning_rate": 0.00021110572052339435,
+      "loss": 4.6313,
+      "step": 445000
+    },
+    {
+      "epoch": 2.3732659975707984,
+      "grad_norm": 1.2137620449066162,
+      "learning_rate": 0.0002102138953496668,
+      "loss": 4.631,
+      "step": 445500
+    },
+    {
+      "epoch": 2.375929595772337,
+      "grad_norm": 1.072719931602478,
+      "learning_rate": 0.00020932207017593926,
+      "loss": 4.6199,
+      "step": 446000
+    },
+    {
+      "epoch": 2.3785931939738756,
+      "grad_norm": 1.1971569061279297,
+      "learning_rate": 0.00020843024500221172,
+      "loss": 4.622,
+      "step": 446500
+    },
+    {
+      "epoch": 2.3812567921754138,
+      "grad_norm": 1.0519288778305054,
+      "learning_rate": 0.00020753841982848418,
+      "loss": 4.6261,
+      "step": 447000
+    },
+    {
+      "epoch": 2.3839203903769524,
+      "grad_norm": 1.0470134019851685,
+      "learning_rate": 0.0002066483783051041,
+      "loss": 4.6231,
+      "step": 447500
+    },
+    {
+      "epoch": 2.386583988578491,
+      "grad_norm": 1.2513642311096191,
+      "learning_rate": 0.00020575655313137656,
+      "loss": 4.6348,
+      "step": 448000
+    },
+    {
+      "epoch": 2.3892475867800296,
+      "grad_norm": 1.031900405883789,
+      "learning_rate": 0.00020486472795764902,
+      "loss": 4.6281,
+      "step": 448500
+    },
+    {
+      "epoch": 2.3919111849815677,
+      "grad_norm": 1.0538623332977295,
+      "learning_rate": 0.00020397290278392148,
+      "loss": 4.6284,
+      "step": 449000
+    },
+    {
+      "epoch": 2.3945747831831063,
+      "grad_norm": 1.071651816368103,
+      "learning_rate": 0.00020308107761019394,
+      "loss": 4.6278,
+      "step": 449500
+    },
+    {
+      "epoch": 2.397238381384645,
+      "grad_norm": 1.1340712308883667,
+      "learning_rate": 0.00020219103608681382,
+      "loss": 4.6314,
+      "step": 450000
+    },
+    {
+      "epoch": 2.3999019795861836,
+      "grad_norm": 1.0195579528808594,
+      "learning_rate": 0.00020129921091308628,
+      "loss": 4.6304,
+      "step": 450500
+    },
+    {
+      "epoch": 2.4025655777877217,
+      "grad_norm": 1.0579105615615845,
+      "learning_rate": 0.00020040738573935874,
+      "loss": 4.6223,
+      "step": 451000
+    },
+    {
+      "epoch": 2.4052291759892603,
+      "grad_norm": 1.0337562561035156,
+      "learning_rate": 0.0001995155605656312,
+      "loss": 4.6316,
+      "step": 451500
+    },
+    {
+      "epoch": 2.407892774190799,
+      "grad_norm": 1.085295557975769,
+      "learning_rate": 0.00019862373539190363,
+      "loss": 4.6199,
+      "step": 452000
+    },
+    {
+      "epoch": 2.4105563723923376,
+      "grad_norm": 1.1386431455612183,
+      "learning_rate": 0.00019773369386852358,
+      "loss": 4.6247,
+      "step": 452500
+    },
+    {
+      "epoch": 2.4132199705938757,
+      "grad_norm": 1.0375934839248657,
+      "learning_rate": 0.00019684186869479604,
+      "loss": 4.6259,
+      "step": 453000
+    },
+    {
+      "epoch": 2.4158835687954143,
+      "grad_norm": 1.110255479812622,
+      "learning_rate": 0.0001959500435210685,
+      "loss": 4.6211,
+      "step": 453500
+    },
+    {
+      "epoch": 2.418547166996953,
+      "grad_norm": 1.0886731147766113,
+      "learning_rate": 0.00019505821834734096,
+      "loss": 4.6308,
+      "step": 454000
+    },
+    {
+      "epoch": 2.4212107651984915,
+      "grad_norm": 1.1896620988845825,
+      "learning_rate": 0.0001941663931736134,
+      "loss": 4.6244,
+      "step": 454500
+    },
+    {
+      "epoch": 2.4238743634000297,
+      "grad_norm": 1.076377034187317,
+      "learning_rate": 0.00019327456799988585,
+      "loss": 4.6261,
+      "step": 455000
+    },
+    {
+      "epoch": 2.4265379616015683,
+      "grad_norm": 1.1211566925048828,
+      "learning_rate": 0.0001923827428261583,
+      "loss": 4.627,
+      "step": 455500
+    },
+    {
+      "epoch": 2.429201559803107,
+      "grad_norm": 1.1093415021896362,
+      "learning_rate": 0.00019149091765243077,
+      "loss": 4.6292,
+      "step": 456000
+    },
+    {
+      "epoch": 2.431865158004645,
+      "grad_norm": 1.2548290491104126,
+      "learning_rate": 0.00019060087612905065,
+      "loss": 4.6207,
+      "step": 456500
+    },
+    {
+      "epoch": 2.4345287562061837,
+      "grad_norm": 1.0689791440963745,
+      "learning_rate": 0.0001897108346056706,
+      "loss": 4.622,
+      "step": 457000
+    },
+    {
+      "epoch": 2.4371923544077223,
+      "grad_norm": 1.0006210803985596,
+      "learning_rate": 0.00018881900943194306,
+      "loss": 4.6269,
+      "step": 457500
+    },
+    {
+      "epoch": 2.439855952609261,
+      "grad_norm": 1.0268884897232056,
+      "learning_rate": 0.00018792718425821552,
+      "loss": 4.6215,
+      "step": 458000
+    },
+    {
+      "epoch": 2.4425195508107995,
+      "grad_norm": 1.223487377166748,
+      "learning_rate": 0.00018703535908448795,
+      "loss": 4.6205,
+      "step": 458500
+    },
+    {
+      "epoch": 2.4451831490123377,
+      "grad_norm": 1.104552984237671,
+      "learning_rate": 0.0001861435339107604,
+      "loss": 4.6241,
+      "step": 459000
+    },
+    {
+      "epoch": 2.4478467472138763,
+      "grad_norm": 1.0752313137054443,
+      "learning_rate": 0.00018525349238738032,
+      "loss": 4.6253,
+      "step": 459500
+    },
+    {
+      "epoch": 2.450510345415415,
+      "grad_norm": 1.0842454433441162,
+      "learning_rate": 0.00018436166721365278,
+      "loss": 4.6234,
+      "step": 460000
+    },
+    {
+      "epoch": 2.453173943616953,
+      "grad_norm": 1.0523731708526611,
+      "learning_rate": 0.00018346984203992521,
+      "loss": 4.63,
+      "step": 460500
+    },
+    {
+      "epoch": 2.4558375418184917,
+      "grad_norm": 1.1694416999816895,
+      "learning_rate": 0.00018257801686619767,
+      "loss": 4.6239,
+      "step": 461000
+    },
+    {
+      "epoch": 2.4585011400200303,
+      "grad_norm": 1.0971251726150513,
+      "learning_rate": 0.00018168619169247013,
+      "loss": 4.6215,
+      "step": 461500
+    },
+    {
+      "epoch": 2.461164738221569,
+      "grad_norm": 1.0404231548309326,
+      "learning_rate": 0.00018079615016909008,
+      "loss": 4.6263,
+      "step": 462000
+    },
+    {
+      "epoch": 2.463828336423107,
+      "grad_norm": 1.0926011800765991,
+      "learning_rate": 0.00017990432499536254,
+      "loss": 4.6273,
+      "step": 462500
+    },
+    {
+      "epoch": 2.4664919346246457,
+      "grad_norm": 1.079408884048462,
+      "learning_rate": 0.00017901249982163497,
+      "loss": 4.6124,
+      "step": 463000
+    },
+    {
+      "epoch": 2.4691555328261843,
+      "grad_norm": 1.0728904008865356,
+      "learning_rate": 0.00017812067464790743,
+      "loss": 4.6215,
+      "step": 463500
+    },
+    {
+      "epoch": 2.471819131027723,
+      "grad_norm": 1.2427496910095215,
+      "learning_rate": 0.00017722884947417989,
+      "loss": 4.6177,
+      "step": 464000
+    },
+    {
+      "epoch": 2.474482729229261,
+      "grad_norm": 1.0962218046188354,
+      "learning_rate": 0.00017633702430045235,
+      "loss": 4.6202,
+      "step": 464500
+    },
+    {
+      "epoch": 2.4771463274307997,
+      "grad_norm": 1.0535134077072144,
+      "learning_rate": 0.00017544698277707223,
+      "loss": 4.6229,
+      "step": 465000
+    },
+    {
+      "epoch": 2.4798099256323383,
+      "grad_norm": 1.1047760248184204,
+      "learning_rate": 0.0001745551576033447,
+      "loss": 4.6232,
+      "step": 465500
+    },
+    {
+      "epoch": 2.482473523833877,
+      "grad_norm": 1.0571211576461792,
+      "learning_rate": 0.00017366333242961715,
+      "loss": 4.6182,
+      "step": 466000
+    },
+    {
+      "epoch": 2.485137122035415,
+      "grad_norm": 1.045280933380127,
+      "learning_rate": 0.0001727715072558896,
+      "loss": 4.6214,
+      "step": 466500
+    },
+    {
+      "epoch": 2.4878007202369536,
+      "grad_norm": 1.0921036005020142,
+      "learning_rate": 0.00017187968208216207,
+      "loss": 4.6283,
+      "step": 467000
+    },
+    {
+      "epoch": 2.4904643184384923,
+      "grad_norm": 1.0829055309295654,
+      "learning_rate": 0.00017098785690843453,
+      "loss": 4.6228,
+      "step": 467500
+    },
+    {
+      "epoch": 2.493127916640031,
+      "grad_norm": 1.0832949876785278,
+      "learning_rate": 0.00017009603173470696,
+      "loss": 4.6234,
+      "step": 468000
+    },
+    {
+      "epoch": 2.495791514841569,
+      "grad_norm": 1.1113747358322144,
+      "learning_rate": 0.0001692059902113269,
+      "loss": 4.6245,
+      "step": 468500
+    },
+    {
+      "epoch": 2.4984551130431076,
+      "grad_norm": 1.0775564908981323,
+      "learning_rate": 0.00016831416503759937,
+      "loss": 4.6211,
+      "step": 469000
+    },
+    {
+      "epoch": 2.5011187112446462,
+      "grad_norm": 1.0286856889724731,
+      "learning_rate": 0.00016742233986387182,
+      "loss": 4.6168,
+      "step": 469500
+    },
+    {
+      "epoch": 2.5037823094461844,
+      "grad_norm": 1.1658544540405273,
+      "learning_rate": 0.00016653051469014426,
+      "loss": 4.6178,
+      "step": 470000
+    },
+    {
+      "epoch": 2.506445907647723,
+      "grad_norm": 1.1998695135116577,
+      "learning_rate": 0.00016563868951641672,
+      "loss": 4.6239,
+      "step": 470500
+    },
+    {
+      "epoch": 2.5091095058492616,
+      "grad_norm": 1.065800666809082,
+      "learning_rate": 0.00016474686434268918,
+      "loss": 4.6215,
+      "step": 471000
+    },
+    {
+      "epoch": 2.5117731040508002,
+      "grad_norm": 1.1773850917816162,
+      "learning_rate": 0.0001638568228193091,
+      "loss": 4.6245,
+      "step": 471500
+    },
+    {
+      "epoch": 2.514436702252339,
+      "grad_norm": 1.1137776374816895,
+      "learning_rate": 0.00016296499764558152,
+      "loss": 4.6168,
+      "step": 472000
+    },
+    {
+      "epoch": 2.517100300453877,
+      "grad_norm": 1.0657340288162231,
+      "learning_rate": 0.00016207317247185398,
+      "loss": 4.6185,
+      "step": 472500
+    },
+    {
+      "epoch": 2.5197638986554156,
+      "grad_norm": 1.0470982789993286,
+      "learning_rate": 0.00016118134729812644,
+      "loss": 4.6135,
+      "step": 473000
+    },
+    {
+      "epoch": 2.522427496856954,
+      "grad_norm": 1.116703748703003,
+      "learning_rate": 0.0001602895221243989,
+      "loss": 4.623,
+      "step": 473500
+    },
+    {
+      "epoch": 2.5250910950584924,
+      "grad_norm": 1.0753133296966553,
+      "learning_rate": 0.00015939948060101884,
+      "loss": 4.6198,
+      "step": 474000
+    },
+    {
+      "epoch": 2.527754693260031,
+      "grad_norm": 1.034504771232605,
+      "learning_rate": 0.00015850765542729128,
+      "loss": 4.6167,
+      "step": 474500
+    },
+    {
+      "epoch": 2.5304182914615696,
+      "grad_norm": 1.1084864139556885,
+      "learning_rate": 0.00015761583025356374,
+      "loss": 4.6158,
+      "step": 475000
+    },
+    {
+      "epoch": 2.533081889663108,
+      "grad_norm": 1.1004912853240967,
+      "learning_rate": 0.0001567240050798362,
+      "loss": 4.62,
+      "step": 475500
+    },
+    {
+      "epoch": 2.535745487864647,
+      "grad_norm": 1.0630244016647339,
+      "learning_rate": 0.00015583217990610865,
+      "loss": 4.6224,
+      "step": 476000
+    },
+    {
+      "epoch": 2.538409086066185,
+      "grad_norm": 1.2044382095336914,
+      "learning_rate": 0.00015494213838272854,
+      "loss": 4.621,
+      "step": 476500
+    },
+    {
+      "epoch": 2.5410726842677236,
+      "grad_norm": 1.135984182357788,
+      "learning_rate": 0.000154050313209001,
+      "loss": 4.6186,
+      "step": 477000
+    },
+    {
+      "epoch": 2.543736282469262,
+      "grad_norm": 1.026955008506775,
+      "learning_rate": 0.00015315848803527346,
+      "loss": 4.6232,
+      "step": 477500
+    },
+    {
+      "epoch": 2.5463998806708004,
+      "grad_norm": 1.180627465248108,
+      "learning_rate": 0.00015226666286154592,
+      "loss": 4.616,
+      "step": 478000
+    },
+    {
+      "epoch": 2.549063478872339,
+      "grad_norm": 1.1590373516082764,
+      "learning_rate": 0.00015137483768781838,
+      "loss": 4.6181,
+      "step": 478500
+    },
+    {
+      "epoch": 2.5517270770738776,
+      "grad_norm": 1.1868000030517578,
+      "learning_rate": 0.0001504847961644383,
+      "loss": 4.6204,
+      "step": 479000
+    },
+    {
+      "epoch": 2.554390675275416,
+      "grad_norm": 1.1171778440475464,
+      "learning_rate": 0.00014959297099071076,
+      "loss": 4.612,
+      "step": 479500
+    },
+    {
+      "epoch": 2.557054273476955,
+      "grad_norm": 1.1593362092971802,
+      "learning_rate": 0.00014870114581698321,
+      "loss": 4.6143,
+      "step": 480000
+    },
+    {
+      "epoch": 2.559717871678493,
+      "grad_norm": 1.047542691230774,
+      "learning_rate": 0.00014780932064325567,
+      "loss": 4.6133,
+      "step": 480500
+    },
+    {
+      "epoch": 2.5623814698800316,
+      "grad_norm": 1.1630990505218506,
+      "learning_rate": 0.00014691749546952813,
+      "loss": 4.6167,
+      "step": 481000
+    },
+    {
+      "epoch": 2.56504506808157,
+      "grad_norm": 1.067874789237976,
+      "learning_rate": 0.00014602567029580057,
+      "loss": 4.6257,
+      "step": 481500
+    },
+    {
+      "epoch": 2.5677086662831083,
+      "grad_norm": 1.2333664894104004,
+      "learning_rate": 0.00014513384512207303,
+      "loss": 4.621,
+      "step": 482000
+    },
+    {
+      "epoch": 2.570372264484647,
+      "grad_norm": 1.1577945947647095,
+      "learning_rate": 0.00014424380359869294,
+      "loss": 4.619,
+      "step": 482500
+    },
+    {
+      "epoch": 2.5730358626861856,
+      "grad_norm": 1.1029491424560547,
+      "learning_rate": 0.0001433519784249654,
+      "loss": 4.6151,
+      "step": 483000
+    },
+    {
+      "epoch": 2.575699460887724,
+      "grad_norm": 1.076328158378601,
+      "learning_rate": 0.00014246015325123783,
+      "loss": 4.6231,
+      "step": 483500
+    },
+    {
+      "epoch": 2.5783630590892628,
+      "grad_norm": 1.164756178855896,
+      "learning_rate": 0.0001415683280775103,
+      "loss": 4.6106,
+      "step": 484000
+    },
+    {
+      "epoch": 2.581026657290801,
+      "grad_norm": 1.0658756494522095,
+      "learning_rate": 0.00014067650290378275,
+      "loss": 4.622,
+      "step": 484500
+    },
+    {
+      "epoch": 2.5836902554923395,
+      "grad_norm": 1.08512282371521,
+      "learning_rate": 0.00013978467773005524,
+      "loss": 4.6156,
+      "step": 485000
+    },
+    {
+      "epoch": 2.586353853693878,
+      "grad_norm": 1.2632811069488525,
+      "learning_rate": 0.0001388928525563277,
+      "loss": 4.6186,
+      "step": 485500
+    },
+    {
+      "epoch": 2.5890174518954163,
+      "grad_norm": 1.0426981449127197,
+      "learning_rate": 0.00013800102738260016,
+      "loss": 4.6172,
+      "step": 486000
+    },
+    {
+      "epoch": 2.591681050096955,
+      "grad_norm": 1.0602271556854248,
+      "learning_rate": 0.00013711098585922005,
+      "loss": 4.617,
+      "step": 486500
+    },
+    {
+      "epoch": 2.5943446482984935,
+      "grad_norm": 1.0918567180633545,
+      "learning_rate": 0.0001362191606854925,
+      "loss": 4.6205,
+      "step": 487000
+    },
+    {
+      "epoch": 2.5970082465000317,
+      "grad_norm": 1.1476528644561768,
+      "learning_rate": 0.00013532911916211242,
+      "loss": 4.6164,
+      "step": 487500
+    },
+    {
+      "epoch": 2.5996718447015703,
+      "grad_norm": 1.0901427268981934,
+      "learning_rate": 0.00013443729398838485,
+      "loss": 4.6212,
+      "step": 488000
+    },
+    {
+      "epoch": 2.602335442903109,
+      "grad_norm": 1.1208913326263428,
+      "learning_rate": 0.0001335454688146573,
+      "loss": 4.6151,
+      "step": 488500
+    },
+    {
+      "epoch": 2.6049990411046475,
+      "grad_norm": 1.1271238327026367,
+      "learning_rate": 0.00013265364364092977,
+      "loss": 4.6199,
+      "step": 489000
+    },
+    {
+      "epoch": 2.607662639306186,
+      "grad_norm": 1.0943602323532104,
+      "learning_rate": 0.0001317636021175497,
+      "loss": 4.6166,
+      "step": 489500
+    },
+    {
+      "epoch": 2.6103262375077243,
+      "grad_norm": 1.1179605722427368,
+      "learning_rate": 0.00013087177694382215,
+      "loss": 4.6055,
+      "step": 490000
+    },
+    {
+      "epoch": 2.612989835709263,
+      "grad_norm": 1.107720971107483,
+      "learning_rate": 0.0001299799517700946,
+      "loss": 4.6132,
+      "step": 490500
+    },
+    {
+      "epoch": 2.6156534339108015,
+      "grad_norm": 1.0601928234100342,
+      "learning_rate": 0.00012908812659636706,
+      "loss": 4.6083,
+      "step": 491000
+    },
+    {
+      "epoch": 2.6183170321123397,
+      "grad_norm": 1.1014827489852905,
+      "learning_rate": 0.00012819630142263952,
+      "loss": 4.6125,
+      "step": 491500
+    },
+    {
+      "epoch": 2.6209806303138783,
+      "grad_norm": 1.2044124603271484,
+      "learning_rate": 0.0001273062598992594,
+      "loss": 4.6232,
+      "step": 492000
+    },
+    {
+      "epoch": 2.623644228515417,
+      "grad_norm": 1.0993869304656982,
+      "learning_rate": 0.00012641443472553187,
+      "loss": 4.6082,
+      "step": 492500
+    },
+    {
+      "epoch": 2.6263078267169555,
+      "grad_norm": 1.161431074142456,
+      "learning_rate": 0.00012552260955180433,
+      "loss": 4.6177,
+      "step": 493000
+    },
+    {
+      "epoch": 2.628971424918494,
+      "grad_norm": 1.0688318014144897,
+      "learning_rate": 0.0001246307843780768,
+      "loss": 4.6124,
+      "step": 493500
+    },
+    {
+      "epoch": 2.6316350231200323,
+      "grad_norm": 1.0411505699157715,
+      "learning_rate": 0.00012373895920434925,
+      "loss": 4.6142,
+      "step": 494000
+    },
+    {
+      "epoch": 2.634298621321571,
+      "grad_norm": 1.101181983947754,
+      "learning_rate": 0.0001228471340306217,
+      "loss": 4.6185,
+      "step": 494500
+    },
+    {
+      "epoch": 2.6369622195231095,
+      "grad_norm": 1.0938246250152588,
+      "learning_rate": 0.00012195530885689417,
+      "loss": 4.6202,
+      "step": 495000
+    },
+    {
+      "epoch": 2.6396258177246477,
+      "grad_norm": 1.137458086013794,
+      "learning_rate": 0.00012106348368316663,
+      "loss": 4.6102,
+      "step": 495500
+    },
+    {
+      "epoch": 2.6422894159261863,
+      "grad_norm": 1.1052279472351074,
+      "learning_rate": 0.00012017522581013399,
+      "loss": 4.6173,
+      "step": 496000
+    },
+    {
+      "epoch": 2.644953014127725,
+      "grad_norm": 1.1576839685440063,
+      "learning_rate": 0.00011928340063640645,
+      "loss": 4.6115,
+      "step": 496500
+    },
+    {
+      "epoch": 2.6476166123292635,
+      "grad_norm": 1.149245023727417,
+      "learning_rate": 0.00011839157546267889,
+      "loss": 4.6142,
+      "step": 497000
+    },
+    {
+      "epoch": 2.650280210530802,
+      "grad_norm": 1.1047520637512207,
+      "learning_rate": 0.00011749975028895136,
+      "loss": 4.6143,
+      "step": 497500
+    },
+    {
+      "epoch": 2.6529438087323403,
+      "grad_norm": 1.2275629043579102,
+      "learning_rate": 0.00011660792511522382,
+      "loss": 4.6131,
+      "step": 498000
+    },
+    {
+      "epoch": 2.655607406933879,
+      "grad_norm": 1.0445078611373901,
+      "learning_rate": 0.00011571788359184373,
+      "loss": 4.6088,
+      "step": 498500
+    },
+    {
+      "epoch": 2.6582710051354175,
+      "grad_norm": 1.119834065437317,
+      "learning_rate": 0.00011482605841811617,
+      "loss": 4.6125,
+      "step": 499000
+    },
+    {
+      "epoch": 2.6609346033369556,
+      "grad_norm": 1.1206032037734985,
+      "learning_rate": 0.00011393423324438863,
+      "loss": 4.6185,
+      "step": 499500
+    },
+    {
+      "epoch": 2.6635982015384942,
+      "grad_norm": 1.3817057609558105,
+      "learning_rate": 0.0001130424080706611,
+      "loss": 4.6119,
+      "step": 500000
+    },
+    {
+      "epoch": 2.666261799740033,
+      "grad_norm": 1.1292685270309448,
+      "learning_rate": 0.00011215058289693356,
+      "loss": 4.6121,
+      "step": 500500
+    },
+    {
+      "epoch": 2.6689253979415715,
+      "grad_norm": 1.1789071559906006,
+      "learning_rate": 0.00011125875772320601,
+      "loss": 4.6178,
+      "step": 501000
+    },
+    {
+      "epoch": 2.67158899614311,
+      "grad_norm": 1.1726536750793457,
+      "learning_rate": 0.00011036871619982591,
+      "loss": 4.6096,
+      "step": 501500
+    },
+    {
+      "epoch": 2.6742525943446482,
+      "grad_norm": 1.1307861804962158,
+      "learning_rate": 0.00010947689102609837,
+      "loss": 4.6142,
+      "step": 502000
+    },
+    {
+      "epoch": 2.676916192546187,
+      "grad_norm": 1.2103127241134644,
+      "learning_rate": 0.00010858506585237083,
+      "loss": 4.6082,
+      "step": 502500
+    },
+    {
+      "epoch": 2.6795797907477255,
+      "grad_norm": 1.0934276580810547,
+      "learning_rate": 0.00010769324067864329,
+      "loss": 4.6164,
+      "step": 503000
+    },
+    {
+      "epoch": 2.6822433889492636,
+      "grad_norm": 1.232783555984497,
+      "learning_rate": 0.00010680141550491575,
+      "loss": 4.6195,
+      "step": 503500
+    },
+    {
+      "epoch": 2.6849069871508022,
+      "grad_norm": 1.0889538526535034,
+      "learning_rate": 0.00010591137398153565,
+      "loss": 4.6099,
+      "step": 504000
+    },
+    {
+      "epoch": 2.687570585352341,
+      "grad_norm": 1.0930888652801514,
+      "learning_rate": 0.00010501954880780811,
+      "loss": 4.6079,
+      "step": 504500
+    },
+    {
+      "epoch": 2.690234183553879,
+      "grad_norm": 1.108748197555542,
+      "learning_rate": 0.00010412772363408056,
+      "loss": 4.6141,
+      "step": 505000
+    },
+    {
+      "epoch": 2.6928977817554176,
+      "grad_norm": 1.1860270500183105,
+      "learning_rate": 0.00010323589846035303,
+      "loss": 4.6107,
+      "step": 505500
+    },
+    {
+      "epoch": 2.695561379956956,
+      "grad_norm": 1.1693322658538818,
+      "learning_rate": 0.00010234407328662549,
+      "loss": 4.6087,
+      "step": 506000
+    },
+    {
+      "epoch": 2.698224978158495,
+      "grad_norm": 1.169573187828064,
+      "learning_rate": 0.00010145224811289793,
+      "loss": 4.6091,
+      "step": 506500
+    },
+    {
+      "epoch": 2.7008885763600334,
+      "grad_norm": 1.126935601234436,
+      "learning_rate": 0.00010056220658951784,
+      "loss": 4.6041,
+      "step": 507000
+    },
+    {
+      "epoch": 2.7035521745615716,
+      "grad_norm": 1.132071614265442,
+      "learning_rate": 9.96703814157903e-05,
+      "loss": 4.6176,
+      "step": 507500
+    },
+    {
+      "epoch": 2.70621577276311,
+      "grad_norm": 1.1209650039672852,
+      "learning_rate": 9.877855624206277e-05,
+      "loss": 4.6115,
+      "step": 508000
+    },
+    {
+      "epoch": 2.708879370964649,
+      "grad_norm": 1.1064993143081665,
+      "learning_rate": 9.788673106833521e-05,
+      "loss": 4.614,
+      "step": 508500
+    },
+    {
+      "epoch": 2.711542969166187,
+      "grad_norm": 1.2343615293502808,
+      "learning_rate": 9.699490589460767e-05,
+      "loss": 4.6112,
+      "step": 509000
+    },
+    {
+      "epoch": 2.7142065673677256,
+      "grad_norm": 1.1082515716552734,
+      "learning_rate": 9.610486437122757e-05,
+      "loss": 4.6153,
+      "step": 509500
+    },
+    {
+      "epoch": 2.716870165569264,
+      "grad_norm": 1.058441162109375,
+      "learning_rate": 9.521303919750003e-05,
+      "loss": 4.6144,
+      "step": 510000
+    },
+    {
+      "epoch": 2.719533763770803,
+      "grad_norm": 1.2399941682815552,
+      "learning_rate": 9.43212140237725e-05,
+      "loss": 4.6078,
+      "step": 510500
+    },
+    {
+      "epoch": 2.7221973619723414,
+      "grad_norm": 1.1185581684112549,
+      "learning_rate": 9.342938885004495e-05,
+      "loss": 4.6106,
+      "step": 511000
+    },
+    {
+      "epoch": 2.7248609601738796,
+      "grad_norm": 1.1241427659988403,
+      "learning_rate": 9.253934732666485e-05,
+      "loss": 4.6117,
+      "step": 511500
+    },
+    {
+      "epoch": 2.727524558375418,
+      "grad_norm": 1.118444800376892,
+      "learning_rate": 9.164752215293731e-05,
+      "loss": 4.6118,
+      "step": 512000
+    },
+    {
+      "epoch": 2.730188156576957,
+      "grad_norm": 1.1134285926818848,
+      "learning_rate": 9.075569697920977e-05,
+      "loss": 4.6044,
+      "step": 512500
+    },
+    {
+      "epoch": 2.732851754778495,
+      "grad_norm": 1.1537599563598633,
+      "learning_rate": 8.986387180548223e-05,
+      "loss": 4.6185,
+      "step": 513000
+    },
+    {
+      "epoch": 2.7355153529800336,
+      "grad_norm": 1.1125168800354004,
+      "learning_rate": 8.897204663175469e-05,
+      "loss": 4.6028,
+      "step": 513500
+    },
+    {
+      "epoch": 2.738178951181572,
+      "grad_norm": 1.1752519607543945,
+      "learning_rate": 8.808022145802715e-05,
+      "loss": 4.6123,
+      "step": 514000
+    },
+    {
+      "epoch": 2.740842549383111,
+      "grad_norm": 1.105495572090149,
+      "learning_rate": 8.719017993464705e-05,
+      "loss": 4.6159,
+      "step": 514500
+    },
+    {
+      "epoch": 2.7435061475846494,
+      "grad_norm": 1.0856335163116455,
+      "learning_rate": 8.62983547609195e-05,
+      "loss": 4.609,
+      "step": 515000
+    },
+    {
+      "epoch": 2.7461697457861876,
+      "grad_norm": 1.145843505859375,
+      "learning_rate": 8.540652958719197e-05,
+      "loss": 4.6083,
+      "step": 515500
+    },
+    {
+      "epoch": 2.748833343987726,
+      "grad_norm": 1.1720407009124756,
+      "learning_rate": 8.451470441346443e-05,
+      "loss": 4.6036,
+      "step": 516000
+    },
+    {
+      "epoch": 2.7514969421892648,
+      "grad_norm": 1.2031077146530151,
+      "learning_rate": 8.362287923973688e-05,
+      "loss": 4.6093,
+      "step": 516500
+    },
+    {
+      "epoch": 2.754160540390803,
+      "grad_norm": 1.2993487119674683,
+      "learning_rate": 8.273283771635678e-05,
+      "loss": 4.6076,
+      "step": 517000
+    },
+    {
+      "epoch": 2.7568241385923415,
+      "grad_norm": 1.0932821035385132,
+      "learning_rate": 8.184101254262924e-05,
+      "loss": 4.5978,
+      "step": 517500
+    },
+    {
+      "epoch": 2.75948773679388,
+      "grad_norm": 1.068040370941162,
+      "learning_rate": 8.09491873689017e-05,
+      "loss": 4.6077,
+      "step": 518000
+    },
+    {
+      "epoch": 2.7621513349954188,
+      "grad_norm": 1.0666356086730957,
+      "learning_rate": 8.005736219517416e-05,
+      "loss": 4.6068,
+      "step": 518500
+    },
+    {
+      "epoch": 2.7648149331969574,
+      "grad_norm": 1.1699191331863403,
+      "learning_rate": 7.916732067179406e-05,
+      "loss": 4.604,
+      "step": 519000
+    },
+    {
+      "epoch": 2.7674785313984955,
+      "grad_norm": 1.1018375158309937,
+      "learning_rate": 7.827549549806652e-05,
+      "loss": 4.6085,
+      "step": 519500
+    },
+    {
+      "epoch": 2.770142129600034,
+      "grad_norm": 1.2034190893173218,
+      "learning_rate": 7.738367032433898e-05,
+      "loss": 4.6146,
+      "step": 520000
+    },
+    {
+      "epoch": 2.7728057278015728,
+      "grad_norm": 1.1737667322158813,
+      "learning_rate": 7.649184515061142e-05,
+      "loss": 4.6117,
+      "step": 520500
+    },
+    {
+      "epoch": 2.775469326003111,
+      "grad_norm": 1.1514512300491333,
+      "learning_rate": 7.56000199768839e-05,
+      "loss": 4.6019,
+      "step": 521000
+    },
+    {
+      "epoch": 2.7781329242046495,
+      "grad_norm": 1.0964044332504272,
+      "learning_rate": 7.470819480315636e-05,
+      "loss": 4.616,
+      "step": 521500
+    },
+    {
+      "epoch": 2.780796522406188,
+      "grad_norm": 1.3086357116699219,
+      "learning_rate": 7.381815327977626e-05,
+      "loss": 4.6071,
+      "step": 522000
+    },
+    {
+      "epoch": 2.7834601206077263,
+      "grad_norm": 1.073895812034607,
+      "learning_rate": 7.292632810604872e-05,
+      "loss": 4.6065,
+      "step": 522500
+    },
+    {
+      "epoch": 2.786123718809265,
+      "grad_norm": 1.1826096773147583,
+      "learning_rate": 7.203450293232116e-05,
+      "loss": 4.6063,
+      "step": 523000
+    },
+    {
+      "epoch": 2.7887873170108035,
+      "grad_norm": 1.230764627456665,
+      "learning_rate": 7.114267775859364e-05,
+      "loss": 4.6069,
+      "step": 523500
+    },
+    {
+      "epoch": 2.791450915212342,
+      "grad_norm": 1.2007604837417603,
+      "learning_rate": 7.025263623521354e-05,
+      "loss": 4.5989,
+      "step": 524000
+    },
+    {
+      "epoch": 2.7941145134138807,
+      "grad_norm": 1.0956413745880127,
+      "learning_rate": 6.9360811061486e-05,
+      "loss": 4.6065,
+      "step": 524500
+    },
+    {
+      "epoch": 2.796778111615419,
+      "grad_norm": 1.1486014127731323,
+      "learning_rate": 6.846898588775844e-05,
+      "loss": 4.6125,
+      "step": 525000
+    },
+    {
+      "epoch": 2.7994417098169575,
+      "grad_norm": 1.0698477029800415,
+      "learning_rate": 6.75771607140309e-05,
+      "loss": 4.6129,
+      "step": 525500
+    },
+    {
+      "epoch": 2.802105308018496,
+      "grad_norm": 1.1725722551345825,
+      "learning_rate": 6.668711919065082e-05,
+      "loss": 4.6064,
+      "step": 526000
+    },
+    {
+      "epoch": 2.8047689062200343,
+      "grad_norm": 1.1817371845245361,
+      "learning_rate": 6.579529401692328e-05,
+      "loss": 4.6115,
+      "step": 526500
+    },
+    {
+      "epoch": 2.807432504421573,
+      "grad_norm": 1.0840002298355103,
+      "learning_rate": 6.490346884319572e-05,
+      "loss": 4.6077,
+      "step": 527000
+    },
+    {
+      "epoch": 2.8100961026231115,
+      "grad_norm": 1.2627172470092773,
+      "learning_rate": 6.401164366946818e-05,
+      "loss": 4.5979,
+      "step": 527500
+    },
+    {
+      "epoch": 2.81275970082465,
+      "grad_norm": 1.1478033065795898,
+      "learning_rate": 6.311981849574064e-05,
+      "loss": 4.6051,
+      "step": 528000
+    },
+    {
+      "epoch": 2.8154232990261887,
+      "grad_norm": 1.1611443758010864,
+      "learning_rate": 6.222977697236056e-05,
+      "loss": 4.6019,
+      "step": 528500
+    },
+    {
+      "epoch": 2.818086897227727,
+      "grad_norm": 1.2540146112442017,
+      "learning_rate": 6.1337951798633e-05,
+      "loss": 4.6126,
+      "step": 529000
+    },
+    {
+      "epoch": 2.8207504954292655,
+      "grad_norm": 1.1421033143997192,
+      "learning_rate": 6.0446126624905464e-05,
+      "loss": 4.6093,
+      "step": 529500
+    },
+    {
+      "epoch": 2.823414093630804,
+      "grad_norm": 1.157571792602539,
+      "learning_rate": 5.955430145117793e-05,
+      "loss": 4.5974,
+      "step": 530000
+    },
+    {
+      "epoch": 2.8260776918323423,
+      "grad_norm": 1.2044634819030762,
+      "learning_rate": 5.866247627745038e-05,
+      "loss": 4.6016,
+      "step": 530500
+    },
+    {
+      "epoch": 2.828741290033881,
+      "grad_norm": 1.1470133066177368,
+      "learning_rate": 5.777243475407029e-05,
+      "loss": 4.6065,
+      "step": 531000
+    },
+    {
+      "epoch": 2.8314048882354195,
+      "grad_norm": 1.1482868194580078,
+      "learning_rate": 5.6880609580342744e-05,
+      "loss": 4.5994,
+      "step": 531500
+    },
+    {
+      "epoch": 2.834068486436958,
+      "grad_norm": 1.1420148611068726,
+      "learning_rate": 5.59887844066152e-05,
+      "loss": 4.6044,
+      "step": 532000
+    },
+    {
+      "epoch": 2.8367320846384967,
+      "grad_norm": 1.1463284492492676,
+      "learning_rate": 5.509695923288766e-05,
+      "loss": 4.6069,
+      "step": 532500
+    },
+    {
+      "epoch": 2.839395682840035,
+      "grad_norm": 1.1625584363937378,
+      "learning_rate": 5.4205134059160115e-05,
+      "loss": 4.6004,
+      "step": 533000
+    },
+    {
+      "epoch": 2.8420592810415735,
+      "grad_norm": 1.1769341230392456,
+      "learning_rate": 5.3313308885432575e-05,
+      "loss": 4.6074,
+      "step": 533500
+    },
+    {
+      "epoch": 2.844722879243112,
+      "grad_norm": 1.1729334592819214,
+      "learning_rate": 5.242326736205248e-05,
+      "loss": 4.6021,
+      "step": 534000
+    },
+    {
+      "epoch": 2.8473864774446502,
+      "grad_norm": 1.0966566801071167,
+      "learning_rate": 5.1531442188324936e-05,
+      "loss": 4.6069,
+      "step": 534500
+    },
+    {
+      "epoch": 2.850050075646189,
+      "grad_norm": 1.1562509536743164,
+      "learning_rate": 5.06396170145974e-05,
+      "loss": 4.6079,
+      "step": 535000
+    },
+    {
+      "epoch": 2.8527136738477274,
+      "grad_norm": 1.0936706066131592,
+      "learning_rate": 4.9747791840869855e-05,
+      "loss": 4.6052,
+      "step": 535500
+    },
+    {
+      "epoch": 2.855377272049266,
+      "grad_norm": 1.1146814823150635,
+      "learning_rate": 4.885596666714231e-05,
+      "loss": 4.6007,
+      "step": 536000
+    },
+    {
+      "epoch": 2.8580408702508047,
+      "grad_norm": 1.1146438121795654,
+      "learning_rate": 4.7965925143762216e-05,
+      "loss": 4.6097,
+      "step": 536500
+    },
+    {
+      "epoch": 2.860704468452343,
+      "grad_norm": 1.1392590999603271,
+      "learning_rate": 4.7074099970034675e-05,
+      "loss": 4.6014,
+      "step": 537000
+    },
+    {
+      "epoch": 2.8633680666538814,
+      "grad_norm": 1.1630158424377441,
+      "learning_rate": 4.6182274796307135e-05,
+      "loss": 4.6029,
+      "step": 537500
+    },
+    {
+      "epoch": 2.86603166485542,
+      "grad_norm": 1.1878501176834106,
+      "learning_rate": 4.529044962257959e-05,
+      "loss": 4.6065,
+      "step": 538000
+    },
+    {
+      "epoch": 2.868695263056958,
+      "grad_norm": 1.2973501682281494,
+      "learning_rate": 4.4400408099199496e-05,
+      "loss": 4.6,
+      "step": 538500
+    },
+    {
+      "epoch": 2.871358861258497,
+      "grad_norm": 1.136915683746338,
+      "learning_rate": 4.3508582925471955e-05,
+      "loss": 4.6006,
+      "step": 539000
+    },
+    {
+      "epoch": 2.8740224594600354,
+      "grad_norm": 1.2329761981964111,
+      "learning_rate": 4.261675775174441e-05,
+      "loss": 4.6019,
+      "step": 539500
+    },
+    {
+      "epoch": 2.8766860576615736,
+      "grad_norm": 1.1819766759872437,
+      "learning_rate": 4.172493257801686e-05,
+      "loss": 4.5992,
+      "step": 540000
+    },
+    {
+      "epoch": 2.879349655863112,
+      "grad_norm": 1.116248369216919,
+      "learning_rate": 4.083489105463678e-05,
+      "loss": 4.5994,
+      "step": 540500
+    },
+    {
+      "epoch": 2.882013254064651,
+      "grad_norm": 1.3588722944259644,
+      "learning_rate": 3.9943065880909235e-05,
+      "loss": 4.605,
+      "step": 541000
+    },
+    {
+      "epoch": 2.8846768522661894,
+      "grad_norm": 1.2594339847564697,
+      "learning_rate": 3.905124070718169e-05,
+      "loss": 4.5973,
+      "step": 541500
+    },
+    {
+      "epoch": 2.887340450467728,
+      "grad_norm": 1.1628178358078003,
+      "learning_rate": 3.815941553345415e-05,
+      "loss": 4.6019,
+      "step": 542000
+    },
+    {
+      "epoch": 2.890004048669266,
+      "grad_norm": 1.2354239225387573,
+      "learning_rate": 3.72675903597266e-05,
+      "loss": 4.5984,
+      "step": 542500
+    },
+    {
+      "epoch": 2.892667646870805,
+      "grad_norm": 1.2508246898651123,
+      "learning_rate": 3.6377548836346515e-05,
+      "loss": 4.5962,
+      "step": 543000
+    },
+    {
+      "epoch": 2.8953312450723434,
+      "grad_norm": 1.1606773138046265,
+      "learning_rate": 3.548572366261897e-05,
+      "loss": 4.6024,
+      "step": 543500
+    },
+    {
+      "epoch": 2.8979948432738816,
+      "grad_norm": 1.2162941694259644,
+      "learning_rate": 3.459389848889143e-05,
+      "loss": 4.6087,
+      "step": 544000
+    },
+    {
+      "epoch": 2.90065844147542,
+      "grad_norm": 1.373126745223999,
+      "learning_rate": 3.370207331516388e-05,
+      "loss": 4.5972,
+      "step": 544500
+    },
+    {
+      "epoch": 2.903322039676959,
+      "grad_norm": 1.1077393293380737,
+      "learning_rate": 3.281203179178379e-05,
+      "loss": 4.6033,
+      "step": 545000
+    },
+    {
+      "epoch": 2.9059856378784974,
+      "grad_norm": 1.1094976663589478,
+      "learning_rate": 3.1920206618056255e-05,
+      "loss": 4.5966,
+      "step": 545500
+    },
+    {
+      "epoch": 2.908649236080036,
+      "grad_norm": 1.182131052017212,
+      "learning_rate": 3.102838144432871e-05,
+      "loss": 4.6056,
+      "step": 546000
+    },
+    {
+      "epoch": 2.911312834281574,
+      "grad_norm": 1.2314406633377075,
+      "learning_rate": 3.0136556270601163e-05,
+      "loss": 4.5982,
+      "step": 546500
+    },
+    {
+      "epoch": 2.9139764324831128,
+      "grad_norm": 1.1318516731262207,
+      "learning_rate": 2.9244731096873616e-05,
+      "loss": 4.5985,
+      "step": 547000
+    },
+    {
+      "epoch": 2.9166400306846514,
+      "grad_norm": 1.1479239463806152,
+      "learning_rate": 2.835468957349353e-05,
+      "loss": 4.6081,
+      "step": 547500
+    },
+    {
+      "epoch": 2.9193036288861895,
+      "grad_norm": 1.1278290748596191,
+      "learning_rate": 2.7462864399765984e-05,
+      "loss": 4.6014,
+      "step": 548000
+    },
+    {
+      "epoch": 2.921967227087728,
+      "grad_norm": 1.300802230834961,
+      "learning_rate": 2.6571039226038443e-05,
+      "loss": 4.6044,
+      "step": 548500
+    },
+    {
+      "epoch": 2.9246308252892668,
+      "grad_norm": 1.1365079879760742,
+      "learning_rate": 2.56792140523109e-05,
+      "loss": 4.5968,
+      "step": 549000
+    },
+    {
+      "epoch": 2.9272944234908054,
+      "grad_norm": 1.1759607791900635,
+      "learning_rate": 2.4787388878583352e-05,
+      "loss": 4.6017,
+      "step": 549500
+    },
+    {
+      "epoch": 2.929958021692344,
+      "grad_norm": 1.2129359245300293,
+      "learning_rate": 2.389556370485581e-05,
+      "loss": 4.5963,
+      "step": 550000
+    },
+    {
+      "epoch": 2.932621619893882,
+      "grad_norm": 1.1694817543029785,
+      "learning_rate": 2.300552218147572e-05,
+      "loss": 4.6016,
+      "step": 550500
+    },
+    {
+      "epoch": 2.9352852180954208,
+      "grad_norm": 1.108017086982727,
+      "learning_rate": 2.211369700774818e-05,
+      "loss": 4.6028,
+      "step": 551000
+    },
+    {
+      "epoch": 2.9379488162969594,
+      "grad_norm": 1.1087723970413208,
+      "learning_rate": 2.1221871834020636e-05,
+      "loss": 4.5982,
+      "step": 551500
+    },
+    {
+      "epoch": 2.9406124144984975,
+      "grad_norm": 1.055584192276001,
+      "learning_rate": 2.0330046660293088e-05,
+      "loss": 4.6014,
+      "step": 552000
+    },
+    {
+      "epoch": 2.943276012700036,
+      "grad_norm": 1.1524064540863037,
+      "learning_rate": 1.9440005136913003e-05,
+      "loss": 4.5961,
+      "step": 552500
+    },
+    {
+      "epoch": 2.9459396109015747,
+      "grad_norm": 1.16587233543396,
+      "learning_rate": 1.8548179963185456e-05,
+      "loss": 4.5915,
+      "step": 553000
+    },
+    {
+      "epoch": 2.9486032091031134,
+      "grad_norm": 1.1600918769836426,
+      "learning_rate": 1.7656354789457912e-05,
+      "loss": 4.596,
+      "step": 553500
+    },
+    {
+      "epoch": 2.951266807304652,
+      "grad_norm": 1.1187764406204224,
+      "learning_rate": 1.676452961573037e-05,
+      "loss": 4.6055,
+      "step": 554000
+    },
+    {
+      "epoch": 2.95393040550619,
+      "grad_norm": 1.2266861200332642,
+      "learning_rate": 1.5872704442002824e-05,
+      "loss": 4.6004,
+      "step": 554500
+    },
+    {
+      "epoch": 2.9565940037077287,
+      "grad_norm": 1.130671739578247,
+      "learning_rate": 1.4980879268275282e-05,
+      "loss": 4.5992,
+      "step": 555000
+    },
+    {
+      "epoch": 2.9592576019092673,
+      "grad_norm": 1.1526157855987549,
+      "learning_rate": 1.4089054094547738e-05,
+      "loss": 4.6047,
+      "step": 555500
+    },
+    {
+      "epoch": 2.9619212001108055,
+      "grad_norm": 1.2285641431808472,
+      "learning_rate": 1.3197228920820194e-05,
+      "loss": 4.5974,
+      "step": 556000
+    },
+    {
+      "epoch": 2.964584798312344,
+      "grad_norm": 1.1854966878890991,
+      "learning_rate": 1.2307187397440106e-05,
+      "loss": 4.5995,
+      "step": 556500
+    },
+    {
+      "epoch": 2.9672483965138827,
+      "grad_norm": 1.1808573007583618,
+      "learning_rate": 1.141536222371256e-05,
+      "loss": 4.6021,
+      "step": 557000
+    },
+    {
+      "epoch": 2.9699119947154213,
+      "grad_norm": 1.1743810176849365,
+      "learning_rate": 1.0523537049985018e-05,
+      "loss": 4.5979,
+      "step": 557500
+    },
+    {
+      "epoch": 2.9725755929169595,
+      "grad_norm": 1.172972321510315,
+      "learning_rate": 9.631711876257474e-06,
+      "loss": 4.5972,
+      "step": 558000
+    },
+    {
+      "epoch": 2.975239191118498,
+      "grad_norm": 1.1044169664382935,
+      "learning_rate": 8.741670352877386e-06,
+      "loss": 4.5968,
+      "step": 558500
+    },
+    {
+      "epoch": 2.9779027893200367,
+      "grad_norm": 1.1353402137756348,
+      "learning_rate": 7.84984517914984e-06,
+      "loss": 4.597,
+      "step": 559000
+    },
+    {
+      "epoch": 2.9805663875215753,
+      "grad_norm": 1.1849350929260254,
+      "learning_rate": 6.958020005422297e-06,
+      "loss": 4.6015,
+      "step": 559500
+    },
+    {
+      "epoch": 2.9832299857231135,
+      "grad_norm": 1.2167035341262817,
+      "learning_rate": 6.066194831694753e-06,
+      "loss": 4.5984,
+      "step": 560000
+    },
+    {
+      "epoch": 2.985893583924652,
+      "grad_norm": 1.1984131336212158,
+      "learning_rate": 5.176153308314665e-06,
+      "loss": 4.5977,
+      "step": 560500
+    },
+    {
+      "epoch": 2.9885571821261907,
+      "grad_norm": 1.148808240890503,
+      "learning_rate": 4.2843281345871205e-06,
+      "loss": 4.592,
+      "step": 561000
+    },
+    {
+      "epoch": 2.991220780327729,
+      "grad_norm": 1.1721874475479126,
+      "learning_rate": 3.392502960859577e-06,
+      "loss": 4.5946,
+      "step": 561500
+    },
+    {
+      "epoch": 2.9938843785292675,
+      "grad_norm": 1.171322226524353,
+      "learning_rate": 2.500677787132033e-06,
+      "loss": 4.6057,
+      "step": 562000
+    },
+    {
+      "epoch": 2.996547976730806,
+      "grad_norm": 1.1349517107009888,
+      "learning_rate": 1.608852613404489e-06,
+      "loss": 4.5944,
+      "step": 562500
+    },
+    {
+      "epoch": 2.9992115749323447,
+      "grad_norm": 1.145351529121399,
+      "learning_rate": 7.188110900244004e-07,
+      "loss": 4.5939,
+      "step": 563000
+    },
+    {
+      "epoch": 3.0,
+      "step": 563148,
+      "total_flos": 2.917985780733604e+17,
+      "train_loss": 4.746018495113768,
+      "train_runtime": 39559.2785,
+      "train_samples_per_second": 911.074,
+      "train_steps_per_second": 14.236
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 563148,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.917985780733604e+17,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}