diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json"
--- "a/last-checkpoint/trainer_state.json"
+++ "b/last-checkpoint/trainer_state.json"
@@ -1,3571 +1,7039 @@
 {
-  "best_metric": 12.256741523742676,
-  "best_model_checkpoint": "miner_id_24/checkpoint-500",
-  "epoch": 0.0818732601932209,
-  "eval_steps": 100,
-  "global_step": 500,
+  "best_metric": 12.25197696685791,
+  "best_model_checkpoint": "miner_id_24/checkpoint-1000",
+  "epoch": 0.1637465203864418,
+  "eval_steps": 1000,
+  "global_step": 1000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0001637465203864418,
-      "grad_norm": 0.04208580031991005,
-      "learning_rate": 6.666666649834951e-06,
+      "grad_norm": 0.04071168601512909,
+      "learning_rate": 6.666666666666667e-07,
       "loss": 12.4513,
       "step": 1
     },
     {
       "epoch": 0.0001637465203864418,
-      "eval_loss": 12.441868782043457,
-      "eval_runtime": 7.3093,
-      "eval_samples_per_second": 33.656,
-      "eval_steps_per_second": 16.828,
+      "eval_loss": 12.441837310791016,
+      "eval_runtime": 7.2884,
+      "eval_samples_per_second": 33.752,
+      "eval_steps_per_second": 16.876,
       "step": 1
     },
     {
       "epoch": 0.0003274930407728836,
-      "grad_norm": 0.04489905759692192,
-      "learning_rate": 1.3333333299669903e-05,
+      "grad_norm": 0.042674656957387924,
+      "learning_rate": 1.3333333333333334e-06,
       "loss": 12.449,
       "step": 2
     },
     {
       "epoch": 0.0004912395611593254,
-      "grad_norm": 0.04625147953629494,
-      "learning_rate": 1.9999999494757503e-05,
-      "loss": 12.4494,
+      "grad_norm": 0.04358048737049103,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 12.4493,
       "step": 3
     },
     {
       "epoch": 0.0006549860815457672,
-      "grad_norm": 0.043086178600788116,
-      "learning_rate": 2.6666666599339806e-05,
+      "grad_norm": 0.04197787120938301,
+      "learning_rate": 2.666666666666667e-06,
       "loss": 12.4376,
       "step": 4
     },
     {
       "epoch": 0.0008187326019322089,
-      "grad_norm": 0.04618668928742409,
-      "learning_rate": 3.333333370392211e-05,
+      "grad_norm": 0.04088578745722771,
+      "learning_rate": 3.3333333333333333e-06,
       "loss": 12.4541,
       "step": 5
     },
     {
       "epoch": 0.0009824791223186507,
-      "grad_norm": 0.03436649218201637,
-      "learning_rate": 3.9999998989515007e-05,
+      "grad_norm": 0.03298024460673332,
+      "learning_rate": 4.000000000000001e-06,
       "loss": 12.4441,
       "step": 6
     },
     {
       "epoch": 0.0011462256427050926,
-      "grad_norm": 0.032714761793613434,
-      "learning_rate": 4.6666664275107905e-05,
-      "loss": 12.4419,
+      "grad_norm": 0.03118324838578701,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 12.442,
       "step": 7
     },
     {
       "epoch": 0.0013099721630915344,
-      "grad_norm": 0.051821667701005936,
-      "learning_rate": 5.333333319867961e-05,
-      "loss": 12.4438,
+      "grad_norm": 0.04715936258435249,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 12.4439,
       "step": 8
     },
     {
       "epoch": 0.001473718683477976,
-      "grad_norm": 0.038592468947172165,
-      "learning_rate": 6.000000212225132e-05,
-      "loss": 12.4529,
+      "grad_norm": 0.034939300268888474,
+      "learning_rate": 6e-06,
+      "loss": 12.453,
       "step": 9
     },
     {
       "epoch": 0.0016374652038644178,
-      "grad_norm": 0.03184358403086662,
-      "learning_rate": 6.666666740784422e-05,
-      "loss": 12.4582,
+      "grad_norm": 0.028914228081703186,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 12.4583,
       "step": 10
     },
     {
       "epoch": 0.0018012117242508596,
-      "grad_norm": 0.036762818694114685,
-      "learning_rate": 7.333333633141592e-05,
-      "loss": 12.438,
+      "grad_norm": 0.03148627653717995,
+      "learning_rate": 7.333333333333334e-06,
+      "loss": 12.4382,
       "step": 11
     },
     {
       "epoch": 0.0019649582446373015,
-      "grad_norm": 0.04558774083852768,
-      "learning_rate": 7.999999797903001e-05,
-      "loss": 12.4414,
+      "grad_norm": 0.03997644782066345,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 12.4416,
       "step": 12
     },
     {
       "epoch": 0.002128704765023743,
-      "grad_norm": 0.057379335165023804,
-      "learning_rate": 8.666666690260172e-05,
-      "loss": 12.4501,
+      "grad_norm": 0.05061323940753937,
+      "learning_rate": 8.666666666666668e-06,
+      "loss": 12.4505,
       "step": 13
     },
     {
       "epoch": 0.002292451285410185,
-      "grad_norm": 0.04371897876262665,
-      "learning_rate": 9.333332855021581e-05,
-      "loss": 12.4428,
+      "grad_norm": 0.03674255311489105,
+      "learning_rate": 9.333333333333334e-06,
+      "loss": 12.4432,
       "step": 14
     },
     {
       "epoch": 0.0024561978057966267,
-      "grad_norm": 0.048772815614938736,
-      "learning_rate": 9.999999747378752e-05,
-      "loss": 12.4444,
+      "grad_norm": 0.040007635951042175,
+      "learning_rate": 1e-05,
+      "loss": 12.4448,
       "step": 15
     },
     {
       "epoch": 0.0026199443261830688,
-      "grad_norm": 0.0438830703496933,
-      "learning_rate": 0.00010666666639735922,
-      "loss": 12.4454,
+      "grad_norm": 0.035397134721279144,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 12.4459,
       "step": 16
     },
     {
       "epoch": 0.0027836908465695104,
-      "grad_norm": 0.04073454812169075,
-      "learning_rate": 0.00011333332804497331,
-      "loss": 12.4374,
+      "grad_norm": 0.03148770332336426,
+      "learning_rate": 1.1333333333333334e-05,
+      "loss": 12.438,
       "step": 17
     },
     {
       "epoch": 0.002947437366955952,
-      "grad_norm": 0.05522220581769943,
-      "learning_rate": 0.00012000000424450263,
-      "loss": 12.4235,
+      "grad_norm": 0.03974079713225365,
+      "learning_rate": 1.2e-05,
+      "loss": 12.4245,
       "step": 18
     },
     {
       "epoch": 0.003111183887342394,
-      "grad_norm": 0.056581761687994,
-      "learning_rate": 0.00012666666589211673,
-      "loss": 12.4398,
+      "grad_norm": 0.04220546409487724,
+      "learning_rate": 1.2666666666666668e-05,
+      "loss": 12.4408,
       "step": 19
     },
     {
       "epoch": 0.0032749304077288356,
-      "grad_norm": 0.054074015468358994,
-      "learning_rate": 0.00013333333481568843,
-      "loss": 12.4549,
+      "grad_norm": 0.037504322826862335,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 12.456,
       "step": 20
     },
     {
       "epoch": 0.0034386769281152777,
-      "grad_norm": 0.06291481107473373,
-      "learning_rate": 0.0001399999891873449,
-      "loss": 12.444,
+      "grad_norm": 0.03901657089591026,
+      "learning_rate": 1.4000000000000001e-05,
+      "loss": 12.4453,
       "step": 21
     },
     {
       "epoch": 0.0036024234485017193,
-      "grad_norm": 0.05257013067603111,
-      "learning_rate": 0.00014666667266283184,
-      "loss": 12.4552,
+      "grad_norm": 0.03178351745009422,
+      "learning_rate": 1.4666666666666668e-05,
+      "loss": 12.4564,
       "step": 22
     },
     {
       "epoch": 0.0037661699688881613,
-      "grad_norm": 0.05741506442427635,
-      "learning_rate": 0.00015333332703448832,
-      "loss": 12.4378,
+      "grad_norm": 0.033598098903894424,
+      "learning_rate": 1.5333333333333334e-05,
+      "loss": 12.4394,
       "step": 23
     },
     {
       "epoch": 0.003929916489274603,
-      "grad_norm": 0.07213739305734634,
-      "learning_rate": 0.00015999999595806003,
-      "loss": 12.439,
+      "grad_norm": 0.039266929030418396,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 12.4412,
       "step": 24
     },
     {
       "epoch": 0.0040936630096610445,
-      "grad_norm": 0.06334943324327469,
-      "learning_rate": 0.00016666666488163173,
-      "loss": 12.4353,
+      "grad_norm": 0.03318433836102486,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 12.4374,
       "step": 25
     },
     {
       "epoch": 0.004257409530047486,
-      "grad_norm": 0.08387371152639389,
-      "learning_rate": 0.00017333333380520344,
-      "loss": 12.44,
+      "grad_norm": 0.041068192571401596,
+      "learning_rate": 1.7333333333333336e-05,
+      "loss": 12.4429,
       "step": 26
     },
     {
       "epoch": 0.004421156050433929,
-      "grad_norm": 0.08301780372858047,
-      "learning_rate": 0.00017999998817685992,
-      "loss": 12.4327,
+      "grad_norm": 0.038254860788583755,
+      "learning_rate": 1.8e-05,
+      "loss": 12.4358,
       "step": 27
     },
     {
       "epoch": 0.00458490257082037,
-      "grad_norm": 0.09270280599594116,
-      "learning_rate": 0.00018666665710043162,
-      "loss": 12.4285,
+      "grad_norm": 0.0400465689599514,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 12.4324,
       "step": 28
     },
     {
       "epoch": 0.004748649091206812,
-      "grad_norm": 0.10819040238857269,
-      "learning_rate": 0.00019333332602400333,
-      "loss": 12.4291,
+      "grad_norm": 0.04147111251950264,
+      "learning_rate": 1.9333333333333333e-05,
+      "loss": 12.4338,
       "step": 29
     },
     {
       "epoch": 0.004912395611593253,
-      "grad_norm": 0.10528005659580231,
-      "learning_rate": 0.00019999999494757503,
-      "loss": 12.4423,
+      "grad_norm": 0.03947514668107033,
+      "learning_rate": 2e-05,
+      "loss": 12.4472,
       "step": 30
     },
     {
       "epoch": 0.005076142131979695,
-      "grad_norm": 0.13457149267196655,
-      "learning_rate": 0.00019999999494757503,
-      "loss": 12.4323,
+      "grad_norm": 0.04782761260867119,
+      "learning_rate": 2.0666666666666666e-05,
+      "loss": 12.4389,
       "step": 31
     },
     {
       "epoch": 0.0052398886523661375,
-      "grad_norm": 0.13698327541351318,
-      "learning_rate": 0.00019999999494757503,
-      "loss": 12.4182,
+      "grad_norm": 0.044334013015031815,
+      "learning_rate": 2.1333333333333335e-05,
+      "loss": 12.4259,
       "step": 32
     },
     {
       "epoch": 0.005403635172752579,
-      "grad_norm": 0.13750460743904114,
-      "learning_rate": 0.0001999999803956598,
-      "loss": 12.4365,
+      "grad_norm": 0.03857624903321266,
+      "learning_rate": 2.2000000000000003e-05,
+      "loss": 12.4445,
       "step": 33
     },
     {
       "epoch": 0.005567381693139021,
-      "grad_norm": 0.1702355593442917,
-      "learning_rate": 0.00019999996584374458,
-      "loss": 12.4517,
+      "grad_norm": 0.05013938993215561,
+      "learning_rate": 2.2666666666666668e-05,
+      "loss": 12.4622,
       "step": 34
     },
     {
       "epoch": 0.005731128213525462,
-      "grad_norm": 0.14421802759170532,
-      "learning_rate": 0.00019999996584374458,
-      "loss": 12.4439,
+      "grad_norm": 0.03651641681790352,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 12.453,
       "step": 35
     },
     {
       "epoch": 0.005894874733911904,
-      "grad_norm": 0.17617714405059814,
-      "learning_rate": 0.00019999995129182935,
-      "loss": 12.4248,
+      "grad_norm": 0.04444118216633797,
+      "learning_rate": 2.4e-05,
+      "loss": 12.437,
       "step": 36
     },
     {
       "epoch": 0.006058621254298346,
-      "grad_norm": 0.1948043555021286,
-      "learning_rate": 0.0001999999221879989,
-      "loss": 12.4319,
+      "grad_norm": 0.0415070615708828,
+      "learning_rate": 2.466666666666667e-05,
+      "loss": 12.4461,
       "step": 37
     },
     {
       "epoch": 0.006222367774684788,
-      "grad_norm": 0.2024512141942978,
-      "learning_rate": 0.00019999989308416843,
-      "loss": 12.4267,
+      "grad_norm": 0.04257901757955551,
+      "learning_rate": 2.5333333333333337e-05,
+      "loss": 12.4423,
       "step": 38
     },
     {
       "epoch": 0.00638611429507123,
-      "grad_norm": 0.21583795547485352,
-      "learning_rate": 0.0001999998785322532,
-      "loss": 12.4148,
+      "grad_norm": 0.04246218129992485,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 12.4324,
       "step": 39
     },
     {
       "epoch": 0.006549860815457671,
-      "grad_norm": 0.212106391787529,
-      "learning_rate": 0.00019999984942842275,
-      "loss": 12.4077,
+      "grad_norm": 0.0474301353096962,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 12.4268,
       "step": 40
     },
     {
       "epoch": 0.006713607335844114,
-      "grad_norm": 0.25522929430007935,
-      "learning_rate": 0.0001999998203245923,
-      "loss": 12.4199,
+      "grad_norm": 0.04999005049467087,
+      "learning_rate": 2.733333333333333e-05,
+      "loss": 12.4445,
       "step": 41
     },
     {
       "epoch": 0.006877353856230555,
-      "grad_norm": 0.29123276472091675,
-      "learning_rate": 0.0001999997766688466,
-      "loss": 12.3987,
+      "grad_norm": 0.04916905239224434,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 12.4276,
       "step": 42
     },
     {
       "epoch": 0.007041100376616997,
-      "grad_norm": 0.31058281660079956,
-      "learning_rate": 0.00019999974756501615,
-      "loss": 12.4021,
+      "grad_norm": 0.05572114139795303,
+      "learning_rate": 2.8666666666666668e-05,
+      "loss": 12.4357,
       "step": 43
     },
     {
       "epoch": 0.0072048468970034385,
-      "grad_norm": 0.22949866950511932,
-      "learning_rate": 0.00019999970390927047,
-      "loss": 12.4186,
+      "grad_norm": 0.03746430575847626,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 12.4436,
       "step": 44
     },
     {
       "epoch": 0.00736859341738988,
-      "grad_norm": 0.27713820338249207,
-      "learning_rate": 0.00019999966025352478,
-      "loss": 12.4255,
+      "grad_norm": 0.03692767024040222,
+      "learning_rate": 3e-05,
+      "loss": 12.4552,
       "step": 45
     },
     {
       "epoch": 0.007532339937776323,
-      "grad_norm": 0.27647095918655396,
-      "learning_rate": 0.0001999996165977791,
-      "loss": 12.3932,
+      "grad_norm": 0.04242353141307831,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 12.4279,
       "step": 46
     },
     {
       "epoch": 0.007696086458162764,
-      "grad_norm": 0.3024648427963257,
-      "learning_rate": 0.0001999995729420334,
-      "loss": 12.3947,
+      "grad_norm": 0.05116885527968407,
+      "learning_rate": 3.1333333333333334e-05,
+      "loss": 12.4373,
       "step": 47
     },
     {
       "epoch": 0.007859832978549206,
-      "grad_norm": 0.2551914155483246,
-      "learning_rate": 0.0001999995147343725,
-      "loss": 12.4186,
+      "grad_norm": 0.039842814207077026,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 12.4554,
       "step": 48
     },
     {
       "epoch": 0.008023579498935647,
-      "grad_norm": 0.25191551446914673,
-      "learning_rate": 0.00019999945652671158,
-      "loss": 12.4031,
+      "grad_norm": 0.0379764698445797,
+      "learning_rate": 3.266666666666667e-05,
+      "loss": 12.4418,
       "step": 49
     },
     {
       "epoch": 0.008187326019322089,
-      "grad_norm": 0.2916131913661957,
-      "learning_rate": 0.0001999994128709659,
-      "loss": 12.3902,
+      "grad_norm": 0.048301611095666885,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 12.4398,
       "step": 50
     },
     {
       "epoch": 0.00835107253970853,
-      "grad_norm": 0.3026813566684723,
-      "learning_rate": 0.00019999934011138976,
-      "loss": 12.3681,
+      "grad_norm": 0.056472256779670715,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 12.4247,
       "step": 51
     },
     {
       "epoch": 0.008514819060094972,
-      "grad_norm": 0.3356999456882477,
-      "learning_rate": 0.00019999928190372884,
-      "loss": 12.3927,
+      "grad_norm": 0.0549093633890152,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 12.4551,
       "step": 52
     },
     {
       "epoch": 0.008678565580481416,
-      "grad_norm": 0.2360207736492157,
-      "learning_rate": 0.00019999922369606793,
-      "loss": 12.351,
+      "grad_norm": 0.04894556105136871,
+      "learning_rate": 3.5333333333333336e-05,
+      "loss": 12.4026,
       "step": 53
     },
     {
       "epoch": 0.008842312100867857,
-      "grad_norm": 0.28673455119132996,
-      "learning_rate": 0.0001999991509364918,
-      "loss": 12.3748,
+      "grad_norm": 0.0566958487033844,
+      "learning_rate": 3.6e-05,
+      "loss": 12.4368,
       "step": 54
     },
     {
       "epoch": 0.009006058621254299,
-      "grad_norm": 0.308011919260025,
-      "learning_rate": 0.00019999907817691565,
-      "loss": 12.3639,
+      "grad_norm": 0.06052306294441223,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 12.4367,
       "step": 55
     },
     {
       "epoch": 0.00916980514164074,
-      "grad_norm": 0.3026260435581207,
-      "learning_rate": 0.00019999899086542428,
-      "loss": 12.3605,
+      "grad_norm": 0.05914003401994705,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 12.4328,
       "step": 56
     },
     {
       "epoch": 0.009333551662027182,
-      "grad_norm": 0.22466889023780823,
-      "learning_rate": 0.00019999891810584813,
-      "loss": 12.3757,
+      "grad_norm": 0.050145380198955536,
+      "learning_rate": 3.8e-05,
+      "loss": 12.4385,
       "step": 57
     },
     {
       "epoch": 0.009497298182413624,
-      "grad_norm": 0.29599490761756897,
-      "learning_rate": 0.000199998845346272,
-      "loss": 12.372,
+      "grad_norm": 0.07536210119724274,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 12.4527,
       "step": 58
     },
     {
       "epoch": 0.009661044702800065,
-      "grad_norm": 0.23513752222061157,
-      "learning_rate": 0.00019999875803478062,
-      "loss": 12.3508,
+      "grad_norm": 0.06089041382074356,
+      "learning_rate": 3.933333333333333e-05,
+      "loss": 12.4182,
       "step": 59
     },
     {
       "epoch": 0.009824791223186507,
-      "grad_norm": 0.2927089035511017,
-      "learning_rate": 0.00019999867072328925,
-      "loss": 12.3443,
+      "grad_norm": 0.06450305879116058,
+      "learning_rate": 4e-05,
+      "loss": 12.4279,
       "step": 60
     },
     {
       "epoch": 0.009988537743572948,
-      "grad_norm": 0.23235340416431427,
-      "learning_rate": 0.00019999858341179788,
-      "loss": 12.3343,
+      "grad_norm": 0.05715927854180336,
+      "learning_rate": 4.066666666666667e-05,
+      "loss": 12.4077,
       "step": 61
     },
     {
       "epoch": 0.01015228426395939,
-      "grad_norm": 0.2948826253414154,
-      "learning_rate": 0.00019999848154839128,
-      "loss": 12.3566,
+      "grad_norm": 0.06323696672916412,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 12.4455,
       "step": 62
     },
     {
       "epoch": 0.010316030784345833,
-      "grad_norm": 0.29048776626586914,
-      "learning_rate": 0.00019999837968498468,
-      "loss": 12.3419,
+      "grad_norm": 0.06708288937807083,
+      "learning_rate": 4.2e-05,
+      "loss": 12.4392,
       "step": 63
     },
     {
       "epoch": 0.010479777304732275,
-      "grad_norm": 0.27304956316947937,
-      "learning_rate": 0.00019999829237349331,
-      "loss": 12.3437,
+      "grad_norm": 0.08000550419092178,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 12.442,
       "step": 64
     },
     {
       "epoch": 0.010643523825118717,
-      "grad_norm": 0.2460082769393921,
-      "learning_rate": 0.0001999981759581715,
-      "loss": 12.3449,
+      "grad_norm": 0.0591249093413353,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 12.4374,
       "step": 65
     },
     {
       "epoch": 0.010807270345505158,
-      "grad_norm": 0.23197799921035767,
-      "learning_rate": 0.00019999808864668012,
-      "loss": 12.3434,
+      "grad_norm": 0.06522256880998611,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 12.4319,
       "step": 66
     },
     {
       "epoch": 0.0109710168658916,
-      "grad_norm": 0.21384094655513763,
-      "learning_rate": 0.00019999798678327352,
-      "loss": 12.3519,
+      "grad_norm": 0.07051816582679749,
+      "learning_rate": 4.466666666666667e-05,
+      "loss": 12.4403,
       "step": 67
     },
     {
       "epoch": 0.011134763386278041,
-      "grad_norm": 0.26949459314346313,
-      "learning_rate": 0.00019999785581603646,
-      "loss": 12.34,
+      "grad_norm": 0.07400926947593689,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 12.4411,
       "step": 68
     },
     {
       "epoch": 0.011298509906664483,
-      "grad_norm": 0.21784676611423492,
-      "learning_rate": 0.00019999775395262986,
-      "loss": 12.3514,
+      "grad_norm": 0.05660618841648102,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 12.4406,
       "step": 69
     },
     {
       "epoch": 0.011462256427050925,
-      "grad_norm": 0.23999345302581787,
-      "learning_rate": 0.00019999763753730804,
-      "loss": 12.3387,
+      "grad_norm": 0.08654368668794632,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 12.4398,
       "step": 70
     },
     {
       "epoch": 0.011626002947437366,
-      "grad_norm": 0.2197384089231491,
-      "learning_rate": 0.0001999975211219862,
-      "loss": 12.3307,
+      "grad_norm": 0.07927855104207993,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 12.4291,
       "step": 71
     },
     {
       "epoch": 0.011789749467823808,
-      "grad_norm": 0.24359112977981567,
-      "learning_rate": 0.00019999739015474916,
-      "loss": 12.331,
+      "grad_norm": 0.08644130080938339,
+      "learning_rate": 4.8e-05,
+      "loss": 12.4459,
       "step": 72
     },
     {
       "epoch": 0.011953495988210251,
-      "grad_norm": 0.22217059135437012,
-      "learning_rate": 0.0001999972591875121,
-      "loss": 12.3253,
+      "grad_norm": 0.07535272091627121,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 12.4458,
       "step": 73
     },
     {
       "epoch": 0.012117242508596693,
-      "grad_norm": 0.22041912376880646,
-      "learning_rate": 0.00019999712822027504,
-      "loss": 12.3309,
+      "grad_norm": 0.07552432268857956,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 12.4359,
       "step": 74
     },
     {
       "epoch": 0.012280989028983134,
-      "grad_norm": 0.2732013165950775,
-      "learning_rate": 0.000199996997253038,
-      "loss": 12.3279,
+      "grad_norm": 0.07036083936691284,
+      "learning_rate": 5e-05,
+      "loss": 12.4276,
       "step": 75
     },
     {
       "epoch": 0.012444735549369576,
-      "grad_norm": 0.1881040781736374,
-      "learning_rate": 0.00019999686628580093,
-      "loss": 12.3237,
+      "grad_norm": 0.09128473699092865,
+      "learning_rate": 5.0666666666666674e-05,
+      "loss": 12.4385,
       "step": 76
     },
     {
       "epoch": 0.012608482069756018,
-      "grad_norm": 0.14977961778640747,
-      "learning_rate": 0.00019999673531856388,
-      "loss": 12.3332,
+      "grad_norm": 0.08544669300317764,
+      "learning_rate": 5.133333333333333e-05,
+      "loss": 12.4398,
       "step": 77
     },
     {
       "epoch": 0.01277222859014246,
-      "grad_norm": 0.18099509179592133,
-      "learning_rate": 0.00019999660435132682,
-      "loss": 12.321,
+      "grad_norm": 0.09351212531328201,
+      "learning_rate": 5.2000000000000004e-05,
+      "loss": 12.4355,
       "step": 78
     },
     {
       "epoch": 0.0129359751105289,
-      "grad_norm": 0.2314033955335617,
-      "learning_rate": 0.00019999645883217454,
-      "loss": 12.3326,
+      "grad_norm": 0.09390752017498016,
+      "learning_rate": 5.266666666666666e-05,
+      "loss": 12.4411,
       "step": 79
     },
     {
       "epoch": 0.013099721630915342,
-      "grad_norm": 0.18064385652542114,
-      "learning_rate": 0.00019999631331302226,
-      "loss": 12.3228,
+      "grad_norm": 0.10644028335809708,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 12.4441,
       "step": 80
     },
     {
       "epoch": 0.013263468151301784,
-      "grad_norm": 0.35007038712501526,
-      "learning_rate": 0.00019999615324195474,
-      "loss": 12.3499,
+      "grad_norm": 0.0856703445315361,
+      "learning_rate": 5.4000000000000005e-05,
+      "loss": 12.4478,
       "step": 81
     },
     {
       "epoch": 0.013427214671688227,
-      "grad_norm": 0.4421752691268921,
-      "learning_rate": 0.00019999600772280246,
-      "loss": 12.3257,
+      "grad_norm": 0.09133117645978928,
+      "learning_rate": 5.466666666666666e-05,
+      "loss": 12.4175,
       "step": 82
     },
     {
       "epoch": 0.013590961192074669,
-      "grad_norm": 0.2741035223007202,
-      "learning_rate": 0.00019999584765173495,
-      "loss": 12.3004,
+      "grad_norm": 0.11033530533313751,
+      "learning_rate": 5.5333333333333334e-05,
+      "loss": 12.4325,
       "step": 83
     },
     {
       "epoch": 0.01375470771246111,
-      "grad_norm": 0.1526937633752823,
-      "learning_rate": 0.00019999568758066744,
-      "loss": 12.3203,
+      "grad_norm": 0.10031059384346008,
+      "learning_rate": 5.6000000000000006e-05,
+      "loss": 12.4367,
       "step": 84
     },
     {
       "epoch": 0.013918454232847552,
-      "grad_norm": 0.15117084980010986,
-      "learning_rate": 0.00019999554206151515,
-      "loss": 12.317,
+      "grad_norm": 0.10192529857158661,
+      "learning_rate": 5.666666666666667e-05,
+      "loss": 12.4213,
       "step": 85
     },
     {
       "epoch": 0.014082200753233994,
-      "grad_norm": 0.3051150441169739,
-      "learning_rate": 0.0001999953674385324,
-      "loss": 12.3093,
+      "grad_norm": 0.1395905762910843,
+      "learning_rate": 5.7333333333333336e-05,
+      "loss": 12.43,
       "step": 86
     },
     {
       "epoch": 0.014245947273620435,
-      "grad_norm": 0.21714192628860474,
-      "learning_rate": 0.0001999952073674649,
-      "loss": 12.2923,
+      "grad_norm": 0.12898190319538116,
+      "learning_rate": 5.8e-05,
+      "loss": 12.4351,
       "step": 87
     },
     {
       "epoch": 0.014409693794006877,
-      "grad_norm": 0.29008936882019043,
-      "learning_rate": 0.00019999503274448216,
-      "loss": 12.3135,
+      "grad_norm": 0.125532329082489,
+      "learning_rate": 5.866666666666667e-05,
+      "loss": 12.4293,
       "step": 88
     },
     {
       "epoch": 0.014573440314393319,
-      "grad_norm": 0.15054282546043396,
-      "learning_rate": 0.00019999485812149942,
-      "loss": 12.337,
+      "grad_norm": 0.10696202516555786,
+      "learning_rate": 5.9333333333333343e-05,
+      "loss": 12.4444,
       "step": 89
     },
     {
       "epoch": 0.01473718683477976,
-      "grad_norm": 0.29189133644104004,
-      "learning_rate": 0.00019999468349851668,
-      "loss": 12.3169,
+      "grad_norm": 0.1154562458395958,
+      "learning_rate": 6e-05,
+      "loss": 12.4223,
       "step": 90
     },
     {
       "epoch": 0.014900933355166202,
-      "grad_norm": 0.16853569447994232,
-      "learning_rate": 0.00019999450887553394,
-      "loss": 12.3087,
+      "grad_norm": 0.13437619805335999,
+      "learning_rate": 6.066666666666667e-05,
+      "loss": 12.4324,
       "step": 91
     },
     {
       "epoch": 0.015064679875552645,
-      "grad_norm": 0.24085475504398346,
-      "learning_rate": 0.00019999431970063597,
-      "loss": 12.3309,
+      "grad_norm": 0.13228319585323334,
+      "learning_rate": 6.133333333333334e-05,
+      "loss": 12.4311,
       "step": 92
     },
     {
       "epoch": 0.015228426395939087,
-      "grad_norm": 0.14460758864879608,
-      "learning_rate": 0.00019999414507765323,
-      "loss": 12.3043,
+      "grad_norm": 0.14434094727039337,
+      "learning_rate": 6.2e-05,
+      "loss": 12.4221,
       "step": 93
     },
     {
       "epoch": 0.015392172916325528,
-      "grad_norm": 0.20345059037208557,
-      "learning_rate": 0.00019999395590275526,
-      "loss": 12.3139,
+      "grad_norm": 0.1585177183151245,
+      "learning_rate": 6.266666666666667e-05,
+      "loss": 12.4432,
       "step": 94
     },
     {
       "epoch": 0.01555591943671197,
-      "grad_norm": 0.30965539813041687,
-      "learning_rate": 0.0001999937667278573,
-      "loss": 12.2988,
+      "grad_norm": 0.1559607982635498,
+      "learning_rate": 6.333333333333333e-05,
+      "loss": 12.4328,
       "step": 95
     },
     {
       "epoch": 0.01571966595709841,
-      "grad_norm": 0.28346818685531616,
-      "learning_rate": 0.0001999935630010441,
-      "loss": 12.3345,
+      "grad_norm": 0.132478266954422,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 12.4274,
       "step": 96
     },
     {
       "epoch": 0.015883412477484853,
-      "grad_norm": 0.1707168072462082,
-      "learning_rate": 0.00019999337382614613,
-      "loss": 12.3099,
+      "grad_norm": 0.17377401888370514,
+      "learning_rate": 6.466666666666666e-05,
+      "loss": 12.4283,
       "step": 97
     },
     {
       "epoch": 0.016047158997871295,
-      "grad_norm": 0.20409435033798218,
-      "learning_rate": 0.00019999317009933293,
-      "loss": 12.338,
+      "grad_norm": 0.13973486423492432,
+      "learning_rate": 6.533333333333334e-05,
+      "loss": 12.4373,
       "step": 98
     },
     {
       "epoch": 0.016210905518257736,
-      "grad_norm": 0.35684144496917725,
-      "learning_rate": 0.00019999296637251973,
-      "loss": 12.2916,
+      "grad_norm": 0.17785918712615967,
+      "learning_rate": 6.6e-05,
+      "loss": 12.4219,
       "step": 99
     },
     {
       "epoch": 0.016374652038644178,
-      "grad_norm": 0.24036715924739838,
-      "learning_rate": 0.00019999277719762176,
-      "loss": 12.3097,
-      "step": 100
-    },
-    {
-      "epoch": 0.016374652038644178,
-      "eval_loss": 12.311452865600586,
-      "eval_runtime": 7.4253,
-      "eval_samples_per_second": 33.13,
-      "eval_steps_per_second": 16.565,
+      "grad_norm": 0.16729354858398438,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 12.4167,
       "step": 100
     },
     {
       "epoch": 0.01653839855903062,
-      "grad_norm": 0.14290913939476013,
-      "learning_rate": 0.00019999255891889334,
-      "loss": 12.3244,
+      "grad_norm": 0.16612979769706726,
+      "learning_rate": 6.733333333333333e-05,
+      "loss": 12.4189,
       "step": 101
     },
     {
       "epoch": 0.01670214507941706,
-      "grad_norm": 0.3147849440574646,
-      "learning_rate": 0.00019999235519208014,
-      "loss": 12.3063,
+      "grad_norm": 0.17767584323883057,
+      "learning_rate": 6.800000000000001e-05,
+      "loss": 12.43,
       "step": 102
     },
     {
       "epoch": 0.016865891599803503,
-      "grad_norm": 0.20907671749591827,
-      "learning_rate": 0.00019999213691335171,
-      "loss": 12.2983,
+      "grad_norm": 0.20716869831085205,
+      "learning_rate": 6.866666666666666e-05,
+      "loss": 12.4229,
       "step": 103
     },
     {
       "epoch": 0.017029638120189945,
-      "grad_norm": 0.12480394542217255,
-      "learning_rate": 0.0001999919186346233,
-      "loss": 12.3012,
+      "grad_norm": 0.21526587009429932,
+      "learning_rate": 6.933333333333334e-05,
+      "loss": 12.4282,
       "step": 104
     },
     {
       "epoch": 0.017193384640576386,
-      "grad_norm": 0.18896125257015228,
-      "learning_rate": 0.00019999170035589486,
-      "loss": 12.3152,
+      "grad_norm": 0.19571231305599213,
+      "learning_rate": 7e-05,
+      "loss": 12.4137,
       "step": 105
     },
     {
       "epoch": 0.01735713116096283,
-      "grad_norm": 0.34390348196029663,
-      "learning_rate": 0.0001999914675252512,
-      "loss": 12.3272,
+      "grad_norm": 0.14979496598243713,
+      "learning_rate": 7.066666666666667e-05,
+      "loss": 12.4119,
       "step": 106
     },
     {
       "epoch": 0.017520877681349273,
-      "grad_norm": 0.22107726335525513,
-      "learning_rate": 0.00019999124924652278,
-      "loss": 12.3076,
+      "grad_norm": 0.21473410725593567,
+      "learning_rate": 7.133333333333334e-05,
+      "loss": 12.4146,
       "step": 107
     },
     {
       "epoch": 0.017684624201735714,
-      "grad_norm": 0.22771164774894714,
-      "learning_rate": 0.00019999101641587913,
-      "loss": 12.3141,
+      "grad_norm": 0.19255532324314117,
+      "learning_rate": 7.2e-05,
+      "loss": 12.4214,
       "step": 108
     },
     {
       "epoch": 0.017848370722122156,
-      "grad_norm": 0.23079240322113037,
-      "learning_rate": 0.0001999907981371507,
-      "loss": 12.3073,
+      "grad_norm": 0.23225137591362,
+      "learning_rate": 7.266666666666667e-05,
+      "loss": 12.4152,
       "step": 109
     },
     {
       "epoch": 0.018012117242508598,
-      "grad_norm": 0.25316718220710754,
-      "learning_rate": 0.00019999055075459182,
-      "loss": 12.31,
+      "grad_norm": 0.2821487486362457,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 12.4242,
       "step": 110
     },
     {
       "epoch": 0.01817586376289504,
-      "grad_norm": 0.16608233749866486,
-      "learning_rate": 0.00019999031792394817,
-      "loss": 12.297,
+      "grad_norm": 0.2615848183631897,
+      "learning_rate": 7.4e-05,
+      "loss": 12.4051,
       "step": 111
     },
     {
       "epoch": 0.01833961028328148,
-      "grad_norm": 0.15613137185573578,
-      "learning_rate": 0.0001999900705413893,
-      "loss": 12.2899,
+      "grad_norm": 0.2540196478366852,
+      "learning_rate": 7.466666666666667e-05,
+      "loss": 12.4187,
       "step": 112
     },
     {
       "epoch": 0.018503356803667922,
-      "grad_norm": 0.4086350202560425,
-      "learning_rate": 0.00019998983771074563,
-      "loss": 12.3246,
+      "grad_norm": 0.20726901292800903,
+      "learning_rate": 7.533333333333334e-05,
+      "loss": 12.4194,
       "step": 113
     },
     {
       "epoch": 0.018667103324054364,
-      "grad_norm": 0.22173292934894562,
-      "learning_rate": 0.00019998959032818675,
-      "loss": 12.3183,
+      "grad_norm": 0.2292231023311615,
+      "learning_rate": 7.6e-05,
+      "loss": 12.4186,
       "step": 114
     },
     {
       "epoch": 0.018830849844440806,
-      "grad_norm": 0.15508519113063812,
-      "learning_rate": 0.00019998934294562787,
-      "loss": 12.3032,
+      "grad_norm": 0.24483346939086914,
+      "learning_rate": 7.666666666666667e-05,
+      "loss": 12.4136,
       "step": 115
     },
     {
       "epoch": 0.018994596364827247,
-      "grad_norm": 0.13017487525939941,
-      "learning_rate": 0.00019998908101115376,
-      "loss": 12.3025,
+      "grad_norm": 0.25352075695991516,
+      "learning_rate": 7.733333333333333e-05,
+      "loss": 12.3991,
       "step": 116
     },
     {
       "epoch": 0.01915834288521369,
-      "grad_norm": 0.18207354843616486,
-      "learning_rate": 0.00019998881907667965,
-      "loss": 12.2849,
+      "grad_norm": 0.28832969069480896,
+      "learning_rate": 7.800000000000001e-05,
+      "loss": 12.403,
       "step": 117
     },
     {
       "epoch": 0.01932208940560013,
-      "grad_norm": 0.14748769998550415,
-      "learning_rate": 0.00019998857169412076,
-      "loss": 12.2891,
+      "grad_norm": 0.28145042061805725,
+      "learning_rate": 7.866666666666666e-05,
+      "loss": 12.4107,
       "step": 118
     },
     {
       "epoch": 0.019485835925986572,
-      "grad_norm": 0.141549214720726,
-      "learning_rate": 0.00019998830975964665,
-      "loss": 12.294,
+      "grad_norm": 0.29081061482429504,
+      "learning_rate": 7.933333333333334e-05,
+      "loss": 12.3983,
       "step": 119
     },
     {
       "epoch": 0.019649582446373014,
-      "grad_norm": 0.14569465816020966,
-      "learning_rate": 0.00019998804782517254,
-      "loss": 12.304,
+      "grad_norm": 0.24011318385601044,
+      "learning_rate": 8e-05,
+      "loss": 12.3979,
       "step": 120
     },
     {
       "epoch": 0.019813328966759455,
-      "grad_norm": 0.39608556032180786,
-      "learning_rate": 0.0001999877713387832,
-      "loss": 12.3186,
+      "grad_norm": 0.2432175874710083,
+      "learning_rate": 8.066666666666667e-05,
+      "loss": 12.4146,
       "step": 121
     },
     {
       "epoch": 0.019977075487145897,
-      "grad_norm": 0.12429912388324738,
-      "learning_rate": 0.0001999875094043091,
-      "loss": 12.2826,
+      "grad_norm": 0.3161214292049408,
+      "learning_rate": 8.133333333333334e-05,
+      "loss": 12.3937,
       "step": 122
     },
     {
       "epoch": 0.02014082200753234,
-      "grad_norm": 0.22286245226860046,
-      "learning_rate": 0.00019998723291791975,
-      "loss": 12.3161,
+      "grad_norm": 0.23684079945087433,
+      "learning_rate": 8.2e-05,
+      "loss": 12.4068,
       "step": 123
     },
     {
       "epoch": 0.02030456852791878,
-      "grad_norm": 0.4403093755245209,
-      "learning_rate": 0.00019998697098344564,
-      "loss": 12.331,
+      "grad_norm": 0.20811216533184052,
+      "learning_rate": 8.266666666666667e-05,
+      "loss": 12.4116,
       "step": 124
     },
     {
       "epoch": 0.020468315048305225,
-      "grad_norm": 0.21915563941001892,
-      "learning_rate": 0.00019998667994514108,
-      "loss": 12.2953,
+      "grad_norm": 0.25938355922698975,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 12.3906,
       "step": 125
     },
     {
       "epoch": 0.020632061568691667,
-      "grad_norm": 0.13325762748718262,
-      "learning_rate": 0.00019998640345875174,
-      "loss": 12.3046,
+      "grad_norm": 0.26399001479148865,
+      "learning_rate": 8.4e-05,
+      "loss": 12.3827,
       "step": 126
     },
     {
       "epoch": 0.02079580808907811,
-      "grad_norm": 0.1221589520573616,
-      "learning_rate": 0.00019998611242044717,
-      "loss": 12.3158,
+      "grad_norm": 0.2413434386253357,
+      "learning_rate": 8.466666666666667e-05,
+      "loss": 12.4046,
       "step": 127
     },
     {
       "epoch": 0.02095955460946455,
-      "grad_norm": 0.2719859480857849,
-      "learning_rate": 0.00019998583593405783,
-      "loss": 12.3273,
+      "grad_norm": 0.21343113481998444,
+      "learning_rate": 8.533333333333334e-05,
+      "loss": 12.3997,
       "step": 128
     },
     {
       "epoch": 0.02112330112985099,
-      "grad_norm": 0.14147429168224335,
-      "learning_rate": 0.00019998553034383804,
-      "loss": 12.295,
+      "grad_norm": 0.2558322548866272,
+      "learning_rate": 8.6e-05,
+      "loss": 12.3871,
       "step": 129
     },
     {
       "epoch": 0.021287047650237433,
-      "grad_norm": 0.3306241035461426,
-      "learning_rate": 0.0001999852538574487,
-      "loss": 12.3274,
+      "grad_norm": 0.2511197626590729,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 12.4011,
       "step": 130
     },
     {
       "epoch": 0.021450794170623875,
-      "grad_norm": 0.1614050716161728,
-      "learning_rate": 0.0001999849482672289,
-      "loss": 12.3168,
+      "grad_norm": 0.2675906717777252,
+      "learning_rate": 8.733333333333333e-05,
+      "loss": 12.4074,
       "step": 131
     },
     {
       "epoch": 0.021614540691010316,
-      "grad_norm": 0.2571965754032135,
-      "learning_rate": 0.00019998465722892433,
-      "loss": 12.3033,
+      "grad_norm": 0.25766870379447937,
+      "learning_rate": 8.800000000000001e-05,
+      "loss": 12.3809,
       "step": 132
     },
     {
       "epoch": 0.021778287211396758,
-      "grad_norm": 0.3647140562534332,
-      "learning_rate": 0.0001999843370867893,
-      "loss": 12.3138,
+      "grad_norm": 0.25887688994407654,
+      "learning_rate": 8.866666666666668e-05,
+      "loss": 12.3937,
       "step": 133
     },
     {
       "epoch": 0.0219420337317832,
-      "grad_norm": 0.22893981635570526,
-      "learning_rate": 0.00019998404604848474,
-      "loss": 12.3124,
+      "grad_norm": 0.2538568079471588,
+      "learning_rate": 8.933333333333334e-05,
+      "loss": 12.3999,
       "step": 134
     },
     {
       "epoch": 0.02210578025216964,
-      "grad_norm": 0.18172000348567963,
-      "learning_rate": 0.00019998374045826495,
-      "loss": 12.2969,
+      "grad_norm": 0.22949941456317902,
+      "learning_rate": 9e-05,
+      "loss": 12.3719,
       "step": 135
     },
     {
       "epoch": 0.022269526772556083,
-      "grad_norm": 0.2715581953525543,
-      "learning_rate": 0.00019998342031612992,
-      "loss": 12.2789,
+      "grad_norm": 0.2809057831764221,
+      "learning_rate": 9.066666666666667e-05,
+      "loss": 12.3547,
       "step": 136
     },
     {
       "epoch": 0.022433273292942525,
-      "grad_norm": 0.2520585358142853,
-      "learning_rate": 0.0001999831001739949,
-      "loss": 12.2753,
+      "grad_norm": 0.29336753487586975,
+      "learning_rate": 9.133333333333334e-05,
+      "loss": 12.3573,
       "step": 137
     },
     {
       "epoch": 0.022597019813328966,
-      "grad_norm": 0.20995964109897614,
-      "learning_rate": 0.0001999827945837751,
-      "loss": 12.2933,
+      "grad_norm": 0.26571160554885864,
+      "learning_rate": 9.200000000000001e-05,
+      "loss": 12.3756,
       "step": 138
     },
     {
       "epoch": 0.022760766333715408,
-      "grad_norm": 0.18961293995380402,
-      "learning_rate": 0.00019998247444164008,
-      "loss": 12.2929,
+      "grad_norm": 0.27329036593437195,
+      "learning_rate": 9.266666666666666e-05,
+      "loss": 12.3841,
       "step": 139
     },
     {
       "epoch": 0.02292451285410185,
-      "grad_norm": 0.15688307583332062,
-      "learning_rate": 0.00019998215429950505,
-      "loss": 12.292,
+      "grad_norm": 0.2753359079360962,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 12.3678,
       "step": 140
     },
     {
       "epoch": 0.02308825937448829,
-      "grad_norm": 0.09857089072465897,
-      "learning_rate": 0.00019998183415737003,
-      "loss": 12.2883,
+      "grad_norm": 0.26319000124931335,
+      "learning_rate": 9.4e-05,
+      "loss": 12.3615,
       "step": 141
     },
     {
       "epoch": 0.023252005894874733,
-      "grad_norm": 0.2050275206565857,
-      "learning_rate": 0.00019998149946331978,
-      "loss": 12.2957,
+      "grad_norm": 0.23538266122341156,
+      "learning_rate": 9.466666666666667e-05,
+      "loss": 12.3522,
       "step": 142
     },
     {
       "epoch": 0.023415752415261174,
-      "grad_norm": 0.43624162673950195,
-      "learning_rate": 0.00019998116476926953,
-      "loss": 12.3091,
+      "grad_norm": 0.21864941716194153,
+      "learning_rate": 9.533333333333334e-05,
+      "loss": 12.362,
       "step": 143
     },
     {
       "epoch": 0.023579498935647616,
-      "grad_norm": 0.1448792666196823,
-      "learning_rate": 0.00019998083007521927,
-      "loss": 12.2973,
+      "grad_norm": 0.2649800479412079,
+      "learning_rate": 9.6e-05,
+      "loss": 12.3644,
       "step": 144
     },
     {
       "epoch": 0.02374324545603406,
-      "grad_norm": 0.1774253100156784,
-      "learning_rate": 0.00019998049538116902,
-      "loss": 12.3051,
+      "grad_norm": 0.254098504781723,
+      "learning_rate": 9.666666666666667e-05,
+      "loss": 12.3645,
       "step": 145
     },
     {
       "epoch": 0.023906991976420502,
-      "grad_norm": 0.13856734335422516,
-      "learning_rate": 0.00019998014613520354,
-      "loss": 12.2838,
+      "grad_norm": 0.26626890897750854,
+      "learning_rate": 9.733333333333335e-05,
+      "loss": 12.3378,
       "step": 146
     },
     {
       "epoch": 0.024070738496806944,
-      "grad_norm": 0.14116550981998444,
-      "learning_rate": 0.00019997979688923806,
-      "loss": 12.2834,
+      "grad_norm": 0.266643762588501,
+      "learning_rate": 9.8e-05,
+      "loss": 12.3436,
       "step": 147
     },
     {
       "epoch": 0.024234485017193386,
-      "grad_norm": 0.13662639260292053,
-      "learning_rate": 0.0001999794621951878,
-      "loss": 12.2885,
+      "grad_norm": 0.24790844321250916,
+      "learning_rate": 9.866666666666668e-05,
+      "loss": 12.3534,
       "step": 148
     },
     {
       "epoch": 0.024398231537579827,
-      "grad_norm": 0.0898972824215889,
-      "learning_rate": 0.00019997911294922233,
-      "loss": 12.2985,
+      "grad_norm": 0.24089303612709045,
+      "learning_rate": 9.933333333333334e-05,
+      "loss": 12.3588,
       "step": 149
     },
     {
       "epoch": 0.02456197805796627,
-      "grad_norm": 0.18935908377170563,
-      "learning_rate": 0.00019997874915134162,
-      "loss": 12.3198,
+      "grad_norm": 0.20637933909893036,
+      "learning_rate": 0.0001,
+      "loss": 12.3666,
       "step": 150
     },
     {
       "epoch": 0.02472572457835271,
-      "grad_norm": 0.14787761867046356,
-      "learning_rate": 0.00019997839990537614,
-      "loss": 12.3134,
+      "grad_norm": 0.18257121741771698,
+      "learning_rate": 0.00010066666666666667,
+      "loss": 12.3432,
       "step": 151
     },
     {
       "epoch": 0.024889471098739152,
-      "grad_norm": 0.1418771892786026,
-      "learning_rate": 0.00019997803610749543,
-      "loss": 12.292,
+      "grad_norm": 0.24397976696491241,
+      "learning_rate": 0.00010133333333333335,
+      "loss": 12.3509,
       "step": 152
     },
     {
       "epoch": 0.025053217619125594,
-      "grad_norm": 0.11932875216007233,
-      "learning_rate": 0.00019997767230961472,
-      "loss": 12.2779,
+      "grad_norm": 0.19030074775218964,
+      "learning_rate": 0.00010200000000000001,
+      "loss": 12.3322,
       "step": 153
     },
     {
       "epoch": 0.025216964139512035,
-      "grad_norm": 0.22217413783073425,
-      "learning_rate": 0.00019997732306364924,
-      "loss": 12.3222,
+      "grad_norm": 0.25341999530792236,
+      "learning_rate": 0.00010266666666666666,
+      "loss": 12.3715,
       "step": 154
     },
     {
       "epoch": 0.025380710659898477,
-      "grad_norm": 0.17280636727809906,
-      "learning_rate": 0.0001999769447138533,
-      "loss": 12.3,
+      "grad_norm": 0.1844940036535263,
+      "learning_rate": 0.00010333333333333334,
+      "loss": 12.3434,
       "step": 155
     },
     {
       "epoch": 0.02554445718028492,
-      "grad_norm": 0.11617791652679443,
-      "learning_rate": 0.0001999765809159726,
-      "loss": 12.2993,
+      "grad_norm": 0.1969732791185379,
+      "learning_rate": 0.00010400000000000001,
+      "loss": 12.3491,
       "step": 156
     },
     {
       "epoch": 0.02570820370067136,
-      "grad_norm": 0.21635393798351288,
-      "learning_rate": 0.00019997620256617665,
-      "loss": 12.2918,
+      "grad_norm": 0.21235470473766327,
+      "learning_rate": 0.00010466666666666667,
+      "loss": 12.3378,
       "step": 157
     },
     {
       "epoch": 0.0258719502210578,
-      "grad_norm": 0.11997334659099579,
-      "learning_rate": 0.00019997582421638072,
-      "loss": 12.2956,
+      "grad_norm": 0.21899423003196716,
+      "learning_rate": 0.00010533333333333332,
+      "loss": 12.346,
       "step": 158
     },
     {
       "epoch": 0.026035696741444243,
-      "grad_norm": 0.19266721606254578,
-      "learning_rate": 0.00019997544586658478,
-      "loss": 12.3138,
+      "grad_norm": 0.19178782403469086,
+      "learning_rate": 0.00010600000000000002,
+      "loss": 12.3623,
       "step": 159
     },
     {
       "epoch": 0.026199443261830685,
-      "grad_norm": 0.19289498031139374,
-      "learning_rate": 0.00019997506751678884,
-      "loss": 12.3125,
+      "grad_norm": 0.21276162564754486,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 12.3637,
       "step": 160
     },
     {
       "epoch": 0.026363189782217127,
-      "grad_norm": 0.25989240407943726,
-      "learning_rate": 0.0001999746891669929,
-      "loss": 12.2826,
+      "grad_norm": 0.2185026854276657,
+      "learning_rate": 0.00010733333333333333,
+      "loss": 12.3458,
       "step": 161
     },
     {
       "epoch": 0.026526936302603568,
-      "grad_norm": 0.15681873261928558,
-      "learning_rate": 0.00019997429626528174,
-      "loss": 12.2928,
+      "grad_norm": 0.1894225925207138,
+      "learning_rate": 0.00010800000000000001,
+      "loss": 12.3417,
       "step": 162
     },
     {
       "epoch": 0.02669068282299001,
-      "grad_norm": 0.19333164393901825,
-      "learning_rate": 0.00019997390336357057,
-      "loss": 12.272,
+      "grad_norm": 0.2355530709028244,
+      "learning_rate": 0.00010866666666666667,
+      "loss": 12.3253,
       "step": 163
     },
     {
       "epoch": 0.026854429343376455,
-      "grad_norm": 0.21432405710220337,
-      "learning_rate": 0.0001999735104618594,
-      "loss": 12.3191,
+      "grad_norm": 0.17933063209056854,
+      "learning_rate": 0.00010933333333333333,
+      "loss": 12.3522,
       "step": 164
     },
     {
       "epoch": 0.027018175863762896,
-      "grad_norm": 0.14616520702838898,
-      "learning_rate": 0.00019997311756014824,
-      "loss": 12.2888,
+      "grad_norm": 0.20666790008544922,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 12.325,
       "step": 165
     },
     {
       "epoch": 0.027181922384149338,
-      "grad_norm": 0.2478683441877365,
-      "learning_rate": 0.00019997272465843707,
-      "loss": 12.3073,
+      "grad_norm": 0.19132888317108154,
+      "learning_rate": 0.00011066666666666667,
+      "loss": 12.344,
       "step": 166
     },
     {
       "epoch": 0.02734566890453578,
-      "grad_norm": 0.10278715938329697,
-      "learning_rate": 0.00019997231720481068,
-      "loss": 12.2835,
+      "grad_norm": 0.16811159253120422,
+      "learning_rate": 0.00011133333333333333,
+      "loss": 12.3294,
       "step": 167
     },
     {
       "epoch": 0.02750941542492222,
-      "grad_norm": 0.15840058028697968,
-      "learning_rate": 0.00019997190975118428,
-      "loss": 12.2941,
+      "grad_norm": 0.1825079768896103,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 12.3343,
       "step": 168
     },
     {
       "epoch": 0.027673161945308663,
-      "grad_norm": 0.12881392240524292,
-      "learning_rate": 0.00019997148774564266,
-      "loss": 12.3144,
+      "grad_norm": 0.18791979551315308,
+      "learning_rate": 0.00011266666666666668,
+      "loss": 12.3431,
       "step": 169
     },
     {
       "epoch": 0.027836908465695104,
-      "grad_norm": 0.12443830817937851,
-      "learning_rate": 0.00019997108029201627,
-      "loss": 12.2907,
+      "grad_norm": 0.17354759573936462,
+      "learning_rate": 0.00011333333333333334,
+      "loss": 12.3344,
       "step": 170
     },
     {
       "epoch": 0.028000654986081546,
-      "grad_norm": 0.22190840542316437,
-      "learning_rate": 0.00019997067283838987,
-      "loss": 12.2963,
+      "grad_norm": 0.18769007921218872,
+      "learning_rate": 0.00011399999999999999,
+      "loss": 12.3321,
       "step": 171
     },
     {
       "epoch": 0.028164401506467988,
-      "grad_norm": 0.1679324209690094,
-      "learning_rate": 0.00019997025083284825,
-      "loss": 12.276,
+      "grad_norm": 0.2163514792919159,
+      "learning_rate": 0.00011466666666666667,
+      "loss": 12.323,
       "step": 172
     },
     {
       "epoch": 0.02832814802685443,
-      "grad_norm": 0.14645697176456451,
-      "learning_rate": 0.00019996982882730663,
-      "loss": 12.2968,
+      "grad_norm": 0.18447071313858032,
+      "learning_rate": 0.00011533333333333334,
+      "loss": 12.337,
       "step": 173
     },
     {
       "epoch": 0.02849189454724087,
-      "grad_norm": 0.14275029301643372,
-      "learning_rate": 0.000199969406821765,
-      "loss": 12.2698,
+      "grad_norm": 0.2503637969493866,
+      "learning_rate": 0.000116,
+      "loss": 12.3102,
       "step": 174
     },
     {
       "epoch": 0.028655641067627313,
-      "grad_norm": 0.167531818151474,
-      "learning_rate": 0.00019996898481622338,
-      "loss": 12.2998,
+      "grad_norm": 0.1710224151611328,
+      "learning_rate": 0.00011666666666666668,
+      "loss": 12.323,
       "step": 175
     },
     {
       "epoch": 0.028819387588013754,
-      "grad_norm": 0.20161862671375275,
-      "learning_rate": 0.00019996856281068176,
-      "loss": 12.3161,
+      "grad_norm": 0.18545013666152954,
+      "learning_rate": 0.00011733333333333334,
+      "loss": 12.3386,
       "step": 176
     },
     {
       "epoch": 0.028983134108400196,
-      "grad_norm": 0.1790657937526703,
-      "learning_rate": 0.00019996811170130968,
-      "loss": 12.2802,
+      "grad_norm": 0.1715422123670578,
+      "learning_rate": 0.000118,
+      "loss": 12.3175,
       "step": 177
     },
     {
       "epoch": 0.029146880628786637,
-      "grad_norm": 0.13983246684074402,
-      "learning_rate": 0.00019996768969576806,
-      "loss": 12.2937,
+      "grad_norm": 0.21719999611377716,
+      "learning_rate": 0.00011866666666666669,
+      "loss": 12.3342,
       "step": 178
     },
     {
       "epoch": 0.02931062714917308,
-      "grad_norm": 0.09113561362028122,
-      "learning_rate": 0.0001999672531383112,
-      "loss": 12.2749,
+      "grad_norm": 0.14283528923988342,
+      "learning_rate": 0.00011933333333333334,
+      "loss": 12.3114,
       "step": 179
     },
     {
       "epoch": 0.02947437366955952,
-      "grad_norm": 0.15219911932945251,
-      "learning_rate": 0.00019996680202893913,
-      "loss": 12.291,
+      "grad_norm": 0.14593714475631714,
+      "learning_rate": 0.00012,
+      "loss": 12.3237,
       "step": 180
     },
     {
       "epoch": 0.029638120189945962,
-      "grad_norm": 0.15494637191295624,
-      "learning_rate": 0.00019996636547148228,
-      "loss": 12.307,
+      "grad_norm": 0.2272740751504898,
+      "learning_rate": 0.00012066666666666668,
+      "loss": 12.3285,
       "step": 181
     },
     {
       "epoch": 0.029801866710332404,
-      "grad_norm": 0.10758625715970993,
-      "learning_rate": 0.0001999659143621102,
-      "loss": 12.3137,
+      "grad_norm": 0.1673334389925003,
+      "learning_rate": 0.00012133333333333335,
+      "loss": 12.3374,
       "step": 182
     },
     {
       "epoch": 0.02996561323071885,
-      "grad_norm": 0.11248419433832169,
-      "learning_rate": 0.00019996547780465335,
-      "loss": 12.3021,
+      "grad_norm": 0.1368860900402069,
+      "learning_rate": 0.000122,
+      "loss": 12.335,
       "step": 183
     },
     {
       "epoch": 0.03012935975110529,
-      "grad_norm": 0.262703537940979,
-      "learning_rate": 0.00019996501214336604,
-      "loss": 12.3134,
+      "grad_norm": 0.2395031452178955,
+      "learning_rate": 0.00012266666666666668,
+      "loss": 12.3387,
       "step": 184
     },
     {
       "epoch": 0.030293106271491732,
-      "grad_norm": 0.16971834003925323,
-      "learning_rate": 0.00019996456103399396,
-      "loss": 12.2921,
+      "grad_norm": 0.16532330214977264,
+      "learning_rate": 0.00012333333333333334,
+      "loss": 12.3234,
       "step": 185
     },
     {
       "epoch": 0.030456852791878174,
-      "grad_norm": 0.21756646037101746,
-      "learning_rate": 0.00019996409537270665,
-      "loss": 12.3007,
+      "grad_norm": 0.18451902270317078,
+      "learning_rate": 0.000124,
+      "loss": 12.3312,
       "step": 186
     },
     {
       "epoch": 0.030620599312264615,
-      "grad_norm": 0.16931568086147308,
-      "learning_rate": 0.00019996362971141934,
-      "loss": 12.2984,
+      "grad_norm": 0.14678575098514557,
+      "learning_rate": 0.00012466666666666667,
+      "loss": 12.3127,
       "step": 187
     },
     {
       "epoch": 0.030784345832651057,
-      "grad_norm": 0.1374356746673584,
-      "learning_rate": 0.00019996316405013204,
-      "loss": 12.2771,
+      "grad_norm": 0.1310257613658905,
+      "learning_rate": 0.00012533333333333334,
+      "loss": 12.3053,
       "step": 188
     },
     {
       "epoch": 0.0309480923530375,
-      "grad_norm": 0.19620707631111145,
-      "learning_rate": 0.00019996271294075996,
-      "loss": 12.3238,
+      "grad_norm": 0.17656965553760529,
+      "learning_rate": 0.000126,
+      "loss": 12.3509,
       "step": 189
     },
     {
       "epoch": 0.03111183887342394,
-      "grad_norm": 0.15874610841274261,
-      "learning_rate": 0.00019996224727947265,
-      "loss": 12.281,
+      "grad_norm": 0.18695133924484253,
+      "learning_rate": 0.00012666666666666666,
+      "loss": 12.3122,
       "step": 190
     },
     {
       "epoch": 0.03127558539381038,
-      "grad_norm": 0.1111041009426117,
-      "learning_rate": 0.0001999617670662701,
-      "loss": 12.2995,
+      "grad_norm": 0.1660790890455246,
+      "learning_rate": 0.00012733333333333336,
+      "loss": 12.3267,
       "step": 191
     },
     {
       "epoch": 0.03143933191419682,
-      "grad_norm": 0.18783725798130035,
-      "learning_rate": 0.00019996128685306758,
-      "loss": 12.2759,
+      "grad_norm": 0.16188572347164154,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 12.306,
       "step": 192
     },
     {
       "epoch": 0.031603078434583265,
-      "grad_norm": 0.2878400385379791,
-      "learning_rate": 0.00019996080663986504,
-      "loss": 12.2879,
+      "grad_norm": 0.4507922828197479,
+      "learning_rate": 0.00012866666666666666,
+      "loss": 12.312,
       "step": 193
     },
     {
       "epoch": 0.03176682495496971,
-      "grad_norm": 0.15140052139759064,
-      "learning_rate": 0.0001999603264266625,
-      "loss": 12.2966,
+      "grad_norm": 0.1829795241355896,
+      "learning_rate": 0.00012933333333333332,
+      "loss": 12.3141,
       "step": 194
     },
     {
       "epoch": 0.03193057147535615,
-      "grad_norm": 0.18027496337890625,
-      "learning_rate": 0.00019995984621345997,
-      "loss": 12.3031,
+      "grad_norm": 0.16686825454235077,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 12.3271,
       "step": 195
     },
     {
       "epoch": 0.03209431799574259,
-      "grad_norm": 0.28657597303390503,
-      "learning_rate": 0.0001999593514483422,
-      "loss": 12.2774,
+      "grad_norm": 0.26499465107917786,
+      "learning_rate": 0.00013066666666666668,
+      "loss": 12.3014,
       "step": 196
     },
     {
       "epoch": 0.03225806451612903,
-      "grad_norm": 0.22153787314891815,
-      "learning_rate": 0.00019995885668322444,
-      "loss": 12.3025,
+      "grad_norm": 0.1288941651582718,
+      "learning_rate": 0.00013133333333333332,
+      "loss": 12.3216,
       "step": 197
     },
     {
       "epoch": 0.03242181103651547,
-      "grad_norm": 0.19516243040561676,
-      "learning_rate": 0.00019995836191810668,
-      "loss": 12.2923,
+      "grad_norm": 0.19364045560359955,
+      "learning_rate": 0.000132,
+      "loss": 12.3043,
       "step": 198
     },
     {
       "epoch": 0.032585557556901915,
-      "grad_norm": 0.17245976626873016,
-      "learning_rate": 0.0001999578671529889,
-      "loss": 12.265,
+      "grad_norm": 0.1700012981891632,
+      "learning_rate": 0.00013266666666666667,
+      "loss": 12.2953,
       "step": 199
     },
     {
       "epoch": 0.032749304077288356,
-      "grad_norm": 0.1579219251871109,
-      "learning_rate": 0.00019995737238787115,
-      "loss": 12.2812,
-      "step": 200
-    },
-    {
-      "epoch": 0.032749304077288356,
-      "eval_loss": 12.291393280029297,
-      "eval_runtime": 7.4374,
-      "eval_samples_per_second": 33.076,
-      "eval_steps_per_second": 16.538,
+      "grad_norm": 0.16173534095287323,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 12.3054,
       "step": 200
     },
     {
       "epoch": 0.0329130505976748,
-      "grad_norm": 0.161886066198349,
-      "learning_rate": 0.00019995686307083815,
-      "loss": 12.2984,
+      "grad_norm": 0.1652362197637558,
+      "learning_rate": 0.000134,
+      "loss": 12.3216,
       "step": 201
     },
     {
       "epoch": 0.03307679711806124,
-      "grad_norm": 0.221256822347641,
-      "learning_rate": 0.0001999563683057204,
-      "loss": 12.3126,
+      "grad_norm": 0.18516595661640167,
+      "learning_rate": 0.00013466666666666667,
+      "loss": 12.3394,
       "step": 202
     },
     {
       "epoch": 0.03324054363844768,
-      "grad_norm": 0.12891951203346252,
-      "learning_rate": 0.0001999558589886874,
-      "loss": 12.3078,
+      "grad_norm": 0.16532324254512787,
+      "learning_rate": 0.00013533333333333333,
+      "loss": 12.3365,
       "step": 203
     },
     {
       "epoch": 0.03340429015883412,
-      "grad_norm": 0.1363368183374405,
-      "learning_rate": 0.00019995533511973917,
-      "loss": 12.2684,
+      "grad_norm": 0.1639377474784851,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 12.2997,
       "step": 204
     },
     {
       "epoch": 0.033568036679220564,
-      "grad_norm": 0.12029746174812317,
-      "learning_rate": 0.00019995482580270618,
-      "loss": 12.283,
+      "grad_norm": 0.13720141351222992,
+      "learning_rate": 0.00013666666666666666,
+      "loss": 12.3105,
       "step": 205
     },
     {
       "epoch": 0.033731783199607006,
-      "grad_norm": 0.13670730590820312,
-      "learning_rate": 0.0001999543164856732,
-      "loss": 12.2871,
+      "grad_norm": 0.116838738322258,
+      "learning_rate": 0.00013733333333333333,
+      "loss": 12.3063,
       "step": 206
     },
     {
       "epoch": 0.03389552971999345,
-      "grad_norm": 0.12571097910404205,
-      "learning_rate": 0.00019995379261672497,
-      "loss": 12.297,
+      "grad_norm": 0.22012831270694733,
+      "learning_rate": 0.000138,
+      "loss": 12.32,
       "step": 207
     },
     {
       "epoch": 0.03405927624037989,
-      "grad_norm": 0.20929580926895142,
-      "learning_rate": 0.00019995326874777675,
-      "loss": 12.2853,
+      "grad_norm": 0.29010009765625,
+      "learning_rate": 0.00013866666666666669,
+      "loss": 12.3054,
       "step": 208
     },
     {
       "epoch": 0.03422302276076633,
-      "grad_norm": 0.13680557906627655,
-      "learning_rate": 0.00019995274487882853,
-      "loss": 12.3088,
+      "grad_norm": 0.13133449852466583,
+      "learning_rate": 0.00013933333333333335,
+      "loss": 12.329,
       "step": 209
     },
     {
       "epoch": 0.03438676928115277,
-      "grad_norm": 0.24794039130210876,
-      "learning_rate": 0.00019995220645796508,
-      "loss": 12.3039,
+      "grad_norm": 0.18425866961479187,
+      "learning_rate": 0.00014,
+      "loss": 12.3166,
       "step": 210
     },
     {
       "epoch": 0.034550515801539214,
-      "grad_norm": 0.220923513174057,
-      "learning_rate": 0.00019995168258901685,
-      "loss": 12.2739,
+      "grad_norm": 0.21036991477012634,
+      "learning_rate": 0.00014066666666666668,
+      "loss": 12.3058,
       "step": 211
     },
     {
       "epoch": 0.03471426232192566,
-      "grad_norm": 0.18528875708580017,
-      "learning_rate": 0.0001999511441681534,
-      "loss": 12.2915,
+      "grad_norm": 0.13942180573940277,
+      "learning_rate": 0.00014133333333333334,
+      "loss": 12.3181,
       "step": 212
     },
     {
       "epoch": 0.034878008842312104,
-      "grad_norm": 0.10424947738647461,
-      "learning_rate": 0.00019995060574728996,
-      "loss": 12.3008,
+      "grad_norm": 0.13669800758361816,
+      "learning_rate": 0.000142,
+      "loss": 12.3243,
       "step": 213
     },
     {
       "epoch": 0.035041755362698546,
-      "grad_norm": 0.10950024425983429,
-      "learning_rate": 0.00019995005277451128,
-      "loss": 12.2888,
+      "grad_norm": 0.11211808025836945,
+      "learning_rate": 0.00014266666666666667,
+      "loss": 12.3052,
       "step": 214
     },
     {
       "epoch": 0.03520550188308499,
-      "grad_norm": 0.1325610876083374,
-      "learning_rate": 0.00019994952890556306,
-      "loss": 12.2658,
+      "grad_norm": 0.2188958078622818,
+      "learning_rate": 0.00014333333333333334,
+      "loss": 12.2904,
       "step": 215
     },
     {
       "epoch": 0.03536924840347143,
-      "grad_norm": 0.21740667521953583,
-      "learning_rate": 0.00019994897593278438,
-      "loss": 12.2997,
+      "grad_norm": 0.20076972246170044,
+      "learning_rate": 0.000144,
+      "loss": 12.3283,
       "step": 216
     },
     {
       "epoch": 0.03553299492385787,
-      "grad_norm": 0.14994339644908905,
-      "learning_rate": 0.0001999484229600057,
-      "loss": 12.285,
+      "grad_norm": 0.1893579661846161,
+      "learning_rate": 0.0001446666666666667,
+      "loss": 12.3036,
       "step": 217
     },
     {
       "epoch": 0.03569674144424431,
-      "grad_norm": 0.1752844899892807,
-      "learning_rate": 0.00019994786998722702,
-      "loss": 12.2739,
+      "grad_norm": 0.12186531722545624,
+      "learning_rate": 0.00014533333333333333,
+      "loss": 12.2949,
       "step": 218
     },
     {
       "epoch": 0.035860487964630754,
-      "grad_norm": 0.16935011744499207,
-      "learning_rate": 0.00019994731701444834,
-      "loss": 12.2662,
+      "grad_norm": 0.19035090506076813,
+      "learning_rate": 0.000146,
+      "loss": 12.2976,
       "step": 219
     },
     {
       "epoch": 0.036024234485017195,
-      "grad_norm": 0.15907202661037445,
-      "learning_rate": 0.00019994674948975444,
-      "loss": 12.2754,
+      "grad_norm": 0.17918401956558228,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 12.299,
       "step": 220
     },
     {
       "epoch": 0.03618798100540364,
-      "grad_norm": 0.1968131810426712,
-      "learning_rate": 0.00019994619651697576,
-      "loss": 12.2901,
+      "grad_norm": 0.22605666518211365,
+      "learning_rate": 0.00014733333333333335,
+      "loss": 12.3179,
       "step": 221
     },
     {
       "epoch": 0.03635172752579008,
-      "grad_norm": 0.15830907225608826,
-      "learning_rate": 0.00019994562899228185,
-      "loss": 12.279,
+      "grad_norm": 0.17605355381965637,
+      "learning_rate": 0.000148,
+      "loss": 12.3032,
       "step": 222
     },
     {
       "epoch": 0.03651547404617652,
-      "grad_norm": 0.13416405022144318,
-      "learning_rate": 0.00019994504691567272,
-      "loss": 12.2651,
+      "grad_norm": 0.1425754278898239,
+      "learning_rate": 0.00014866666666666666,
+      "loss": 12.2915,
       "step": 223
     },
     {
       "epoch": 0.03667922056656296,
-      "grad_norm": 0.2023450881242752,
-      "learning_rate": 0.00019994449394289404,
-      "loss": 12.2948,
+      "grad_norm": 0.17672143876552582,
+      "learning_rate": 0.00014933333333333335,
+      "loss": 12.3145,
       "step": 224
     },
     {
       "epoch": 0.0368429670869494,
-      "grad_norm": 0.11039238423109055,
-      "learning_rate": 0.0001999439118662849,
-      "loss": 12.2881,
+      "grad_norm": 0.12379948049783707,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 12.3129,
       "step": 225
     },
     {
       "epoch": 0.037006713607335845,
-      "grad_norm": 0.2013334035873413,
-      "learning_rate": 0.00019994332978967577,
-      "loss": 12.3025,
+      "grad_norm": 0.1985873281955719,
+      "learning_rate": 0.00015066666666666668,
+      "loss": 12.3372,
       "step": 226
     },
     {
       "epoch": 0.037170460127722287,
-      "grad_norm": 0.20877353847026825,
-      "learning_rate": 0.00019994276226498187,
-      "loss": 12.3149,
+      "grad_norm": 0.22402392327785492,
+      "learning_rate": 0.00015133333333333334,
+      "loss": 12.3404,
       "step": 227
     },
     {
       "epoch": 0.03733420664810873,
-      "grad_norm": 0.12196000665426254,
-      "learning_rate": 0.00019994218018837273,
-      "loss": 12.2965,
+      "grad_norm": 0.14348964393138885,
+      "learning_rate": 0.000152,
+      "loss": 12.317,
       "step": 228
     },
     {
       "epoch": 0.03749795316849517,
-      "grad_norm": 0.18351589143276215,
-      "learning_rate": 0.0001999415981117636,
-      "loss": 12.3101,
+      "grad_norm": 0.19029805064201355,
+      "learning_rate": 0.00015266666666666667,
+      "loss": 12.3274,
       "step": 229
     },
     {
       "epoch": 0.03766169968888161,
-      "grad_norm": 0.3522573411464691,
-      "learning_rate": 0.00019994100148323923,
-      "loss": 12.3185,
+      "grad_norm": 0.3693244457244873,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 12.336,
       "step": 230
     },
     {
       "epoch": 0.03782544620926805,
-      "grad_norm": 0.1347028911113739,
-      "learning_rate": 0.0001999404194066301,
-      "loss": 12.2782,
+      "grad_norm": 0.15163873136043549,
+      "learning_rate": 0.000154,
+      "loss": 12.2995,
       "step": 231
     },
     {
       "epoch": 0.037989192729654495,
-      "grad_norm": 0.1606866717338562,
-      "learning_rate": 0.00019993982277810574,
-      "loss": 12.2837,
+      "grad_norm": 0.18613851070404053,
+      "learning_rate": 0.00015466666666666667,
+      "loss": 12.3056,
       "step": 232
     },
     {
       "epoch": 0.038152939250040936,
-      "grad_norm": 0.18167780339717865,
-      "learning_rate": 0.00019993922614958137,
-      "loss": 12.305,
+      "grad_norm": 0.2206224650144577,
+      "learning_rate": 0.00015533333333333333,
+      "loss": 12.3205,
       "step": 233
     },
     {
       "epoch": 0.03831668577042738,
-      "grad_norm": 0.1324174553155899,
-      "learning_rate": 0.00019993861496914178,
-      "loss": 12.2859,
+      "grad_norm": 0.11398882418870926,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 12.3021,
       "step": 234
     },
     {
       "epoch": 0.03848043229081382,
-      "grad_norm": 0.16425751149654388,
-      "learning_rate": 0.00019993801834061742,
-      "loss": 12.2773,
+      "grad_norm": 0.1618189513683319,
+      "learning_rate": 0.00015666666666666666,
+      "loss": 12.2969,
       "step": 235
     },
     {
       "epoch": 0.03864417881120026,
-      "grad_norm": 0.21324995160102844,
-      "learning_rate": 0.00019993740716017783,
-      "loss": 12.2713,
+      "grad_norm": 0.1825104057788849,
+      "learning_rate": 0.00015733333333333333,
+      "loss": 12.2849,
       "step": 236
     },
     {
       "epoch": 0.0388079253315867,
-      "grad_norm": 0.22252927720546722,
-      "learning_rate": 0.00019993679597973824,
-      "loss": 12.3141,
+      "grad_norm": 0.2700357139110565,
+      "learning_rate": 0.00015800000000000002,
+      "loss": 12.3281,
       "step": 237
     },
     {
       "epoch": 0.038971671851973144,
-      "grad_norm": 0.12647491693496704,
-      "learning_rate": 0.00019993618479929864,
-      "loss": 12.2744,
+      "grad_norm": 0.12693750858306885,
+      "learning_rate": 0.00015866666666666668,
+      "loss": 12.2915,
       "step": 238
     },
     {
       "epoch": 0.039135418372359586,
-      "grad_norm": 0.23671850562095642,
-      "learning_rate": 0.00019993557361885905,
-      "loss": 12.2572,
+      "grad_norm": 0.20626111328601837,
+      "learning_rate": 0.00015933333333333332,
+      "loss": 12.2844,
       "step": 239
     },
     {
       "epoch": 0.03929916489274603,
-      "grad_norm": 0.14135394990444183,
-      "learning_rate": 0.00019993494788650423,
-      "loss": 12.2701,
+      "grad_norm": 0.28300392627716064,
+      "learning_rate": 0.00016,
+      "loss": 12.2941,
       "step": 240
     },
     {
       "epoch": 0.03946291141313247,
-      "grad_norm": 0.19470906257629395,
-      "learning_rate": 0.00019993433670606464,
-      "loss": 12.2932,
+      "grad_norm": 0.2612934112548828,
+      "learning_rate": 0.00016066666666666668,
+      "loss": 12.3069,
       "step": 241
     },
     {
       "epoch": 0.03962665793351891,
-      "grad_norm": 0.21060970425605774,
-      "learning_rate": 0.00019993371097370982,
-      "loss": 12.2607,
+      "grad_norm": 0.16001901030540466,
+      "learning_rate": 0.00016133333333333334,
+      "loss": 12.28,
       "step": 242
     },
     {
       "epoch": 0.03979040445390535,
-      "grad_norm": 0.1116245836019516,
-      "learning_rate": 0.000199933085241355,
-      "loss": 12.2812,
+      "grad_norm": 0.13632474839687347,
+      "learning_rate": 0.000162,
+      "loss": 12.3005,
       "step": 243
     },
     {
       "epoch": 0.039954150974291794,
-      "grad_norm": 0.20140965282917023,
-      "learning_rate": 0.00019993244495708495,
-      "loss": 12.2896,
+      "grad_norm": 0.30553674697875977,
+      "learning_rate": 0.00016266666666666667,
+      "loss": 12.3069,
       "step": 244
     },
     {
       "epoch": 0.040117897494678235,
-      "grad_norm": 0.11436797678470612,
-      "learning_rate": 0.00019993181922473013,
-      "loss": 12.2738,
+      "grad_norm": 0.14686566591262817,
+      "learning_rate": 0.00016333333333333334,
+      "loss": 12.2975,
       "step": 245
     },
     {
       "epoch": 0.04028164401506468,
-      "grad_norm": 0.1359335482120514,
-      "learning_rate": 0.00019993119349237531,
-      "loss": 12.2512,
+      "grad_norm": 0.15335270762443542,
+      "learning_rate": 0.000164,
+      "loss": 12.2738,
       "step": 246
     },
     {
       "epoch": 0.04044539053545112,
-      "grad_norm": 0.1482047438621521,
-      "learning_rate": 0.00019993053865619004,
-      "loss": 12.2944,
+      "grad_norm": 0.15902471542358398,
+      "learning_rate": 0.00016466666666666667,
+      "loss": 12.3114,
       "step": 247
     },
     {
       "epoch": 0.04060913705583756,
-      "grad_norm": 0.12831032276153564,
-      "learning_rate": 0.00019992989837192,
-      "loss": 12.304,
+      "grad_norm": 0.13151733577251434,
+      "learning_rate": 0.00016533333333333333,
+      "loss": 12.3157,
       "step": 248
     },
     {
       "epoch": 0.040772883576224,
-      "grad_norm": 0.18362678587436676,
-      "learning_rate": 0.00019992925808764994,
-      "loss": 12.2673,
+      "grad_norm": 0.2536516785621643,
+      "learning_rate": 0.000166,
+      "loss": 12.2878,
       "step": 249
     },
     {
       "epoch": 0.04093663009661045,
-      "grad_norm": 0.13624298572540283,
-      "learning_rate": 0.0001999286178033799,
-      "loss": 12.2556,
+      "grad_norm": 0.21861162781715393,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 12.2746,
       "step": 250
     },
     {
       "epoch": 0.04110037661699689,
-      "grad_norm": 0.1984298676252365,
-      "learning_rate": 0.00019992796296719462,
-      "loss": 12.3274,
+      "grad_norm": 0.19811271131038666,
+      "learning_rate": 0.00016733333333333335,
+      "loss": 12.3352,
       "step": 251
     },
     {
       "epoch": 0.041264123137383334,
-      "grad_norm": 0.19891658425331116,
-      "learning_rate": 0.00019992730813100934,
-      "loss": 12.2671,
+      "grad_norm": 0.4501516819000244,
+      "learning_rate": 0.000168,
+      "loss": 12.2949,
       "step": 252
     },
     {
       "epoch": 0.041427869657769775,
-      "grad_norm": 0.16791290044784546,
-      "learning_rate": 0.00019992665329482406,
-      "loss": 12.2687,
+      "grad_norm": 0.17889365553855896,
+      "learning_rate": 0.00016866666666666668,
+      "loss": 12.2884,
       "step": 253
     },
     {
       "epoch": 0.04159161617815622,
-      "grad_norm": 0.3288877606391907,
-      "learning_rate": 0.00019992598390672356,
-      "loss": 12.2904,
+      "grad_norm": 0.22105363011360168,
+      "learning_rate": 0.00016933333333333335,
+      "loss": 12.3049,
       "step": 254
     },
     {
       "epoch": 0.04175536269854266,
-      "grad_norm": 0.2074000984430313,
-      "learning_rate": 0.00019992532907053828,
-      "loss": 12.2962,
+      "grad_norm": 0.2042873650789261,
+      "learning_rate": 0.00017,
+      "loss": 12.3225,
       "step": 255
     },
     {
       "epoch": 0.0419191092189291,
-      "grad_norm": 0.16552244126796722,
-      "learning_rate": 0.00019992465968243778,
-      "loss": 12.2825,
+      "grad_norm": 0.13782532513141632,
+      "learning_rate": 0.00017066666666666668,
+      "loss": 12.3041,
       "step": 256
     },
     {
       "epoch": 0.04208285573931554,
-      "grad_norm": 0.0921081155538559,
-      "learning_rate": 0.0001999240048462525,
-      "loss": 12.2688,
+      "grad_norm": 0.11734238266944885,
+      "learning_rate": 0.00017133333333333334,
+      "loss": 12.2912,
       "step": 257
     },
     {
       "epoch": 0.04224660225970198,
-      "grad_norm": 0.1811661273241043,
-      "learning_rate": 0.000199923335458152,
-      "loss": 12.2645,
+      "grad_norm": 0.2625914216041565,
+      "learning_rate": 0.000172,
+      "loss": 12.2863,
       "step": 258
     },
     {
       "epoch": 0.042410348780088425,
-      "grad_norm": 0.1614493876695633,
-      "learning_rate": 0.00019992265151813626,
-      "loss": 12.3029,
+      "grad_norm": 0.15754052996635437,
+      "learning_rate": 0.00017266666666666667,
+      "loss": 12.3181,
       "step": 259
     },
     {
       "epoch": 0.042574095300474867,
-      "grad_norm": 0.1890016794204712,
-      "learning_rate": 0.00019992196757812053,
-      "loss": 12.2787,
+      "grad_norm": 0.21560034155845642,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 12.3165,
       "step": 260
     },
     {
       "epoch": 0.04273784182086131,
-      "grad_norm": 0.137629434466362,
-      "learning_rate": 0.00019992129819002002,
-      "loss": 12.2516,
+      "grad_norm": 0.12006810307502747,
+      "learning_rate": 0.000174,
+      "loss": 12.2814,
       "step": 261
     },
     {
       "epoch": 0.04290158834124775,
-      "grad_norm": 0.2103978395462036,
-      "learning_rate": 0.0001999206142500043,
-      "loss": 12.2785,
+      "grad_norm": 0.19567811489105225,
+      "learning_rate": 0.00017466666666666667,
+      "loss": 12.303,
       "step": 262
     },
     {
       "epoch": 0.04306533486163419,
-      "grad_norm": 0.1392078995704651,
-      "learning_rate": 0.00019991993030998856,
-      "loss": 12.2816,
+      "grad_norm": 0.15239214897155762,
+      "learning_rate": 0.00017533333333333336,
+      "loss": 12.3033,
       "step": 263
     },
     {
       "epoch": 0.04322908138202063,
-      "grad_norm": 0.15243275463581085,
-      "learning_rate": 0.00019991924636997283,
-      "loss": 12.2646,
+      "grad_norm": 0.14320224523544312,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 12.2838,
       "step": 264
     },
     {
       "epoch": 0.043392827902407075,
-      "grad_norm": 0.155501127243042,
-      "learning_rate": 0.00019991854787804186,
-      "loss": 12.2842,
+      "grad_norm": 0.1736345887184143,
+      "learning_rate": 0.00017666666666666666,
+      "loss": 12.3115,
       "step": 265
     },
     {
       "epoch": 0.043556574422793516,
-      "grad_norm": 0.1776060163974762,
-      "learning_rate": 0.00019991786393802613,
-      "loss": 12.2972,
+      "grad_norm": 0.16777797043323517,
+      "learning_rate": 0.00017733333333333335,
+      "loss": 12.3189,
       "step": 266
     },
     {
       "epoch": 0.04372032094317996,
-      "grad_norm": 0.1182047501206398,
-      "learning_rate": 0.00019991715089417994,
-      "loss": 12.2812,
+      "grad_norm": 0.1118602603673935,
+      "learning_rate": 0.00017800000000000002,
+      "loss": 12.2967,
       "step": 267
     },
     {
       "epoch": 0.0438840674635664,
-      "grad_norm": 0.10008183866739273,
-      "learning_rate": 0.00019991645240224898,
-      "loss": 12.2747,
+      "grad_norm": 0.14121738076210022,
+      "learning_rate": 0.00017866666666666668,
+      "loss": 12.2935,
       "step": 268
     },
     {
       "epoch": 0.04404781398395284,
-      "grad_norm": 0.2416013479232788,
-      "learning_rate": 0.00019991575391031802,
-      "loss": 12.2916,
+      "grad_norm": 0.2797190546989441,
+      "learning_rate": 0.00017933333333333332,
+      "loss": 12.3174,
       "step": 269
     },
     {
       "epoch": 0.04421156050433928,
-      "grad_norm": 0.15756867825984955,
-      "learning_rate": 0.00019991504086647183,
-      "loss": 12.259,
+      "grad_norm": 0.2606465220451355,
+      "learning_rate": 0.00018,
+      "loss": 12.283,
       "step": 270
     },
     {
       "epoch": 0.044375307024725724,
-      "grad_norm": 0.17258231341838837,
-      "learning_rate": 0.00019991434237454087,
-      "loss": 12.3201,
+      "grad_norm": 0.24559257924556732,
+      "learning_rate": 0.00018066666666666668,
+      "loss": 12.3289,
       "step": 271
     },
     {
       "epoch": 0.044539053545112166,
-      "grad_norm": 0.19413842260837555,
-      "learning_rate": 0.00019991362933069468,
-      "loss": 12.2855,
+      "grad_norm": 0.2690947949886322,
+      "learning_rate": 0.00018133333333333334,
+      "loss": 12.3016,
       "step": 272
     },
     {
       "epoch": 0.04470280006549861,
-      "grad_norm": 0.14625532925128937,
-      "learning_rate": 0.00019991291628684849,
-      "loss": 12.2546,
+      "grad_norm": 0.14213307201862335,
+      "learning_rate": 0.000182,
+      "loss": 12.2888,
       "step": 273
     },
     {
       "epoch": 0.04486654658588505,
-      "grad_norm": 0.2281506359577179,
-      "learning_rate": 0.0001999122032430023,
-      "loss": 12.2741,
+      "grad_norm": 0.23441320657730103,
+      "learning_rate": 0.00018266666666666667,
+      "loss": 12.2924,
       "step": 274
     },
     {
       "epoch": 0.04503029310627149,
-      "grad_norm": 0.207796111702919,
-      "learning_rate": 0.00019991147564724088,
-      "loss": 12.2596,
+      "grad_norm": 0.1753687560558319,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 12.2822,
       "step": 275
     },
     {
       "epoch": 0.04519403962665793,
-      "grad_norm": 0.1390552818775177,
-      "learning_rate": 0.00019991074805147946,
-      "loss": 12.2327,
+      "grad_norm": 0.13752730190753937,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 12.2564,
       "step": 276
     },
     {
       "epoch": 0.045357786147044374,
-      "grad_norm": 0.1534932404756546,
-      "learning_rate": 0.00019991002045571804,
-      "loss": 12.2644,
+      "grad_norm": 0.09531578421592712,
+      "learning_rate": 0.00018466666666666666,
+      "loss": 12.2847,
       "step": 277
     },
     {
       "epoch": 0.045521532667430815,
-      "grad_norm": 0.142225444316864,
-      "learning_rate": 0.00019990929285995662,
-      "loss": 12.2712,
+      "grad_norm": 0.13418786227703094,
+      "learning_rate": 0.00018533333333333333,
+      "loss": 12.2926,
       "step": 278
     },
     {
       "epoch": 0.04568527918781726,
-      "grad_norm": 0.1784576177597046,
-      "learning_rate": 0.0001999085652641952,
-      "loss": 12.2751,
+      "grad_norm": 0.12498784065246582,
+      "learning_rate": 0.00018600000000000002,
+      "loss": 12.2995,
       "step": 279
     },
     {
       "epoch": 0.0458490257082037,
-      "grad_norm": 0.13212628662586212,
-      "learning_rate": 0.00019990782311651856,
-      "loss": 12.2734,
+      "grad_norm": 0.1601720005273819,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 12.2968,
       "step": 280
     },
     {
       "epoch": 0.04601277222859014,
-      "grad_norm": 0.12999513745307922,
-      "learning_rate": 0.0001999070809688419,
-      "loss": 12.2971,
+      "grad_norm": 0.14898695051670074,
+      "learning_rate": 0.00018733333333333335,
+      "loss": 12.3042,
       "step": 281
     },
     {
       "epoch": 0.04617651874897658,
-      "grad_norm": 0.14513814449310303,
-      "learning_rate": 0.00019990633882116526,
-      "loss": 12.257,
+      "grad_norm": 0.13938522338867188,
+      "learning_rate": 0.000188,
+      "loss": 12.2859,
       "step": 282
     },
     {
       "epoch": 0.046340265269363023,
-      "grad_norm": 0.1383667141199112,
-      "learning_rate": 0.00019990559667348862,
-      "loss": 12.2826,
+      "grad_norm": 0.17422890663146973,
+      "learning_rate": 0.00018866666666666668,
+      "loss": 12.3031,
       "step": 283
     },
     {
       "epoch": 0.046504011789749465,
-      "grad_norm": 0.16239354014396667,
-      "learning_rate": 0.00019990485452581197,
-      "loss": 12.2747,
+      "grad_norm": 0.20269818603992462,
+      "learning_rate": 0.00018933333333333335,
+      "loss": 12.2896,
       "step": 284
     },
     {
       "epoch": 0.04666775831013591,
-      "grad_norm": 0.14773111045360565,
-      "learning_rate": 0.0001999040978262201,
-      "loss": 12.2803,
+      "grad_norm": 0.16185009479522705,
+      "learning_rate": 0.00019,
+      "loss": 12.3044,
       "step": 285
     },
     {
       "epoch": 0.04683150483052235,
-      "grad_norm": 0.20306552946567535,
-      "learning_rate": 0.00019990334112662822,
-      "loss": 12.307,
+      "grad_norm": 0.26534372568130493,
+      "learning_rate": 0.00019066666666666668,
+      "loss": 12.3174,
       "step": 286
     },
     {
       "epoch": 0.04699525135090879,
-      "grad_norm": 0.1623079627752304,
-      "learning_rate": 0.00019990258442703635,
-      "loss": 12.3055,
+      "grad_norm": 0.19738201797008514,
+      "learning_rate": 0.00019133333333333334,
+      "loss": 12.3236,
       "step": 287
     },
     {
       "epoch": 0.04715899787129523,
-      "grad_norm": 0.17853973805904388,
-      "learning_rate": 0.00019990182772744447,
-      "loss": 12.3012,
+      "grad_norm": 0.28078433871269226,
+      "learning_rate": 0.000192,
+      "loss": 12.3156,
       "step": 288
     },
     {
       "epoch": 0.04732274439168168,
-      "grad_norm": 0.207291841506958,
-      "learning_rate": 0.00019990105647593737,
-      "loss": 12.2876,
+      "grad_norm": 0.2100210189819336,
+      "learning_rate": 0.0001926666666666667,
+      "loss": 12.2992,
       "step": 289
     },
     {
       "epoch": 0.04748649091206812,
-      "grad_norm": 0.15179339051246643,
-      "learning_rate": 0.0001999002997763455,
-      "loss": 12.2683,
+      "grad_norm": 0.15641532838344574,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 12.2878,
       "step": 290
     },
     {
       "epoch": 0.04765023743245456,
-      "grad_norm": 0.18993042409420013,
-      "learning_rate": 0.00019989954307675362,
-      "loss": 12.2622,
+      "grad_norm": 0.2536814212799072,
+      "learning_rate": 0.000194,
+      "loss": 12.2886,
       "step": 291
     },
     {
       "epoch": 0.047813983952841005,
-      "grad_norm": 0.1102190762758255,
-      "learning_rate": 0.00019989875727333128,
-      "loss": 12.2814,
+      "grad_norm": 0.11571547389030457,
+      "learning_rate": 0.0001946666666666667,
+      "loss": 12.3017,
       "step": 292
     },
     {
       "epoch": 0.047977730473227446,
-      "grad_norm": 0.1876610964536667,
-      "learning_rate": 0.00019989798602182418,
-      "loss": 12.2715,
+      "grad_norm": 0.16971823573112488,
+      "learning_rate": 0.00019533333333333336,
+      "loss": 12.2832,
       "step": 293
     },
     {
       "epoch": 0.04814147699361389,
-      "grad_norm": 0.1995493471622467,
-      "learning_rate": 0.00019989721477031708,
-      "loss": 12.2674,
+      "grad_norm": 0.2792551815509796,
+      "learning_rate": 0.000196,
+      "loss": 12.3043,
       "step": 294
     },
     {
       "epoch": 0.04830522351400033,
-      "grad_norm": 0.11874358355998993,
-      "learning_rate": 0.00019989642896689475,
-      "loss": 12.3595,
+      "grad_norm": 0.21620360016822815,
+      "learning_rate": 0.00019666666666666666,
+      "loss": 12.3755,
       "step": 295
     },
     {
       "epoch": 0.04846897003438677,
-      "grad_norm": 0.2674122452735901,
-      "learning_rate": 0.00019989565771538764,
-      "loss": 12.3017,
+      "grad_norm": 0.24013638496398926,
+      "learning_rate": 0.00019733333333333335,
+      "loss": 12.3221,
       "step": 296
     },
     {
       "epoch": 0.04863271655477321,
-      "grad_norm": 0.15029270946979523,
-      "learning_rate": 0.00019989485736005008,
-      "loss": 12.268,
+      "grad_norm": 0.2197066694498062,
+      "learning_rate": 0.00019800000000000002,
+      "loss": 12.2921,
       "step": 297
     },
     {
       "epoch": 0.048796463075159655,
-      "grad_norm": 0.17938172817230225,
-      "learning_rate": 0.00019989407155662775,
-      "loss": 12.2819,
+      "grad_norm": 0.20803050696849823,
+      "learning_rate": 0.00019866666666666668,
+      "loss": 12.3009,
       "step": 298
     },
     {
       "epoch": 0.048960209595546096,
-      "grad_norm": 0.19023428857326508,
-      "learning_rate": 0.00019989328575320542,
-      "loss": 12.2706,
+      "grad_norm": 0.1672317534685135,
+      "learning_rate": 0.00019933333333333334,
+      "loss": 12.2926,
       "step": 299
     },
     {
       "epoch": 0.04912395611593254,
-      "grad_norm": 0.21805034577846527,
-      "learning_rate": 0.00019989248539786786,
-      "loss": 12.278,
-      "step": 300
-    },
-    {
-      "epoch": 0.04912395611593254,
-      "eval_loss": 12.275172233581543,
-      "eval_runtime": 7.4494,
-      "eval_samples_per_second": 33.023,
-      "eval_steps_per_second": 16.511,
+      "grad_norm": 0.1740259975194931,
+      "learning_rate": 0.0002,
+      "loss": 12.3002,
       "step": 300
     },
     {
       "epoch": 0.04928770263631898,
-      "grad_norm": 0.1406513899564743,
-      "learning_rate": 0.00019989169959444553,
-      "loss": 12.2771,
+      "grad_norm": 0.13165144622325897,
+      "learning_rate": 0.00019999999986637393,
+      "loss": 12.3066,
       "step": 301
     },
     {
       "epoch": 0.04945144915670542,
-      "grad_norm": 0.14678552746772766,
-      "learning_rate": 0.00019989088468719274,
-      "loss": 12.2657,
+      "grad_norm": 0.23863595724105835,
+      "learning_rate": 0.00019999999946549563,
+      "loss": 12.2885,
       "step": 302
     },
     {
       "epoch": 0.04961519567709186,
-      "grad_norm": 0.14424839615821838,
-      "learning_rate": 0.00019989008433185518,
-      "loss": 12.2738,
+      "grad_norm": 0.12646012008190155,
+      "learning_rate": 0.00019999999879736518,
+      "loss": 12.2915,
       "step": 303
     },
     {
       "epoch": 0.049778942197478304,
-      "grad_norm": 0.2018839716911316,
-      "learning_rate": 0.0001998892694246024,
-      "loss": 12.2814,
+      "grad_norm": 0.2852458953857422,
+      "learning_rate": 0.00019999999786198252,
+      "loss": 12.3148,
       "step": 304
     },
     {
       "epoch": 0.049942688717864746,
-      "grad_norm": 0.20368194580078125,
-      "learning_rate": 0.00019988846906926483,
-      "loss": 12.2506,
+      "grad_norm": 0.21312469244003296,
+      "learning_rate": 0.00019999999665934766,
+      "loss": 12.2737,
       "step": 305
     },
     {
       "epoch": 0.05010643523825119,
-      "grad_norm": 0.21352413296699524,
-      "learning_rate": 0.00019988765416201204,
-      "loss": 12.288,
+      "grad_norm": 0.21012352406978607,
+      "learning_rate": 0.00019999999518946062,
+      "loss": 12.3127,
       "step": 306
     },
     {
       "epoch": 0.05027018175863763,
-      "grad_norm": 0.11416824907064438,
-      "learning_rate": 0.00019988683925475925,
-      "loss": 12.2772,
+      "grad_norm": 0.12454304844141006,
+      "learning_rate": 0.00019999999345232143,
+      "loss": 12.2986,
       "step": 307
     },
     {
       "epoch": 0.05043392827902407,
-      "grad_norm": 0.18459837138652802,
-      "learning_rate": 0.00019988602434750646,
-      "loss": 12.3078,
+      "grad_norm": 0.20746897161006927,
+      "learning_rate": 0.00019999999144793007,
+      "loss": 12.3273,
       "step": 308
     },
     {
       "epoch": 0.05059767479941051,
-      "grad_norm": 0.1840011328458786,
-      "learning_rate": 0.00019988519488833845,
-      "loss": 12.2627,
+      "grad_norm": 0.17081008851528168,
+      "learning_rate": 0.0001999999891762865,
+      "loss": 12.2905,
       "step": 309
     },
     {
       "epoch": 0.050761421319796954,
-      "grad_norm": 0.14293089509010315,
-      "learning_rate": 0.00019988437998108566,
-      "loss": 12.2646,
+      "grad_norm": 0.1272573620080948,
+      "learning_rate": 0.0001999999866373908,
+      "loss": 12.2907,
       "step": 310
     },
     {
       "epoch": 0.050925167840183395,
-      "grad_norm": 0.1082555428147316,
-      "learning_rate": 0.00019988355052191764,
-      "loss": 12.2839,
+      "grad_norm": 0.1364651918411255,
+      "learning_rate": 0.00019999998383124298,
+      "loss": 12.2988,
       "step": 311
     },
     {
       "epoch": 0.05108891436056984,
-      "grad_norm": 0.5544154047966003,
-      "learning_rate": 0.00019988272106274962,
-      "loss": 12.2913,
+      "grad_norm": 0.4523601531982422,
+      "learning_rate": 0.00019999998075784293,
+      "loss": 12.2987,
       "step": 312
     },
     {
       "epoch": 0.05125266088095628,
-      "grad_norm": 0.1190885454416275,
-      "learning_rate": 0.00019988187705166638,
-      "loss": 12.2789,
+      "grad_norm": 0.1788499653339386,
+      "learning_rate": 0.0001999999774171908,
+      "loss": 12.3047,
       "step": 313
     },
     {
       "epoch": 0.05141640740134272,
-      "grad_norm": 0.14885587990283966,
-      "learning_rate": 0.00019988104759249836,
-      "loss": 12.2705,
+      "grad_norm": 0.21642278134822845,
+      "learning_rate": 0.00019999997380928654,
+      "loss": 12.2901,
       "step": 314
     },
     {
       "epoch": 0.05158015392172916,
-      "grad_norm": 0.10743315517902374,
-      "learning_rate": 0.00019988021813333035,
-      "loss": 12.2572,
+      "grad_norm": 0.12551110982894897,
+      "learning_rate": 0.00019999996993413013,
+      "loss": 12.2777,
       "step": 315
     },
     {
       "epoch": 0.0517439004421156,
-      "grad_norm": 0.10766734182834625,
-      "learning_rate": 0.0001998793741222471,
-      "loss": 12.274,
+      "grad_norm": 0.08959595859050751,
+      "learning_rate": 0.00019999996579172166,
+      "loss": 12.2922,
       "step": 316
     },
     {
       "epoch": 0.051907646962502045,
-      "grad_norm": 0.14055076241493225,
-      "learning_rate": 0.00019987851555924863,
-      "loss": 12.2629,
+      "grad_norm": 0.17989438772201538,
+      "learning_rate": 0.00019999996138206103,
+      "loss": 12.2858,
       "step": 317
     },
     {
       "epoch": 0.05207139348288849,
-      "grad_norm": 0.13312725722789764,
-      "learning_rate": 0.00019987767154816538,
-      "loss": 12.2709,
+      "grad_norm": 0.11149830371141434,
+      "learning_rate": 0.00019999995670514834,
+      "loss": 12.2864,
       "step": 318
     },
     {
       "epoch": 0.05223514000327493,
-      "grad_norm": 0.3041207790374756,
-      "learning_rate": 0.00019987682753708214,
-      "loss": 12.3052,
+      "grad_norm": 0.37905353307724,
+      "learning_rate": 0.00019999995176098358,
+      "loss": 12.3123,
       "step": 319
     },
     {
       "epoch": 0.05239888652366137,
-      "grad_norm": 0.14904162287712097,
-      "learning_rate": 0.00019987596897408366,
-      "loss": 12.2843,
+      "grad_norm": 0.1828998625278473,
+      "learning_rate": 0.00019999994654956678,
+      "loss": 12.2975,
       "step": 320
     },
     {
       "epoch": 0.05256263304404781,
-      "grad_norm": 0.21144121885299683,
-      "learning_rate": 0.0001998751104110852,
-      "loss": 12.2775,
+      "grad_norm": 0.2369372397661209,
+      "learning_rate": 0.00019999994107089793,
+      "loss": 12.2982,
       "step": 321
     },
     {
       "epoch": 0.05272637956443425,
-      "grad_norm": 0.1579839438199997,
-      "learning_rate": 0.00019987425184808671,
-      "loss": 12.2626,
+      "grad_norm": 0.21192120015621185,
+      "learning_rate": 0.000199999935324977,
+      "loss": 12.2798,
       "step": 322
     },
     {
       "epoch": 0.052890126084820695,
-      "grad_norm": 0.15492835640907288,
-      "learning_rate": 0.00019987339328508824,
-      "loss": 12.2489,
+      "grad_norm": 0.13581956923007965,
+      "learning_rate": 0.0001999999293118041,
+      "loss": 12.2714,
       "step": 323
     },
     {
       "epoch": 0.053053872605207136,
-      "grad_norm": 0.14935417473316193,
-      "learning_rate": 0.00019987252017017454,
-      "loss": 12.2721,
+      "grad_norm": 0.11289360374212265,
+      "learning_rate": 0.00019999992303137916,
+      "loss": 12.2879,
       "step": 324
     },
     {
       "epoch": 0.05321761912559358,
-      "grad_norm": 0.16872552037239075,
-      "learning_rate": 0.00019987164705526084,
-      "loss": 12.2601,
+      "grad_norm": 0.12893497943878174,
+      "learning_rate": 0.00019999991648370226,
+      "loss": 12.2844,
       "step": 325
     },
     {
       "epoch": 0.05338136564598002,
-      "grad_norm": 0.14731909334659576,
-      "learning_rate": 0.00019987078849226236,
-      "loss": 12.2728,
+      "grad_norm": 0.12965922057628632,
+      "learning_rate": 0.0001999999096687734,
+      "loss": 12.2866,
       "step": 326
     },
     {
       "epoch": 0.05354511216636647,
-      "grad_norm": 0.28716838359832764,
-      "learning_rate": 0.00019986991537734866,
-      "loss": 12.2734,
+      "grad_norm": 0.2003125697374344,
+      "learning_rate": 0.0001999999025865926,
+      "loss": 12.2908,
       "step": 327
     },
     {
       "epoch": 0.05370885868675291,
-      "grad_norm": 0.145146906375885,
-      "learning_rate": 0.00019986902771051973,
-      "loss": 12.2719,
+      "grad_norm": 0.1551949828863144,
+      "learning_rate": 0.00019999989523715984,
+      "loss": 12.2969,
       "step": 328
     },
     {
       "epoch": 0.05387260520713935,
-      "grad_norm": 0.12571807205677032,
-      "learning_rate": 0.00019986815459560603,
-      "loss": 12.2818,
+      "grad_norm": 0.11883686482906342,
+      "learning_rate": 0.00019999988762047516,
+      "loss": 12.3078,
       "step": 329
     },
     {
       "epoch": 0.05403635172752579,
-      "grad_norm": 0.1811470240354538,
-      "learning_rate": 0.00019986728148069233,
-      "loss": 12.2653,
+      "grad_norm": 0.14822140336036682,
+      "learning_rate": 0.00019999987973653863,
+      "loss": 12.2868,
       "step": 330
     },
     {
       "epoch": 0.054200098247912235,
-      "grad_norm": 0.18098489940166473,
-      "learning_rate": 0.00019986637926194817,
-      "loss": 12.2768,
+      "grad_norm": 0.16526789963245392,
+      "learning_rate": 0.00019999987158535022,
+      "loss": 12.3022,
       "step": 331
     },
     {
       "epoch": 0.054363844768298676,
-      "grad_norm": 0.1682722270488739,
-      "learning_rate": 0.00019986550614703447,
-      "loss": 12.2737,
+      "grad_norm": 0.22066651284694672,
+      "learning_rate": 0.00019999986316690996,
+      "loss": 12.2937,
       "step": 332
     },
     {
       "epoch": 0.05452759128868512,
-      "grad_norm": 0.18899443745613098,
-      "learning_rate": 0.0001998646039282903,
-      "loss": 12.2599,
+      "grad_norm": 0.23073990643024445,
+      "learning_rate": 0.00019999985448121787,
+      "loss": 12.2761,
       "step": 333
     },
     {
       "epoch": 0.05469133780907156,
-      "grad_norm": 0.10069891810417175,
-      "learning_rate": 0.00019986371626146138,
-      "loss": 12.2526,
+      "grad_norm": 0.1183820590376854,
+      "learning_rate": 0.000199999845528274,
+      "loss": 12.2706,
       "step": 334
     },
     {
       "epoch": 0.054855084329458,
-      "grad_norm": 0.14798307418823242,
-      "learning_rate": 0.00019986281404271722,
-      "loss": 12.2724,
+      "grad_norm": 0.15629184246063232,
+      "learning_rate": 0.0001999998363080783,
+      "loss": 12.2955,
       "step": 335
     },
     {
       "epoch": 0.05501883084984444,
-      "grad_norm": 0.15758149325847626,
-      "learning_rate": 0.00019986191182397306,
-      "loss": 12.2839,
+      "grad_norm": 0.21205270290374756,
+      "learning_rate": 0.0001999998268206309,
+      "loss": 12.2933,
       "step": 336
     },
     {
       "epoch": 0.055182577370230884,
-      "grad_norm": 0.2598915100097656,
-      "learning_rate": 0.0001998610096052289,
-      "loss": 12.2593,
+      "grad_norm": 0.27476778626441956,
+      "learning_rate": 0.00019999981706593173,
+      "loss": 12.2768,
       "step": 337
     },
     {
       "epoch": 0.055346323890617326,
-      "grad_norm": 0.22992730140686035,
-      "learning_rate": 0.00019986010738648474,
-      "loss": 12.2799,
+      "grad_norm": 0.23682667315006256,
+      "learning_rate": 0.0001999998070439809,
+      "loss": 12.3049,
       "step": 338
     },
     {
       "epoch": 0.05551007041100377,
-      "grad_norm": 0.14556151628494263,
-      "learning_rate": 0.00019985919061582536,
-      "loss": 12.2907,
+      "grad_norm": 0.1238231286406517,
+      "learning_rate": 0.00019999979675477839,
+      "loss": 12.3007,
       "step": 339
     },
     {
       "epoch": 0.05567381693139021,
-      "grad_norm": 0.11856741458177567,
-      "learning_rate": 0.00019985827384516597,
-      "loss": 12.2717,
+      "grad_norm": 0.10456238687038422,
+      "learning_rate": 0.0001999997861983242,
+      "loss": 12.2874,
       "step": 340
     },
     {
       "epoch": 0.05583756345177665,
-      "grad_norm": 0.14395655691623688,
-      "learning_rate": 0.00019985735707450658,
-      "loss": 12.2652,
+      "grad_norm": 0.11022137105464935,
+      "learning_rate": 0.00019999977537461843,
+      "loss": 12.2897,
       "step": 341
     },
     {
       "epoch": 0.05600130997216309,
-      "grad_norm": 0.09158793836832047,
-      "learning_rate": 0.0001998564403038472,
-      "loss": 12.261,
+      "grad_norm": 0.11375322192907333,
+      "learning_rate": 0.00019999976428366104,
+      "loss": 12.2778,
       "step": 342
     },
     {
       "epoch": 0.056165056492549534,
-      "grad_norm": 0.16784970462322235,
-      "learning_rate": 0.0001998555235331878,
-      "loss": 12.2369,
+      "grad_norm": 0.23204772174358368,
+      "learning_rate": 0.00019999975292545212,
+      "loss": 12.2546,
       "step": 343
     },
     {
       "epoch": 0.056328803012935975,
-      "grad_norm": 0.24563707411289215,
-      "learning_rate": 0.0001998545922106132,
-      "loss": 12.2968,
+      "grad_norm": 0.2459680140018463,
+      "learning_rate": 0.00019999974129999165,
+      "loss": 12.3037,
       "step": 344
     },
     {
       "epoch": 0.05649254953332242,
-      "grad_norm": 0.18693608045578003,
-      "learning_rate": 0.0001998536754399538,
-      "loss": 12.2752,
+      "grad_norm": 0.15076689422130585,
+      "learning_rate": 0.0001999997294072797,
+      "loss": 12.2902,
       "step": 345
     },
     {
       "epoch": 0.05665629605370886,
-      "grad_norm": 0.16343215107917786,
-      "learning_rate": 0.0001998527441173792,
-      "loss": 12.2639,
+      "grad_norm": 0.18627764284610748,
+      "learning_rate": 0.00019999971724731625,
+      "loss": 12.2869,
       "step": 346
     },
     {
       "epoch": 0.0568200425740953,
-      "grad_norm": 0.17143461108207703,
-      "learning_rate": 0.00019985181279480457,
-      "loss": 12.2966,
+      "grad_norm": 0.17207014560699463,
+      "learning_rate": 0.0001999997048201014,
+      "loss": 12.3093,
       "step": 347
     },
     {
       "epoch": 0.05698378909448174,
-      "grad_norm": 0.20530003309249878,
-      "learning_rate": 0.00019985088147222996,
-      "loss": 12.263,
+      "grad_norm": 0.24697749316692352,
+      "learning_rate": 0.00019999969212563512,
+      "loss": 12.2833,
       "step": 348
     },
     {
       "epoch": 0.05714753561486818,
-      "grad_norm": 0.15935033559799194,
-      "learning_rate": 0.00019984993559774011,
-      "loss": 12.2844,
+      "grad_norm": 0.1407640129327774,
+      "learning_rate": 0.0001999996791639175,
+      "loss": 12.3039,
       "step": 349
     },
     {
       "epoch": 0.057311282135254625,
-      "grad_norm": 0.22338813543319702,
-      "learning_rate": 0.00019984898972325027,
-      "loss": 12.288,
+      "grad_norm": 0.15878401696681976,
+      "learning_rate": 0.0001999996659349485,
+      "loss": 12.3057,
       "step": 350
     },
     {
       "epoch": 0.05747502865564107,
-      "grad_norm": 0.1451302468776703,
-      "learning_rate": 0.00019984804384876043,
-      "loss": 12.273,
+      "grad_norm": 0.4590788781642914,
+      "learning_rate": 0.00019999965243872828,
+      "loss": 12.2928,
       "step": 351
     },
     {
       "epoch": 0.05763877517602751,
-      "grad_norm": 0.21671491861343384,
-      "learning_rate": 0.00019984709797427058,
-      "loss": 12.2899,
+      "grad_norm": 0.1928272247314453,
+      "learning_rate": 0.00019999963867525675,
+      "loss": 12.3032,
       "step": 352
     },
     {
       "epoch": 0.05780252169641395,
-      "grad_norm": 0.1976754367351532,
-      "learning_rate": 0.0001998461375478655,
-      "loss": 12.2657,
+      "grad_norm": 0.22454416751861572,
+      "learning_rate": 0.000199999624644534,
+      "loss": 12.2843,
       "step": 353
     },
     {
       "epoch": 0.05796626821680039,
-      "grad_norm": 0.12198355793952942,
-      "learning_rate": 0.00019984519167337567,
-      "loss": 12.2725,
+      "grad_norm": 0.12228063493967056,
+      "learning_rate": 0.00019999961034656004,
+      "loss": 12.2829,
       "step": 354
     },
     {
       "epoch": 0.05813001473718683,
-      "grad_norm": 0.16143521666526794,
-      "learning_rate": 0.0001998442312469706,
-      "loss": 12.2536,
+      "grad_norm": 0.17021706700325012,
+      "learning_rate": 0.00019999959578133499,
+      "loss": 12.2823,
       "step": 355
     },
     {
       "epoch": 0.058293761257573275,
-      "grad_norm": 0.19851113855838776,
-      "learning_rate": 0.00019984328537248075,
-      "loss": 12.2631,
+      "grad_norm": 0.13477741181850433,
+      "learning_rate": 0.0001999995809488588,
+      "loss": 12.2836,
       "step": 356
     },
     {
       "epoch": 0.058457507777959716,
-      "grad_norm": 0.2403157353401184,
-      "learning_rate": 0.00019984231039416045,
-      "loss": 12.2417,
+      "grad_norm": 0.2285226583480835,
+      "learning_rate": 0.00019999956584913154,
+      "loss": 12.2625,
       "step": 357
     },
     {
       "epoch": 0.05862125429834616,
-      "grad_norm": 0.14061123132705688,
-      "learning_rate": 0.00019984134996775538,
-      "loss": 12.2621,
+      "grad_norm": 0.1142636239528656,
+      "learning_rate": 0.00019999955048215324,
+      "loss": 12.2804,
       "step": 358
     },
     {
       "epoch": 0.0587850008187326,
-      "grad_norm": 0.4004269540309906,
-      "learning_rate": 0.0001998403895413503,
-      "loss": 12.2795,
+      "grad_norm": 0.37811943888664246,
+      "learning_rate": 0.00019999953484792395,
+      "loss": 12.2842,
       "step": 359
     },
     {
       "epoch": 0.05894874733911904,
-      "grad_norm": 0.14407266676425934,
-      "learning_rate": 0.00019983941456303,
-      "loss": 12.2585,
+      "grad_norm": 0.14085888862609863,
+      "learning_rate": 0.00019999951894644373,
+      "loss": 12.2805,
       "step": 360
     },
     {
       "epoch": 0.05911249385950548,
-      "grad_norm": 0.1705065369606018,
-      "learning_rate": 0.00019983842503279448,
-      "loss": 12.2578,
+      "grad_norm": 0.19372887909412384,
+      "learning_rate": 0.0001999995027777126,
+      "loss": 12.2667,
       "step": 361
     },
     {
       "epoch": 0.059276240379891924,
-      "grad_norm": 0.19344770908355713,
-      "learning_rate": 0.00019983745005447417,
-      "loss": 12.2567,
+      "grad_norm": 0.2030063271522522,
+      "learning_rate": 0.00019999948634173063,
+      "loss": 12.2733,
       "step": 362
     },
     {
       "epoch": 0.059439986900278366,
-      "grad_norm": 0.2022704929113388,
-      "learning_rate": 0.00019983647507615387,
-      "loss": 12.2933,
+      "grad_norm": 0.19858768582344055,
+      "learning_rate": 0.0001999994696384978,
+      "loss": 12.3034,
       "step": 363
     },
     {
       "epoch": 0.05960373342066481,
-      "grad_norm": 0.1704675704240799,
-      "learning_rate": 0.00019983550009783357,
-      "loss": 12.2529,
+      "grad_norm": 0.15349508821964264,
+      "learning_rate": 0.00019999945266801423,
+      "loss": 12.2819,
       "step": 364
     },
     {
       "epoch": 0.059767479941051256,
-      "grad_norm": 0.2408781498670578,
-      "learning_rate": 0.00019983451056759804,
-      "loss": 12.2733,
+      "grad_norm": 0.17492233216762543,
+      "learning_rate": 0.0001999994354302799,
+      "loss": 12.2909,
       "step": 365
     },
     {
       "epoch": 0.0599312264614377,
-      "grad_norm": 0.12872779369354248,
-      "learning_rate": 0.00019983352103736252,
-      "loss": 12.2573,
+      "grad_norm": 0.15136563777923584,
+      "learning_rate": 0.0001999994179252949,
+      "loss": 12.2796,
       "step": 366
     },
     {
       "epoch": 0.06009497298182414,
-      "grad_norm": 0.12638714909553528,
-      "learning_rate": 0.000199832531507127,
-      "loss": 12.2392,
+      "grad_norm": 0.17633739113807678,
+      "learning_rate": 0.00019999940015305928,
+      "loss": 12.2582,
       "step": 367
     },
     {
       "epoch": 0.06025871950221058,
-      "grad_norm": 0.19075384736061096,
-      "learning_rate": 0.00019983152742497623,
-      "loss": 12.2769,
+      "grad_norm": 0.2000981867313385,
+      "learning_rate": 0.0001999993821135731,
+      "loss": 12.2918,
       "step": 368
     },
     {
       "epoch": 0.06042246602259702,
-      "grad_norm": 0.3625982999801636,
-      "learning_rate": 0.0001998305378947407,
-      "loss": 12.3061,
+      "grad_norm": 0.3609507381916046,
+      "learning_rate": 0.00019999936380683632,
+      "loss": 12.3054,
       "step": 369
     },
     {
       "epoch": 0.060586212542983464,
-      "grad_norm": 0.1546419858932495,
-      "learning_rate": 0.00019982953381258994,
-      "loss": 12.256,
+      "grad_norm": 0.1492471545934677,
+      "learning_rate": 0.00019999934523284908,
+      "loss": 12.2646,
       "step": 370
     },
     {
       "epoch": 0.060749959063369906,
-      "grad_norm": 0.13882146775722504,
-      "learning_rate": 0.00019982852973043919,
-      "loss": 12.268,
+      "grad_norm": 0.14887075126171112,
+      "learning_rate": 0.0001999993263916114,
+      "loss": 12.2835,
       "step": 371
     },
     {
       "epoch": 0.06091370558375635,
-      "grad_norm": 0.12971815466880798,
-      "learning_rate": 0.00019982752564828843,
-      "loss": 12.276,
+      "grad_norm": 0.1308159977197647,
+      "learning_rate": 0.00019999930728312334,
+      "loss": 12.287,
       "step": 372
     },
     {
       "epoch": 0.06107745210414279,
-      "grad_norm": 0.15311676263809204,
-      "learning_rate": 0.00019982650701422244,
-      "loss": 12.2588,
+      "grad_norm": 0.14279422163963318,
+      "learning_rate": 0.00019999928790738494,
+      "loss": 12.2745,
       "step": 373
     },
     {
       "epoch": 0.06124119862452923,
-      "grad_norm": 0.18194103240966797,
-      "learning_rate": 0.00019982550293207169,
-      "loss": 12.281,
+      "grad_norm": 0.13700923323631287,
+      "learning_rate": 0.00019999926826439623,
+      "loss": 12.3028,
       "step": 374
     },
     {
       "epoch": 0.06140494514491567,
-      "grad_norm": 0.24688465893268585,
-      "learning_rate": 0.0001998244842980057,
-      "loss": 12.2534,
+      "grad_norm": 0.17791202664375305,
+      "learning_rate": 0.0001999992483541573,
+      "loss": 12.2664,
       "step": 375
     },
     {
       "epoch": 0.061568691665302114,
-      "grad_norm": 0.17040275037288666,
-      "learning_rate": 0.00019982346566393971,
-      "loss": 12.2575,
+      "grad_norm": 0.28744640946388245,
+      "learning_rate": 0.00019999922817666817,
+      "loss": 12.2859,
       "step": 376
     },
     {
       "epoch": 0.061732438185688555,
-      "grad_norm": 0.16558203101158142,
-      "learning_rate": 0.00019982244702987373,
-      "loss": 12.2771,
+      "grad_norm": 0.1286764144897461,
+      "learning_rate": 0.00019999920773192896,
+      "loss": 12.2844,
       "step": 377
     },
     {
       "epoch": 0.061896184706075,
-      "grad_norm": 0.1640801578760147,
-      "learning_rate": 0.00019982141384389251,
-      "loss": 12.2924,
+      "grad_norm": 0.16422635316848755,
+      "learning_rate": 0.00019999918701993965,
+      "loss": 12.2979,
       "step": 378
     },
     {
       "epoch": 0.06205993122646144,
-      "grad_norm": 0.1380811631679535,
-      "learning_rate": 0.00019982039520982653,
-      "loss": 12.2528,
+      "grad_norm": 0.18279294669628143,
+      "learning_rate": 0.00019999916604070033,
+      "loss": 12.2773,
       "step": 379
     },
     {
       "epoch": 0.06222367774684788,
-      "grad_norm": 0.14711108803749084,
-      "learning_rate": 0.00019981936202384531,
-      "loss": 12.2581,
+      "grad_norm": 0.11040064692497253,
+      "learning_rate": 0.000199999144794211,
+      "loss": 12.2776,
       "step": 380
     },
     {
       "epoch": 0.06238742426723432,
-      "grad_norm": 0.10983294993638992,
-      "learning_rate": 0.0001998183288378641,
-      "loss": 12.2904,
+      "grad_norm": 0.11583856493234634,
+      "learning_rate": 0.00019999912328047184,
+      "loss": 12.2984,
       "step": 381
     },
     {
       "epoch": 0.06255117078762076,
-      "grad_norm": 0.1386169046163559,
-      "learning_rate": 0.0001998172956518829,
-      "loss": 12.2774,
+      "grad_norm": 0.15946663916110992,
+      "learning_rate": 0.0001999991014994828,
+      "loss": 12.2942,
       "step": 382
     },
     {
       "epoch": 0.0627149173080072,
-      "grad_norm": 0.22444948554039001,
-      "learning_rate": 0.00019981626246590167,
-      "loss": 12.2621,
+      "grad_norm": 0.1915806531906128,
+      "learning_rate": 0.000199999079451244,
+      "loss": 12.2742,
       "step": 383
     },
     {
       "epoch": 0.06287866382839365,
-      "grad_norm": 0.20805659890174866,
-      "learning_rate": 0.00019981521472800523,
-      "loss": 12.2589,
+      "grad_norm": 0.19050228595733643,
+      "learning_rate": 0.00019999905713575543,
+      "loss": 12.2752,
       "step": 384
     },
     {
       "epoch": 0.0630424103487801,
-      "grad_norm": 0.13287568092346191,
-      "learning_rate": 0.00019981415243819356,
-      "loss": 12.2598,
+      "grad_norm": 0.1742391288280487,
+      "learning_rate": 0.00019999903455301718,
+      "loss": 12.2773,
       "step": 385
     },
     {
       "epoch": 0.06320615686916653,
-      "grad_norm": 0.11777439713478088,
-      "learning_rate": 0.00019981310470029712,
-      "loss": 12.2512,
+      "grad_norm": 0.1563049852848053,
+      "learning_rate": 0.00019999901170302935,
+      "loss": 12.2585,
       "step": 386
     },
     {
       "epoch": 0.06336990338955298,
-      "grad_norm": 0.15062777698040009,
-      "learning_rate": 0.00019981205696240067,
-      "loss": 12.2682,
+      "grad_norm": 0.1489725112915039,
+      "learning_rate": 0.00019999898858579195,
+      "loss": 12.2925,
       "step": 387
     },
     {
       "epoch": 0.06353364990993941,
-      "grad_norm": 0.2239951193332672,
-      "learning_rate": 0.00019981100922450423,
-      "loss": 12.2345,
+      "grad_norm": 0.17745822668075562,
+      "learning_rate": 0.0001999989652013051,
+      "loss": 12.2546,
       "step": 388
     },
     {
       "epoch": 0.06369739643032586,
-      "grad_norm": 0.3479856252670288,
-      "learning_rate": 0.00019980994693469256,
-      "loss": 12.2865,
+      "grad_norm": 0.29051586985588074,
+      "learning_rate": 0.00019999894154956879,
+      "loss": 12.2911,
       "step": 389
     },
     {
       "epoch": 0.0638611429507123,
-      "grad_norm": 0.16244293749332428,
-      "learning_rate": 0.0001998088846448809,
-      "loss": 12.2478,
+      "grad_norm": 0.12695226073265076,
+      "learning_rate": 0.00019999891763058312,
+      "loss": 12.2646,
       "step": 390
     },
     {
       "epoch": 0.06402488947109874,
-      "grad_norm": 0.14563092589378357,
-      "learning_rate": 0.00019980782235506922,
-      "loss": 12.2693,
+      "grad_norm": 0.153972327709198,
+      "learning_rate": 0.00019999889344434819,
+      "loss": 12.2832,
       "step": 391
     },
     {
       "epoch": 0.06418863599148518,
-      "grad_norm": 0.11829731613397598,
-      "learning_rate": 0.00019980677461717278,
-      "loss": 12.2523,
+      "grad_norm": 0.13381552696228027,
+      "learning_rate": 0.00019999886899086397,
+      "loss": 12.261,
       "step": 392
     },
     {
       "epoch": 0.06435238251187163,
-      "grad_norm": 0.18642814457416534,
-      "learning_rate": 0.00019980569777544588,
-      "loss": 12.2565,
+      "grad_norm": 0.19647999107837677,
+      "learning_rate": 0.00019999884427013062,
+      "loss": 12.2679,
       "step": 393
     },
     {
       "epoch": 0.06451612903225806,
-      "grad_norm": 0.1708955317735672,
-      "learning_rate": 0.00019980462093371898,
-      "loss": 12.2734,
+      "grad_norm": 0.1712377965450287,
+      "learning_rate": 0.00019999881928214818,
+      "loss": 12.2874,
       "step": 394
     },
     {
       "epoch": 0.06467987555264451,
-      "grad_norm": 0.1829138845205307,
-      "learning_rate": 0.00019980354409199208,
-      "loss": 12.2634,
+      "grad_norm": 0.1798669546842575,
+      "learning_rate": 0.00019999879402691668,
+      "loss": 12.2939,
       "step": 395
     },
     {
       "epoch": 0.06484362207303095,
-      "grad_norm": 0.22328996658325195,
-      "learning_rate": 0.0001998024818021804,
-      "loss": 12.2646,
+      "grad_norm": 0.2037380337715149,
+      "learning_rate": 0.00019999876850443623,
+      "loss": 12.2857,
       "step": 396
     },
     {
       "epoch": 0.0650073685934174,
-      "grad_norm": 0.30070918798446655,
-      "learning_rate": 0.00019980139040853828,
-      "loss": 12.2608,
+      "grad_norm": 0.24441306293010712,
+      "learning_rate": 0.00019999874271470685,
+      "loss": 12.2994,
       "step": 397
     },
     {
       "epoch": 0.06517111511380383,
-      "grad_norm": 0.21575552225112915,
-      "learning_rate": 0.00019980031356681138,
-      "loss": 12.2592,
+      "grad_norm": 0.16837072372436523,
+      "learning_rate": 0.00019999871665772866,
+      "loss": 12.2756,
       "step": 398
     },
     {
       "epoch": 0.06533486163419028,
-      "grad_norm": 0.1523241251707077,
-      "learning_rate": 0.00019979922217316926,
-      "loss": 12.2764,
+      "grad_norm": 0.132755309343338,
+      "learning_rate": 0.00019999869033350174,
+      "loss": 12.2839,
       "step": 399
     },
     {
       "epoch": 0.06549860815457671,
-      "grad_norm": 0.13779282569885254,
-      "learning_rate": 0.00019979813077952713,
-      "loss": 12.282,
-      "step": 400
-    },
-    {
-      "epoch": 0.06549860815457671,
-      "eval_loss": 12.26255989074707,
-      "eval_runtime": 7.4384,
-      "eval_samples_per_second": 33.072,
-      "eval_steps_per_second": 16.536,
+      "grad_norm": 0.16452300548553467,
+      "learning_rate": 0.00019999866374202608,
+      "loss": 12.2943,
       "step": 400
     },
     {
       "epoch": 0.06566235467496316,
-      "grad_norm": 0.29484638571739197,
-      "learning_rate": 0.000199797039385885,
-      "loss": 12.2572,
+      "grad_norm": 0.2534669041633606,
+      "learning_rate": 0.00019999863688330183,
+      "loss": 12.2687,
       "step": 401
     },
     {
       "epoch": 0.0658261011953496,
-      "grad_norm": 0.19167940318584442,
-      "learning_rate": 0.00019979594799224287,
-      "loss": 12.2659,
+      "grad_norm": 0.19958250224590302,
+      "learning_rate": 0.00019999860975732903,
+      "loss": 12.2845,
       "step": 402
     },
     {
       "epoch": 0.06598984771573604,
-      "grad_norm": 0.20377042889595032,
-      "learning_rate": 0.00019979484204668552,
-      "loss": 12.2456,
+      "grad_norm": 0.13449449837207794,
+      "learning_rate": 0.00019999858236410776,
+      "loss": 12.2794,
       "step": 403
     },
     {
       "epoch": 0.06615359423612248,
-      "grad_norm": 0.1689855009317398,
-      "learning_rate": 0.0001997937506530434,
-      "loss": 12.2753,
+      "grad_norm": 0.17238451540470123,
+      "learning_rate": 0.0001999985547036381,
+      "loss": 12.3065,
       "step": 404
     },
     {
       "epoch": 0.06631734075650893,
-      "grad_norm": 0.22132129967212677,
-      "learning_rate": 0.00019979264470748603,
-      "loss": 12.2987,
+      "grad_norm": 0.15234483778476715,
+      "learning_rate": 0.0001999985267759201,
+      "loss": 12.3007,
       "step": 405
     },
     {
       "epoch": 0.06648108727689536,
-      "grad_norm": 0.15825076401233673,
-      "learning_rate": 0.00019979153876192868,
-      "loss": 12.256,
+      "grad_norm": 0.17298907041549683,
+      "learning_rate": 0.00019999849858095382,
+      "loss": 12.2824,
       "step": 406
     },
     {
       "epoch": 0.06664483379728181,
-      "grad_norm": 0.19967804849147797,
-      "learning_rate": 0.0001997904182644561,
-      "loss": 12.2423,
+      "grad_norm": 0.1439378559589386,
+      "learning_rate": 0.00019999847011873938,
+      "loss": 12.2567,
       "step": 407
     },
     {
       "epoch": 0.06680858031766825,
-      "grad_norm": 0.16729998588562012,
-      "learning_rate": 0.00019978931231889874,
-      "loss": 12.2698,
+      "grad_norm": 0.19181743264198303,
+      "learning_rate": 0.00019999844138927683,
+      "loss": 12.2789,
       "step": 408
     },
     {
       "epoch": 0.0669723268380547,
-      "grad_norm": 0.2205231636762619,
-      "learning_rate": 0.00019978819182142615,
-      "loss": 12.2751,
+      "grad_norm": 0.2083439975976944,
+      "learning_rate": 0.00019999841239256628,
+      "loss": 12.2807,
       "step": 409
     },
     {
       "epoch": 0.06713607335844113,
-      "grad_norm": 0.30310294032096863,
-      "learning_rate": 0.00019978707132395357,
-      "loss": 12.2969,
+      "grad_norm": 0.3232664167881012,
+      "learning_rate": 0.00019999838312860776,
+      "loss": 12.3084,
       "step": 410
     },
     {
       "epoch": 0.06729981987882758,
-      "grad_norm": 0.1694662868976593,
-      "learning_rate": 0.00019978595082648098,
-      "loss": 12.2553,
+      "grad_norm": 0.14184612035751343,
+      "learning_rate": 0.0001999983535974014,
+      "loss": 12.272,
       "step": 411
     },
     {
       "epoch": 0.06746356639921401,
-      "grad_norm": 0.21690568327903748,
-      "learning_rate": 0.0001997848303290084,
-      "loss": 12.2774,
+      "grad_norm": 0.30172014236450195,
+      "learning_rate": 0.00019999832379894721,
+      "loss": 12.2954,
       "step": 412
     },
     {
       "epoch": 0.06762731291960046,
-      "grad_norm": 0.12437091767787933,
-      "learning_rate": 0.0001997836952796206,
-      "loss": 12.254,
+      "grad_norm": 0.16100570559501648,
+      "learning_rate": 0.00019999829373324533,
+      "loss": 12.2742,
       "step": 413
     },
     {
       "epoch": 0.0677910594399869,
-      "grad_norm": 0.21863418817520142,
-      "learning_rate": 0.000199782574782148,
-      "loss": 12.3004,
+      "grad_norm": 0.15702582895755768,
+      "learning_rate": 0.00019999826340029583,
+      "loss": 12.3039,
       "step": 414
     },
     {
       "epoch": 0.06795480596037334,
-      "grad_norm": 0.14826977252960205,
-      "learning_rate": 0.0001997814397327602,
-      "loss": 12.2813,
+      "grad_norm": 0.1590198427438736,
+      "learning_rate": 0.00019999823280009878,
+      "loss": 12.2853,
       "step": 415
     },
     {
       "epoch": 0.06811855248075978,
-      "grad_norm": 0.2234913855791092,
-      "learning_rate": 0.00019978030468337238,
-      "loss": 12.277,
+      "grad_norm": 0.1716587245464325,
+      "learning_rate": 0.00019999820193265425,
+      "loss": 12.2875,
       "step": 416
     },
     {
       "epoch": 0.06828229900114623,
-      "grad_norm": 0.3348887264728546,
-      "learning_rate": 0.00019977915508206934,
-      "loss": 12.2771,
+      "grad_norm": 0.27681276202201843,
+      "learning_rate": 0.00019999817079796234,
+      "loss": 12.2874,
       "step": 417
     },
     {
       "epoch": 0.06844604552153266,
-      "grad_norm": 0.14655959606170654,
-      "learning_rate": 0.00019977802003268152,
-      "loss": 12.2347,
+      "grad_norm": 0.13572442531585693,
+      "learning_rate": 0.00019999813939602313,
+      "loss": 12.2638,
       "step": 418
     },
     {
       "epoch": 0.06860979204191911,
-      "grad_norm": 0.12391780316829681,
-      "learning_rate": 0.0001997768849832937,
-      "loss": 12.2504,
+      "grad_norm": 0.1282089799642563,
+      "learning_rate": 0.0001999981077268367,
+      "loss": 12.2649,
       "step": 419
     },
     {
       "epoch": 0.06877353856230554,
-      "grad_norm": 0.23915357887744904,
-      "learning_rate": 0.00019977572083007544,
-      "loss": 12.2661,
+      "grad_norm": 0.17424018681049347,
+      "learning_rate": 0.00019999807579040315,
+      "loss": 12.2719,
       "step": 420
     },
     {
       "epoch": 0.068937285082692,
-      "grad_norm": 0.10997791588306427,
-      "learning_rate": 0.00019977458578068763,
-      "loss": 12.2542,
+      "grad_norm": 0.13602033257484436,
+      "learning_rate": 0.00019999804358672253,
+      "loss": 12.2614,
       "step": 421
     },
     {
       "epoch": 0.06910103160307843,
-      "grad_norm": 0.1418733149766922,
-      "learning_rate": 0.00019977342162746936,
-      "loss": 12.2543,
+      "grad_norm": 0.14215390384197235,
+      "learning_rate": 0.00019999801111579498,
+      "loss": 12.2676,
       "step": 422
     },
     {
       "epoch": 0.06926477812346488,
-      "grad_norm": 0.3322834372520447,
-      "learning_rate": 0.00019977227202616632,
-      "loss": 12.3091,
+      "grad_norm": 0.3541161119937897,
+      "learning_rate": 0.00019999797837762053,
+      "loss": 12.3134,
       "step": 423
     },
     {
       "epoch": 0.06942852464385132,
-      "grad_norm": 0.20287387073040009,
-      "learning_rate": 0.00019977110787294805,
-      "loss": 12.2878,
+      "grad_norm": 0.2593739628791809,
+      "learning_rate": 0.00019999794537219932,
+      "loss": 12.2915,
       "step": 424
     },
     {
       "epoch": 0.06959227116423776,
-      "grad_norm": 0.19083108007907867,
-      "learning_rate": 0.00019976994371972978,
-      "loss": 12.24,
+      "grad_norm": 0.20327959954738617,
+      "learning_rate": 0.00019999791209953136,
+      "loss": 12.276,
       "step": 425
     },
     {
       "epoch": 0.06975601768462421,
-      "grad_norm": 0.24128662049770355,
-      "learning_rate": 0.0001997687795665115,
-      "loss": 12.2481,
+      "grad_norm": 0.18611687421798706,
+      "learning_rate": 0.00019999787855961684,
+      "loss": 12.265,
       "step": 426
     },
     {
       "epoch": 0.06991976420501064,
-      "grad_norm": 0.12994429469108582,
-      "learning_rate": 0.00019976761541329324,
-      "loss": 12.2503,
+      "grad_norm": 0.13483114540576935,
+      "learning_rate": 0.00019999784475245577,
+      "loss": 12.2595,
       "step": 427
     },
     {
       "epoch": 0.07008351072539709,
-      "grad_norm": 0.2344224601984024,
-      "learning_rate": 0.00019976643670815974,
-      "loss": 12.2626,
+      "grad_norm": 0.3187796473503113,
+      "learning_rate": 0.00019999781067804828,
+      "loss": 12.2764,
       "step": 428
     },
     {
       "epoch": 0.07024725724578353,
-      "grad_norm": 0.21825483441352844,
-      "learning_rate": 0.00019976525800302625,
-      "loss": 12.2574,
+      "grad_norm": 0.3139594793319702,
+      "learning_rate": 0.00019999777633639444,
+      "loss": 12.2634,
       "step": 429
     },
     {
       "epoch": 0.07041100376616997,
-      "grad_norm": 0.2866182327270508,
-      "learning_rate": 0.00019976409384980798,
-      "loss": 12.2815,
+      "grad_norm": 0.1972879022359848,
+      "learning_rate": 0.00019999774172749438,
+      "loss": 12.2803,
       "step": 430
     },
     {
       "epoch": 0.07057475028655641,
-      "grad_norm": 0.15483808517456055,
-      "learning_rate": 0.00019976291514467448,
-      "loss": 12.2937,
+      "grad_norm": 0.1937691867351532,
+      "learning_rate": 0.00019999770685134817,
+      "loss": 12.3016,
       "step": 431
     },
     {
       "epoch": 0.07073849680694286,
-      "grad_norm": 0.2085985690355301,
-      "learning_rate": 0.00019976172188762575,
-      "loss": 12.2691,
+      "grad_norm": 0.195434108376503,
+      "learning_rate": 0.00019999767170795588,
+      "loss": 12.2838,
       "step": 432
     },
     {
       "epoch": 0.07090224332732929,
-      "grad_norm": 0.1951405107975006,
-      "learning_rate": 0.00019976054318249226,
-      "loss": 12.2768,
+      "grad_norm": 0.19516132771968842,
+      "learning_rate": 0.00019999763629731761,
+      "loss": 12.2821,
       "step": 433
     },
     {
       "epoch": 0.07106598984771574,
-      "grad_norm": 0.14644214510917664,
-      "learning_rate": 0.00019975934992544353,
-      "loss": 12.267,
+      "grad_norm": 0.2725695073604584,
+      "learning_rate": 0.00019999760061943346,
+      "loss": 12.2809,
       "step": 434
     },
     {
       "epoch": 0.07122973636810218,
-      "grad_norm": 0.19653159379959106,
-      "learning_rate": 0.0001997581566683948,
-      "loss": 12.2755,
+      "grad_norm": 0.27861475944519043,
+      "learning_rate": 0.00019999756467430356,
+      "loss": 12.2857,
       "step": 435
     },
     {
       "epoch": 0.07139348288848862,
-      "grad_norm": 0.2027158886194229,
-      "learning_rate": 0.00019975696341134608,
-      "loss": 12.2935,
+      "grad_norm": 0.19172325730323792,
+      "learning_rate": 0.00019999752846192796,
+      "loss": 12.2979,
       "step": 436
     },
     {
       "epoch": 0.07155722940887506,
-      "grad_norm": 0.14168038964271545,
-      "learning_rate": 0.00019975575560238212,
-      "loss": 12.2662,
+      "grad_norm": 0.21028302609920502,
+      "learning_rate": 0.0001999974919823068,
+      "loss": 12.2816,
       "step": 437
     },
     {
       "epoch": 0.07172097592926151,
-      "grad_norm": 0.21052201092243195,
-      "learning_rate": 0.00019975454779341817,
-      "loss": 12.2358,
+      "grad_norm": 0.2027662843465805,
+      "learning_rate": 0.00019999745523544013,
+      "loss": 12.2401,
       "step": 438
     },
     {
       "epoch": 0.07188472244964794,
-      "grad_norm": 0.19395172595977783,
-      "learning_rate": 0.00019975335453636944,
-      "loss": 12.26,
+      "grad_norm": 0.20287398993968964,
+      "learning_rate": 0.00019999741822132808,
+      "loss": 12.2591,
       "step": 439
     },
     {
       "epoch": 0.07204846897003439,
-      "grad_norm": 0.19278693199157715,
-      "learning_rate": 0.0001997521467274055,
-      "loss": 12.258,
+      "grad_norm": 0.22502224147319794,
+      "learning_rate": 0.00019999738093997075,
+      "loss": 12.2593,
       "step": 440
     },
     {
       "epoch": 0.07221221549042083,
-      "grad_norm": 0.22367942333221436,
-      "learning_rate": 0.00019975093891844153,
-      "loss": 12.2787,
+      "grad_norm": 0.23881785571575165,
+      "learning_rate": 0.00019999734339136823,
+      "loss": 12.2821,
       "step": 441
     },
     {
       "epoch": 0.07237596201080727,
-      "grad_norm": 0.22102567553520203,
-      "learning_rate": 0.00019974973110947758,
-      "loss": 12.2644,
+      "grad_norm": 0.1868753433227539,
+      "learning_rate": 0.00019999730557552058,
+      "loss": 12.2635,
       "step": 442
     },
     {
       "epoch": 0.07253970853119371,
-      "grad_norm": 0.16479933261871338,
-      "learning_rate": 0.0001997485087485984,
-      "loss": 12.2687,
+      "grad_norm": 0.19476260244846344,
+      "learning_rate": 0.00019999726749242796,
+      "loss": 12.2884,
       "step": 443
     },
     {
       "epoch": 0.07270345505158016,
-      "grad_norm": 0.1651541292667389,
-      "learning_rate": 0.00019974730093963444,
-      "loss": 12.2566,
+      "grad_norm": 0.15796904265880585,
+      "learning_rate": 0.00019999722914209046,
+      "loss": 12.2677,
       "step": 444
     },
     {
       "epoch": 0.07286720157196659,
-      "grad_norm": 0.12695051729679108,
-      "learning_rate": 0.00019974606402684003,
-      "loss": 12.2622,
+      "grad_norm": 0.3154045045375824,
+      "learning_rate": 0.00019999719052450821,
+      "loss": 12.2752,
       "step": 445
     },
     {
       "epoch": 0.07303094809235304,
-      "grad_norm": 0.1750200092792511,
-      "learning_rate": 0.00019974484166596085,
-      "loss": 12.2779,
+      "grad_norm": 0.25553789734840393,
+      "learning_rate": 0.00019999715163968122,
+      "loss": 12.2866,
       "step": 446
     },
     {
       "epoch": 0.07319469461273947,
-      "grad_norm": 0.1328081488609314,
-      "learning_rate": 0.00019974360475316644,
-      "loss": 12.2475,
+      "grad_norm": 0.28748029470443726,
+      "learning_rate": 0.00019999711248760968,
+      "loss": 12.25,
       "step": 447
     },
     {
       "epoch": 0.07335844113312592,
-      "grad_norm": 0.16384023427963257,
-      "learning_rate": 0.00019974238239228725,
-      "loss": 12.3052,
+      "grad_norm": 0.18394634127616882,
+      "learning_rate": 0.00019999707306829367,
+      "loss": 12.3081,
       "step": 448
     },
     {
       "epoch": 0.07352218765351236,
-      "grad_norm": 0.26911428570747375,
-      "learning_rate": 0.00019974116003140807,
-      "loss": 12.2702,
+      "grad_norm": 0.19124747812747955,
+      "learning_rate": 0.00019999703338173327,
+      "loss": 12.2823,
       "step": 449
     },
     {
       "epoch": 0.0736859341738988,
-      "grad_norm": 0.16559433937072754,
-      "learning_rate": 0.00019973992311861366,
-      "loss": 12.2486,
+      "grad_norm": 0.19708351790905,
+      "learning_rate": 0.00019999699342792862,
+      "loss": 12.2666,
       "step": 450
     },
     {
       "epoch": 0.07384968069428524,
-      "grad_norm": 0.13143081963062286,
-      "learning_rate": 0.00019973867165390402,
-      "loss": 12.2532,
+      "grad_norm": 0.12283062934875488,
+      "learning_rate": 0.00019999695320687983,
+      "loss": 12.2661,
       "step": 451
     },
     {
       "epoch": 0.07401342721467169,
-      "grad_norm": 0.12418577075004578,
-      "learning_rate": 0.0001997374347411096,
-      "loss": 12.2817,
+      "grad_norm": 0.13629688322544098,
+      "learning_rate": 0.00019999691271858702,
+      "loss": 12.2952,
       "step": 452
     },
     {
       "epoch": 0.07417717373505812,
-      "grad_norm": 0.17400050163269043,
-      "learning_rate": 0.00019973618327639997,
-      "loss": 12.2573,
+      "grad_norm": 0.176702082157135,
+      "learning_rate": 0.00019999687196305019,
+      "loss": 12.266,
       "step": 453
     },
     {
       "epoch": 0.07434092025544457,
-      "grad_norm": 0.21183834969997406,
-      "learning_rate": 0.00019973494636360556,
-      "loss": 12.2938,
+      "grad_norm": 0.21978086233139038,
+      "learning_rate": 0.0001999968309402696,
+      "loss": 12.2909,
       "step": 454
     },
     {
       "epoch": 0.07450466677583101,
-      "grad_norm": 0.14846572279930115,
-      "learning_rate": 0.00019973369489889592,
-      "loss": 12.2649,
+      "grad_norm": 0.18183192610740662,
+      "learning_rate": 0.00019999678965024526,
+      "loss": 12.2789,
       "step": 455
     },
     {
       "epoch": 0.07466841329621746,
-      "grad_norm": 0.14805927872657776,
-      "learning_rate": 0.00019973242888227105,
-      "loss": 12.2494,
+      "grad_norm": 0.2290555238723755,
+      "learning_rate": 0.0001999967480929773,
+      "loss": 12.257,
       "step": 456
     },
     {
       "epoch": 0.07483215981660389,
-      "grad_norm": 0.174244225025177,
-      "learning_rate": 0.0001997311774175614,
-      "loss": 12.249,
+      "grad_norm": 0.22575747966766357,
+      "learning_rate": 0.00019999670626846589,
+      "loss": 12.2811,
       "step": 457
     },
     {
       "epoch": 0.07499590633699034,
-      "grad_norm": 0.21895098686218262,
-      "learning_rate": 0.00019972991140093654,
-      "loss": 12.2541,
+      "grad_norm": 0.2953977882862091,
+      "learning_rate": 0.00019999666417671103,
+      "loss": 12.2646,
       "step": 458
     },
     {
       "epoch": 0.07515965285737677,
-      "grad_norm": 0.15302148461341858,
-      "learning_rate": 0.00019972864538431168,
-      "loss": 12.2487,
+      "grad_norm": 0.18846358358860016,
+      "learning_rate": 0.0001999966218177129,
+      "loss": 12.2522,
       "step": 459
     },
     {
       "epoch": 0.07532339937776322,
-      "grad_norm": 0.16815854609012604,
-      "learning_rate": 0.00019972739391960204,
-      "loss": 12.2604,
+      "grad_norm": 0.3193408250808716,
+      "learning_rate": 0.00019999657919147167,
+      "loss": 12.2645,
       "step": 460
     },
     {
       "epoch": 0.07548714589814967,
-      "grad_norm": 0.16916708648204803,
-      "learning_rate": 0.00019972612790297717,
-      "loss": 12.2466,
+      "grad_norm": 0.27576708793640137,
+      "learning_rate": 0.00019999653629798733,
+      "loss": 12.2474,
       "step": 461
     },
     {
       "epoch": 0.0756508924185361,
-      "grad_norm": 0.22577545046806335,
-      "learning_rate": 0.00019972484733443707,
-      "loss": 12.2672,
+      "grad_norm": 0.19615419209003448,
+      "learning_rate": 0.00019999649313726009,
+      "loss": 12.2696,
       "step": 462
     },
     {
       "epoch": 0.07581463893892255,
-      "grad_norm": 0.3655492961406708,
-      "learning_rate": 0.00019972356676589698,
-      "loss": 12.264,
+      "grad_norm": 0.1791532337665558,
+      "learning_rate": 0.00019999644970929002,
+      "loss": 12.2756,
       "step": 463
     },
     {
       "epoch": 0.07597838545930899,
-      "grad_norm": 0.19828443229198456,
-      "learning_rate": 0.0001997223007492721,
-      "loss": 12.2637,
+      "grad_norm": 0.3186524510383606,
+      "learning_rate": 0.00019999640601407723,
+      "loss": 12.2628,
       "step": 464
     },
     {
       "epoch": 0.07614213197969544,
-      "grad_norm": 0.17368406057357788,
-      "learning_rate": 0.000199721020180732,
+      "grad_norm": 0.19734470546245575,
+      "learning_rate": 0.0001999963620516219,
       "loss": 12.2741,
       "step": 465
     },
     {
       "epoch": 0.07630587850008187,
-      "grad_norm": 0.18248553574085236,
-      "learning_rate": 0.00019971973961219192,
-      "loss": 12.2579,
+      "grad_norm": 0.28205740451812744,
+      "learning_rate": 0.00019999631782192405,
+      "loss": 12.2534,
       "step": 466
     },
     {
       "epoch": 0.07646962502046832,
-      "grad_norm": 0.2927585542201996,
-      "learning_rate": 0.0001997184444917366,
-      "loss": 12.2693,
+      "grad_norm": 0.2390183061361313,
+      "learning_rate": 0.0001999962733249839,
+      "loss": 12.2746,
       "step": 467
     },
     {
       "epoch": 0.07663337154085476,
-      "grad_norm": 0.2267952412366867,
-      "learning_rate": 0.0001997171639231965,
-      "loss": 12.2665,
+      "grad_norm": 0.19656427204608917,
+      "learning_rate": 0.0001999962285608015,
+      "loss": 12.2728,
       "step": 468
     },
     {
       "epoch": 0.0767971180612412,
-      "grad_norm": 0.18384003639221191,
-      "learning_rate": 0.00019971585425082594,
-      "loss": 12.2695,
+      "grad_norm": 0.14260216057300568,
+      "learning_rate": 0.00019999618352937697,
+      "loss": 12.2769,
       "step": 469
     },
     {
       "epoch": 0.07696086458162764,
-      "grad_norm": 0.19173267483711243,
-      "learning_rate": 0.00019971457368228585,
-      "loss": 12.2794,
+      "grad_norm": 0.1668563187122345,
+      "learning_rate": 0.0001999961382307105,
+      "loss": 12.2899,
       "step": 470
     },
     {
       "epoch": 0.07712461110201409,
-      "grad_norm": 0.16286374628543854,
-      "learning_rate": 0.00019971327856183052,
-      "loss": 12.2504,
+      "grad_norm": 0.186238631606102,
+      "learning_rate": 0.0001999960926648021,
+      "loss": 12.2611,
       "step": 471
     },
     {
       "epoch": 0.07728835762240052,
-      "grad_norm": 0.15662431716918945,
-      "learning_rate": 0.00019971196888945997,
-      "loss": 12.28,
+      "grad_norm": 0.16535009443759918,
+      "learning_rate": 0.00019999604683165197,
+      "loss": 12.2853,
       "step": 472
     },
     {
       "epoch": 0.07745210414278697,
-      "grad_norm": 0.26529473066329956,
-      "learning_rate": 0.00019971065921708941,
-      "loss": 12.3013,
+      "grad_norm": 0.23412054777145386,
+      "learning_rate": 0.00019999600073126024,
+      "loss": 12.3178,
       "step": 473
     },
     {
       "epoch": 0.0776158506631734,
-      "grad_norm": 0.2134474515914917,
-      "learning_rate": 0.0001997093640966341,
-      "loss": 12.2591,
+      "grad_norm": 0.21077093482017517,
+      "learning_rate": 0.00019999595436362698,
+      "loss": 12.2663,
       "step": 474
     },
     {
       "epoch": 0.07777959718355985,
-      "grad_norm": 0.16773010790348053,
-      "learning_rate": 0.00019970805442426354,
-      "loss": 12.2776,
+      "grad_norm": 0.1960388571023941,
+      "learning_rate": 0.00019999590772875236,
+      "loss": 12.2773,
       "step": 475
     },
     {
       "epoch": 0.07794334370394629,
-      "grad_norm": 0.1953994333744049,
-      "learning_rate": 0.00019970674475189298,
-      "loss": 12.2714,
+      "grad_norm": 0.16811984777450562,
+      "learning_rate": 0.0001999958608266365,
+      "loss": 12.2815,
       "step": 476
     },
     {
       "epoch": 0.07810709022433274,
-      "grad_norm": 0.23002862930297852,
-      "learning_rate": 0.0001997054205276072,
-      "loss": 12.289,
+      "grad_norm": 0.30715492367744446,
+      "learning_rate": 0.00019999581365727948,
+      "loss": 12.2967,
       "step": 477
     },
     {
       "epoch": 0.07827083674471917,
-      "grad_norm": 0.17535601556301117,
-      "learning_rate": 0.00019970411085523665,
-      "loss": 12.2659,
+      "grad_norm": 0.17825697362422943,
+      "learning_rate": 0.00019999576622068148,
+      "loss": 12.2716,
       "step": 478
     },
     {
       "epoch": 0.07843458326510562,
-      "grad_norm": 0.16866976022720337,
-      "learning_rate": 0.00019970278663095087,
-      "loss": 12.2542,
+      "grad_norm": 0.18646034598350525,
+      "learning_rate": 0.0001999957185168426,
+      "loss": 12.2571,
       "step": 479
     },
     {
       "epoch": 0.07859832978549205,
-      "grad_norm": 0.2173595428466797,
-      "learning_rate": 0.0001997014624066651,
-      "loss": 12.2807,
+      "grad_norm": 0.2670734226703644,
+      "learning_rate": 0.00019999567054576297,
+      "loss": 12.2827,
       "step": 480
     },
     {
       "epoch": 0.0787620763058785,
-      "grad_norm": 0.1581399142742157,
-      "learning_rate": 0.00019970012363046408,
-      "loss": 12.25,
+      "grad_norm": 0.17917779088020325,
+      "learning_rate": 0.0001999956223074427,
+      "loss": 12.2706,
       "step": 481
     },
     {
       "epoch": 0.07892582282626494,
-      "grad_norm": 0.41668128967285156,
-      "learning_rate": 0.0001996987994061783,
-      "loss": 12.3047,
+      "grad_norm": 0.3791571259498596,
+      "learning_rate": 0.000199995573801882,
+      "loss": 12.3039,
       "step": 482
     },
     {
       "epoch": 0.07908956934665139,
-      "grad_norm": 0.1310993880033493,
-      "learning_rate": 0.00019969746062997729,
-      "loss": 12.2899,
+      "grad_norm": 0.2250203788280487,
+      "learning_rate": 0.00019999552502908088,
+      "loss": 12.2971,
       "step": 483
     },
     {
       "epoch": 0.07925331586703782,
-      "grad_norm": 0.17166323959827423,
-      "learning_rate": 0.0001996961364056915,
-      "loss": 12.2605,
+      "grad_norm": 0.21701763570308685,
+      "learning_rate": 0.00019999547598903954,
+      "loss": 12.2741,
       "step": 484
     },
     {
       "epoch": 0.07941706238742427,
-      "grad_norm": 0.18003998696804047,
-      "learning_rate": 0.00019969478307757527,
-      "loss": 12.2674,
+      "grad_norm": 0.22662462294101715,
+      "learning_rate": 0.00019999542668175813,
+      "loss": 12.2691,
       "step": 485
     },
     {
       "epoch": 0.0795808089078107,
-      "grad_norm": 0.1621619164943695,
-      "learning_rate": 0.00019969344430137426,
-      "loss": 12.2738,
+      "grad_norm": 0.1721991002559662,
+      "learning_rate": 0.00019999537710723673,
+      "loss": 12.2777,
       "step": 486
     },
     {
       "epoch": 0.07974455542819715,
-      "grad_norm": 0.17165407538414001,
-      "learning_rate": 0.00019969210552517325,
-      "loss": 12.2644,
+      "grad_norm": 0.2554982900619507,
+      "learning_rate": 0.0001999953272654755,
+      "loss": 12.272,
       "step": 487
     },
     {
       "epoch": 0.07990830194858359,
-      "grad_norm": 0.2042619287967682,
-      "learning_rate": 0.000199690752197057,
-      "loss": 12.2444,
+      "grad_norm": 0.16333116590976715,
+      "learning_rate": 0.00019999527715647454,
+      "loss": 12.2473,
       "step": 488
     },
     {
       "epoch": 0.08007204846897004,
-      "grad_norm": 0.17771846055984497,
-      "learning_rate": 0.00019968939886894077,
-      "loss": 12.2738,
+      "grad_norm": 0.2844250798225403,
+      "learning_rate": 0.00019999522678023404,
+      "loss": 12.2779,
       "step": 489
     },
     {
       "epoch": 0.08023579498935647,
-      "grad_norm": 0.1405569314956665,
-      "learning_rate": 0.00019968804554082453,
-      "loss": 12.2692,
+      "grad_norm": 0.15430493652820587,
+      "learning_rate": 0.00019999517613675412,
+      "loss": 12.2759,
       "step": 490
     },
     {
       "epoch": 0.08039954150974292,
-      "grad_norm": 0.1747368574142456,
-      "learning_rate": 0.0001996866922127083,
-      "loss": 12.2532,
+      "grad_norm": 0.14520032703876495,
+      "learning_rate": 0.0001999951252260349,
+      "loss": 12.2643,
       "step": 491
     },
     {
       "epoch": 0.08056328803012935,
-      "grad_norm": 0.21995700895786285,
-      "learning_rate": 0.00019968532433267683,
-      "loss": 12.2536,
+      "grad_norm": 0.20975728332996368,
+      "learning_rate": 0.0001999950740480765,
+      "loss": 12.2562,
       "step": 492
     },
     {
       "epoch": 0.0807270345505158,
-      "grad_norm": 0.2268180549144745,
-      "learning_rate": 0.0001996839710045606,
-      "loss": 12.2637,
+      "grad_norm": 0.24926480650901794,
+      "learning_rate": 0.00019999502260287905,
+      "loss": 12.2656,
       "step": 493
     },
     {
       "epoch": 0.08089078107090224,
-      "grad_norm": 0.18887090682983398,
-      "learning_rate": 0.00019968260312452912,
-      "loss": 12.2552,
+      "grad_norm": 0.1939597725868225,
+      "learning_rate": 0.00019999497089044277,
+      "loss": 12.2797,
       "step": 494
     },
     {
       "epoch": 0.08105452759128869,
-      "grad_norm": 0.14496204257011414,
-      "learning_rate": 0.00019968123524449766,
-      "loss": 12.2786,
+      "grad_norm": 0.14060169458389282,
+      "learning_rate": 0.0001999949189107677,
+      "loss": 12.2859,
       "step": 495
     },
     {
       "epoch": 0.08121827411167512,
-      "grad_norm": 0.2236098051071167,
-      "learning_rate": 0.00019967985281255096,
-      "loss": 12.2744,
+      "grad_norm": 0.21786072850227356,
+      "learning_rate": 0.00019999486666385404,
+      "loss": 12.2858,
       "step": 496
     },
     {
       "epoch": 0.08138202063206157,
-      "grad_norm": 0.15895530581474304,
-      "learning_rate": 0.0001996784849325195,
-      "loss": 12.2531,
+      "grad_norm": 0.2889421880245209,
+      "learning_rate": 0.0001999948141497019,
+      "loss": 12.2572,
       "step": 497
     },
     {
       "epoch": 0.081545767152448,
-      "grad_norm": 0.17476806044578552,
-      "learning_rate": 0.0001996771025005728,
-      "loss": 12.2172,
+      "grad_norm": 0.291425496339798,
+      "learning_rate": 0.0001999947613683114,
+      "loss": 12.2282,
       "step": 498
     },
     {
       "epoch": 0.08170951367283445,
-      "grad_norm": 0.19695644080638885,
-      "learning_rate": 0.0001996757200686261,
-      "loss": 12.2513,
+      "grad_norm": 0.25705039501190186,
+      "learning_rate": 0.00019999470831968274,
+      "loss": 12.2507,
       "step": 499
     },
     {
       "epoch": 0.0818732601932209,
-      "grad_norm": 0.1432948112487793,
-      "learning_rate": 0.0001996743376366794,
-      "loss": 12.2683,
+      "grad_norm": 0.1913681924343109,
+      "learning_rate": 0.00019999465500381605,
+      "loss": 12.2724,
       "step": 500
     },
     {
-      "epoch": 0.0818732601932209,
-      "eval_loss": 12.256741523742676,
-      "eval_runtime": 7.3749,
-      "eval_samples_per_second": 33.356,
-      "eval_steps_per_second": 16.678,
-      "step": 500
+      "epoch": 0.08203700671360734,
+      "grad_norm": 0.2260674387216568,
+      "learning_rate": 0.00019999460142071143,
+      "loss": 12.2539,
+      "step": 501
+    },
+    {
+      "epoch": 0.08220075323399378,
+      "grad_norm": 0.17089596390724182,
+      "learning_rate": 0.00019999454757036906,
+      "loss": 12.2639,
+      "step": 502
+    },
+    {
+      "epoch": 0.08236449975438022,
+      "grad_norm": 0.21298366785049438,
+      "learning_rate": 0.00019999449345278904,
+      "loss": 12.2546,
+      "step": 503
+    },
+    {
+      "epoch": 0.08252824627476667,
+      "grad_norm": 0.1498648226261139,
+      "learning_rate": 0.0001999944390679716,
+      "loss": 12.2882,
+      "step": 504
+    },
+    {
+      "epoch": 0.0826919927951531,
+      "grad_norm": 0.2605227530002594,
+      "learning_rate": 0.0001999943844159168,
+      "loss": 12.2627,
+      "step": 505
+    },
+    {
+      "epoch": 0.08285573931553955,
+      "grad_norm": 0.2724616229534149,
+      "learning_rate": 0.00019999432949662483,
+      "loss": 12.261,
+      "step": 506
+    },
+    {
+      "epoch": 0.08301948583592599,
+      "grad_norm": 0.17513243854045868,
+      "learning_rate": 0.00019999427431009582,
+      "loss": 12.2554,
+      "step": 507
+    },
+    {
+      "epoch": 0.08318323235631243,
+      "grad_norm": 0.3738495409488678,
+      "learning_rate": 0.00019999421885632992,
+      "loss": 12.2488,
+      "step": 508
+    },
+    {
+      "epoch": 0.08334697887669887,
+      "grad_norm": 0.17396804690361023,
+      "learning_rate": 0.00019999416313532726,
+      "loss": 12.2296,
+      "step": 509
+    },
+    {
+      "epoch": 0.08351072539708532,
+      "grad_norm": 0.2673768699169159,
+      "learning_rate": 0.00019999410714708802,
+      "loss": 12.2929,
+      "step": 510
+    },
+    {
+      "epoch": 0.08367447191747175,
+      "grad_norm": 0.126251682639122,
+      "learning_rate": 0.00019999405089161237,
+      "loss": 12.2418,
+      "step": 511
+    },
+    {
+      "epoch": 0.0838382184378582,
+      "grad_norm": 0.13301542401313782,
+      "learning_rate": 0.00019999399436890038,
+      "loss": 12.2771,
+      "step": 512
+    },
+    {
+      "epoch": 0.08400196495824463,
+      "grad_norm": 0.36997753381729126,
+      "learning_rate": 0.0001999939375789523,
+      "loss": 12.282,
+      "step": 513
+    },
+    {
+      "epoch": 0.08416571147863108,
+      "grad_norm": 0.1922840178012848,
+      "learning_rate": 0.00019999388052176822,
+      "loss": 12.2512,
+      "step": 514
+    },
+    {
+      "epoch": 0.08432945799901752,
+      "grad_norm": 0.1612626612186432,
+      "learning_rate": 0.00019999382319734827,
+      "loss": 12.2564,
+      "step": 515
+    },
+    {
+      "epoch": 0.08449320451940397,
+      "grad_norm": 0.3599528670310974,
+      "learning_rate": 0.00019999376560569267,
+      "loss": 12.3231,
+      "step": 516
+    },
+    {
+      "epoch": 0.0846569510397904,
+      "grad_norm": 0.2174975723028183,
+      "learning_rate": 0.00019999370774680152,
+      "loss": 12.2482,
+      "step": 517
+    },
+    {
+      "epoch": 0.08482069756017685,
+      "grad_norm": 0.16101105511188507,
+      "learning_rate": 0.00019999364962067502,
+      "loss": 12.2453,
+      "step": 518
+    },
+    {
+      "epoch": 0.08498444408056328,
+      "grad_norm": 0.16098256409168243,
+      "learning_rate": 0.0001999935912273133,
+      "loss": 12.2844,
+      "step": 519
+    },
+    {
+      "epoch": 0.08514819060094973,
+      "grad_norm": 0.21801027655601501,
+      "learning_rate": 0.0001999935325667165,
+      "loss": 12.2693,
+      "step": 520
+    },
+    {
+      "epoch": 0.08531193712133617,
+      "grad_norm": 0.14086532592773438,
+      "learning_rate": 0.0001999934736388848,
+      "loss": 12.2613,
+      "step": 521
+    },
+    {
+      "epoch": 0.08547568364172262,
+      "grad_norm": 0.20445305109024048,
+      "learning_rate": 0.00019999341444381833,
+      "loss": 12.2395,
+      "step": 522
+    },
+    {
+      "epoch": 0.08563943016210905,
+      "grad_norm": 0.21074581146240234,
+      "learning_rate": 0.00019999335498151733,
+      "loss": 12.2754,
+      "step": 523
+    },
+    {
+      "epoch": 0.0858031766824955,
+      "grad_norm": 0.1751207858324051,
+      "learning_rate": 0.00019999329525198186,
+      "loss": 12.2557,
+      "step": 524
+    },
+    {
+      "epoch": 0.08596692320288193,
+      "grad_norm": 0.19772769510746002,
+      "learning_rate": 0.00019999323525521213,
+      "loss": 12.2841,
+      "step": 525
+    },
+    {
+      "epoch": 0.08613066972326838,
+      "grad_norm": 0.1892375349998474,
+      "learning_rate": 0.00019999317499120827,
+      "loss": 12.2425,
+      "step": 526
+    },
+    {
+      "epoch": 0.08629441624365482,
+      "grad_norm": 0.13868379592895508,
+      "learning_rate": 0.00019999311445997045,
+      "loss": 12.2425,
+      "step": 527
+    },
+    {
+      "epoch": 0.08645816276404127,
+      "grad_norm": 0.19545531272888184,
+      "learning_rate": 0.00019999305366149886,
+      "loss": 12.2506,
+      "step": 528
+    },
+    {
+      "epoch": 0.0866219092844277,
+      "grad_norm": 0.16759023070335388,
+      "learning_rate": 0.00019999299259579366,
+      "loss": 12.2702,
+      "step": 529
+    },
+    {
+      "epoch": 0.08678565580481415,
+      "grad_norm": 0.1422002762556076,
+      "learning_rate": 0.000199992931262855,
+      "loss": 12.2798,
+      "step": 530
+    },
+    {
+      "epoch": 0.08694940232520058,
+      "grad_norm": 0.1551176756620407,
+      "learning_rate": 0.00019999286966268303,
+      "loss": 12.2362,
+      "step": 531
+    },
+    {
+      "epoch": 0.08711314884558703,
+      "grad_norm": 0.19540736079216003,
+      "learning_rate": 0.00019999280779527793,
+      "loss": 12.2627,
+      "step": 532
+    },
+    {
+      "epoch": 0.08727689536597347,
+      "grad_norm": 0.1344379335641861,
+      "learning_rate": 0.00019999274566063983,
+      "loss": 12.2668,
+      "step": 533
+    },
+    {
+      "epoch": 0.08744064188635992,
+      "grad_norm": 0.1458573043346405,
+      "learning_rate": 0.00019999268325876895,
+      "loss": 12.2599,
+      "step": 534
+    },
+    {
+      "epoch": 0.08760438840674635,
+      "grad_norm": 0.2072553187608719,
+      "learning_rate": 0.00019999262058966542,
+      "loss": 12.2642,
+      "step": 535
+    },
+    {
+      "epoch": 0.0877681349271328,
+      "grad_norm": 0.14165370166301727,
+      "learning_rate": 0.00019999255765332946,
+      "loss": 12.2656,
+      "step": 536
+    },
+    {
+      "epoch": 0.08793188144751923,
+      "grad_norm": 0.1711515188217163,
+      "learning_rate": 0.00019999249444976118,
+      "loss": 12.2662,
+      "step": 537
+    },
+    {
+      "epoch": 0.08809562796790568,
+      "grad_norm": 0.1479502171278,
+      "learning_rate": 0.00019999243097896076,
+      "loss": 12.2564,
+      "step": 538
+    },
+    {
+      "epoch": 0.08825937448829213,
+      "grad_norm": 0.32752886414527893,
+      "learning_rate": 0.0001999923672409284,
+      "loss": 12.3072,
+      "step": 539
+    },
+    {
+      "epoch": 0.08842312100867857,
+      "grad_norm": 0.19929148256778717,
+      "learning_rate": 0.00019999230323566422,
+      "loss": 12.298,
+      "step": 540
+    },
+    {
+      "epoch": 0.08858686752906501,
+      "grad_norm": 0.33046862483024597,
+      "learning_rate": 0.00019999223896316845,
+      "loss": 12.248,
+      "step": 541
+    },
+    {
+      "epoch": 0.08875061404945145,
+      "grad_norm": 0.13765937089920044,
+      "learning_rate": 0.0001999921744234412,
+      "loss": 12.2542,
+      "step": 542
+    },
+    {
+      "epoch": 0.0889143605698379,
+      "grad_norm": 0.19766588509082794,
+      "learning_rate": 0.00019999210961648269,
+      "loss": 12.2662,
+      "step": 543
+    },
+    {
+      "epoch": 0.08907810709022433,
+      "grad_norm": 0.23486214876174927,
+      "learning_rate": 0.00019999204454229308,
+      "loss": 12.2628,
+      "step": 544
+    },
+    {
+      "epoch": 0.08924185361061078,
+      "grad_norm": 0.33818957209587097,
+      "learning_rate": 0.00019999197920087254,
+      "loss": 12.2958,
+      "step": 545
+    },
+    {
+      "epoch": 0.08940560013099721,
+      "grad_norm": 0.17376133799552917,
+      "learning_rate": 0.00019999191359222125,
+      "loss": 12.2765,
+      "step": 546
+    },
+    {
+      "epoch": 0.08956934665138366,
+      "grad_norm": 0.20563820004463196,
+      "learning_rate": 0.00019999184771633938,
+      "loss": 12.2809,
+      "step": 547
+    },
+    {
+      "epoch": 0.0897330931717701,
+      "grad_norm": 0.23043566942214966,
+      "learning_rate": 0.00019999178157322707,
+      "loss": 12.2716,
+      "step": 548
+    },
+    {
+      "epoch": 0.08989683969215655,
+      "grad_norm": 0.17227397859096527,
+      "learning_rate": 0.00019999171516288456,
+      "loss": 12.2736,
+      "step": 549
+    },
+    {
+      "epoch": 0.09006058621254298,
+      "grad_norm": 0.20385326445102692,
+      "learning_rate": 0.00019999164848531203,
+      "loss": 12.2394,
+      "step": 550
+    },
+    {
+      "epoch": 0.09022433273292943,
+      "grad_norm": 0.13971176743507385,
+      "learning_rate": 0.0001999915815405096,
+      "loss": 12.2582,
+      "step": 551
+    },
+    {
+      "epoch": 0.09038807925331586,
+      "grad_norm": 0.15711809694766998,
+      "learning_rate": 0.0001999915143284775,
+      "loss": 12.2369,
+      "step": 552
+    },
+    {
+      "epoch": 0.09055182577370231,
+      "grad_norm": 0.20646773278713226,
+      "learning_rate": 0.00019999144684921584,
+      "loss": 12.2469,
+      "step": 553
+    },
+    {
+      "epoch": 0.09071557229408875,
+      "grad_norm": 0.1829550713300705,
+      "learning_rate": 0.00019999137910272488,
+      "loss": 12.2541,
+      "step": 554
+    },
+    {
+      "epoch": 0.0908793188144752,
+      "grad_norm": 0.14914344251155853,
+      "learning_rate": 0.0001999913110890048,
+      "loss": 12.2867,
+      "step": 555
+    },
+    {
+      "epoch": 0.09104306533486163,
+      "grad_norm": 0.1926674097776413,
+      "learning_rate": 0.00019999124280805573,
+      "loss": 12.2441,
+      "step": 556
+    },
+    {
+      "epoch": 0.09120681185524808,
+      "grad_norm": 0.12586253881454468,
+      "learning_rate": 0.00019999117425987785,
+      "loss": 12.2711,
+      "step": 557
+    },
+    {
+      "epoch": 0.09137055837563451,
+      "grad_norm": 0.2592686414718628,
+      "learning_rate": 0.00019999110544447138,
+      "loss": 12.238,
+      "step": 558
+    },
+    {
+      "epoch": 0.09153430489602096,
+      "grad_norm": 0.1788288652896881,
+      "learning_rate": 0.0001999910363618365,
+      "loss": 12.2804,
+      "step": 559
+    },
+    {
+      "epoch": 0.0916980514164074,
+      "grad_norm": 0.15797553956508636,
+      "learning_rate": 0.00019999096701197339,
+      "loss": 12.2578,
+      "step": 560
+    },
+    {
+      "epoch": 0.09186179793679385,
+      "grad_norm": 0.155698761343956,
+      "learning_rate": 0.00019999089739488221,
+      "loss": 12.2505,
+      "step": 561
+    },
+    {
+      "epoch": 0.09202554445718028,
+      "grad_norm": 0.201356902718544,
+      "learning_rate": 0.00019999082751056318,
+      "loss": 12.2666,
+      "step": 562
+    },
+    {
+      "epoch": 0.09218929097756673,
+      "grad_norm": 0.4484696090221405,
+      "learning_rate": 0.00019999075735901647,
+      "loss": 12.2981,
+      "step": 563
+    },
+    {
+      "epoch": 0.09235303749795316,
+      "grad_norm": 0.23958955705165863,
+      "learning_rate": 0.00019999068694024228,
+      "loss": 12.3007,
+      "step": 564
+    },
+    {
+      "epoch": 0.09251678401833961,
+      "grad_norm": 0.20138677954673767,
+      "learning_rate": 0.0001999906162542408,
+      "loss": 12.2547,
+      "step": 565
+    },
+    {
+      "epoch": 0.09268053053872605,
+      "grad_norm": 0.2650439739227295,
+      "learning_rate": 0.00019999054530101218,
+      "loss": 12.2488,
+      "step": 566
+    },
+    {
+      "epoch": 0.0928442770591125,
+      "grad_norm": 0.16292192041873932,
+      "learning_rate": 0.00019999047408055665,
+      "loss": 12.2734,
+      "step": 567
+    },
+    {
+      "epoch": 0.09300802357949893,
+      "grad_norm": 0.16789360344409943,
+      "learning_rate": 0.0001999904025928744,
+      "loss": 12.2769,
+      "step": 568
+    },
+    {
+      "epoch": 0.09317177009988538,
+      "grad_norm": 0.18500173091888428,
+      "learning_rate": 0.0001999903308379656,
+      "loss": 12.2368,
+      "step": 569
+    },
+    {
+      "epoch": 0.09333551662027181,
+      "grad_norm": 0.166462704539299,
+      "learning_rate": 0.00019999025881583044,
+      "loss": 12.2278,
+      "step": 570
+    },
+    {
+      "epoch": 0.09349926314065826,
+      "grad_norm": 0.16099901497364044,
+      "learning_rate": 0.00019999018652646912,
+      "loss": 12.2619,
+      "step": 571
+    },
+    {
+      "epoch": 0.0936630096610447,
+      "grad_norm": 0.1871333122253418,
+      "learning_rate": 0.00019999011396988186,
+      "loss": 12.2631,
+      "step": 572
+    },
+    {
+      "epoch": 0.09382675618143115,
+      "grad_norm": 0.16076967120170593,
+      "learning_rate": 0.0001999900411460688,
+      "loss": 12.2759,
+      "step": 573
+    },
+    {
+      "epoch": 0.09399050270181758,
+      "grad_norm": 0.1936284452676773,
+      "learning_rate": 0.0001999899680550302,
+      "loss": 12.2962,
+      "step": 574
+    },
+    {
+      "epoch": 0.09415424922220403,
+      "grad_norm": 0.15248098969459534,
+      "learning_rate": 0.00019998989469676622,
+      "loss": 12.2729,
+      "step": 575
+    },
+    {
+      "epoch": 0.09431799574259046,
+      "grad_norm": 0.2867307960987091,
+      "learning_rate": 0.00019998982107127702,
+      "loss": 12.256,
+      "step": 576
+    },
+    {
+      "epoch": 0.09448174226297691,
+      "grad_norm": 0.23930412530899048,
+      "learning_rate": 0.00019998974717856286,
+      "loss": 12.2622,
+      "step": 577
+    },
+    {
+      "epoch": 0.09464548878336336,
+      "grad_norm": 0.23292279243469238,
+      "learning_rate": 0.0001999896730186239,
+      "loss": 12.2749,
+      "step": 578
+    },
+    {
+      "epoch": 0.0948092353037498,
+      "grad_norm": 0.20313963294029236,
+      "learning_rate": 0.00019998959859146036,
+      "loss": 12.2894,
+      "step": 579
+    },
+    {
+      "epoch": 0.09497298182413624,
+      "grad_norm": 0.18153154850006104,
+      "learning_rate": 0.00019998952389707242,
+      "loss": 12.2516,
+      "step": 580
+    },
+    {
+      "epoch": 0.09513672834452268,
+      "grad_norm": 0.15528979897499084,
+      "learning_rate": 0.0001999894489354603,
+      "loss": 12.2622,
+      "step": 581
+    },
+    {
+      "epoch": 0.09530047486490913,
+      "grad_norm": 0.15820947289466858,
+      "learning_rate": 0.00019998937370662416,
+      "loss": 12.2784,
+      "step": 582
+    },
+    {
+      "epoch": 0.09546422138529556,
+      "grad_norm": 0.1906697005033493,
+      "learning_rate": 0.00019998929821056426,
+      "loss": 12.2721,
+      "step": 583
+    },
+    {
+      "epoch": 0.09562796790568201,
+      "grad_norm": 0.16757844388484955,
+      "learning_rate": 0.00019998922244728076,
+      "loss": 12.2518,
+      "step": 584
+    },
+    {
+      "epoch": 0.09579171442606844,
+      "grad_norm": 0.1887599378824234,
+      "learning_rate": 0.0001999891464167739,
+      "loss": 12.2657,
+      "step": 585
+    },
+    {
+      "epoch": 0.09595546094645489,
+      "grad_norm": 0.2148314267396927,
+      "learning_rate": 0.0001999890701190438,
+      "loss": 12.2641,
+      "step": 586
+    },
+    {
+      "epoch": 0.09611920746684133,
+      "grad_norm": 0.2525731921195984,
+      "learning_rate": 0.00019998899355409076,
+      "loss": 12.2726,
+      "step": 587
+    },
+    {
+      "epoch": 0.09628295398722778,
+      "grad_norm": 0.25445112586021423,
+      "learning_rate": 0.00019998891672191494,
+      "loss": 12.2299,
+      "step": 588
+    },
+    {
+      "epoch": 0.09644670050761421,
+      "grad_norm": 0.20000016689300537,
+      "learning_rate": 0.00019998883962251654,
+      "loss": 12.2536,
+      "step": 589
+    },
+    {
+      "epoch": 0.09661044702800066,
+      "grad_norm": 0.20457811653614044,
+      "learning_rate": 0.00019998876225589578,
+      "loss": 12.2577,
+      "step": 590
+    },
+    {
+      "epoch": 0.0967741935483871,
+      "grad_norm": 0.17679806053638458,
+      "learning_rate": 0.00019998868462205285,
+      "loss": 12.2519,
+      "step": 591
+    },
+    {
+      "epoch": 0.09693794006877354,
+      "grad_norm": 0.1711779534816742,
+      "learning_rate": 0.000199988606720988,
+      "loss": 12.2187,
+      "step": 592
+    },
+    {
+      "epoch": 0.09710168658915998,
+      "grad_norm": 0.2674786448478699,
+      "learning_rate": 0.0001999885285527014,
+      "loss": 12.2609,
+      "step": 593
+    },
+    {
+      "epoch": 0.09726543310954643,
+      "grad_norm": 0.14570537209510803,
+      "learning_rate": 0.00019998845011719326,
+      "loss": 12.2463,
+      "step": 594
+    },
+    {
+      "epoch": 0.09742917962993286,
+      "grad_norm": 0.17581459879875183,
+      "learning_rate": 0.00019998837141446378,
+      "loss": 12.2543,
+      "step": 595
+    },
+    {
+      "epoch": 0.09759292615031931,
+      "grad_norm": 0.1694401651620865,
+      "learning_rate": 0.0001999882924445132,
+      "loss": 12.2882,
+      "step": 596
+    },
+    {
+      "epoch": 0.09775667267070574,
+      "grad_norm": 0.18744853138923645,
+      "learning_rate": 0.00019998821320734177,
+      "loss": 12.2401,
+      "step": 597
+    },
+    {
+      "epoch": 0.09792041919109219,
+      "grad_norm": 0.26006919145584106,
+      "learning_rate": 0.00019998813370294957,
+      "loss": 12.2708,
+      "step": 598
+    },
+    {
+      "epoch": 0.09808416571147863,
+      "grad_norm": 0.17062494158744812,
+      "learning_rate": 0.00019998805393133692,
+      "loss": 12.2635,
+      "step": 599
+    },
+    {
+      "epoch": 0.09824791223186508,
+      "grad_norm": 0.19302265346050262,
+      "learning_rate": 0.00019998797389250404,
+      "loss": 12.2255,
+      "step": 600
+    },
+    {
+      "epoch": 0.09841165875225151,
+      "grad_norm": 0.1742558777332306,
+      "learning_rate": 0.00019998789358645106,
+      "loss": 12.2327,
+      "step": 601
+    },
+    {
+      "epoch": 0.09857540527263796,
+      "grad_norm": 0.21890263259410858,
+      "learning_rate": 0.0001999878130131783,
+      "loss": 12.2857,
+      "step": 602
+    },
+    {
+      "epoch": 0.0987391517930244,
+      "grad_norm": 0.16165485978126526,
+      "learning_rate": 0.00019998773217268586,
+      "loss": 12.2459,
+      "step": 603
+    },
+    {
+      "epoch": 0.09890289831341084,
+      "grad_norm": 0.2369285672903061,
+      "learning_rate": 0.0001999876510649741,
+      "loss": 12.2954,
+      "step": 604
+    },
+    {
+      "epoch": 0.09906664483379728,
+      "grad_norm": 0.1774766743183136,
+      "learning_rate": 0.00019998756969004307,
+      "loss": 12.2627,
+      "step": 605
+    },
+    {
+      "epoch": 0.09923039135418373,
+      "grad_norm": 0.14726974070072174,
+      "learning_rate": 0.00019998748804789308,
+      "loss": 12.2593,
+      "step": 606
+    },
+    {
+      "epoch": 0.09939413787457016,
+      "grad_norm": 0.20986835658550262,
+      "learning_rate": 0.0001999874061385244,
+      "loss": 12.2769,
+      "step": 607
+    },
+    {
+      "epoch": 0.09955788439495661,
+      "grad_norm": 0.15654174983501434,
+      "learning_rate": 0.0001999873239619371,
+      "loss": 12.2858,
+      "step": 608
+    },
+    {
+      "epoch": 0.09972163091534304,
+      "grad_norm": 0.2046964168548584,
+      "learning_rate": 0.00019998724151813155,
+      "loss": 12.2597,
+      "step": 609
+    },
+    {
+      "epoch": 0.09988537743572949,
+      "grad_norm": 0.1860288828611374,
+      "learning_rate": 0.0001999871588071079,
+      "loss": 12.2755,
+      "step": 610
+    },
+    {
+      "epoch": 0.10004912395611593,
+      "grad_norm": 0.17672719061374664,
+      "learning_rate": 0.00019998707582886635,
+      "loss": 12.2937,
+      "step": 611
+    },
+    {
+      "epoch": 0.10021287047650237,
+      "grad_norm": 0.25083354115486145,
+      "learning_rate": 0.00019998699258340718,
+      "loss": 12.2747,
+      "step": 612
+    },
+    {
+      "epoch": 0.10037661699688881,
+      "grad_norm": 0.2215733677148819,
+      "learning_rate": 0.00019998690907073056,
+      "loss": 12.2694,
+      "step": 613
+    },
+    {
+      "epoch": 0.10054036351727526,
+      "grad_norm": 0.2608155310153961,
+      "learning_rate": 0.00019998682529083676,
+      "loss": 12.2544,
+      "step": 614
+    },
+    {
+      "epoch": 0.1007041100376617,
+      "grad_norm": 0.25067582726478577,
+      "learning_rate": 0.00019998674124372596,
+      "loss": 12.258,
+      "step": 615
+    },
+    {
+      "epoch": 0.10086785655804814,
+      "grad_norm": 0.21041239798069,
+      "learning_rate": 0.00019998665692939842,
+      "loss": 12.2686,
+      "step": 616
+    },
+    {
+      "epoch": 0.10103160307843459,
+      "grad_norm": 0.1990176886320114,
+      "learning_rate": 0.00019998657234785434,
+      "loss": 12.2596,
+      "step": 617
+    },
+    {
+      "epoch": 0.10119534959882102,
+      "grad_norm": 0.3006889820098877,
+      "learning_rate": 0.00019998648749909397,
+      "loss": 12.2786,
+      "step": 618
+    },
+    {
+      "epoch": 0.10135909611920747,
+      "grad_norm": 0.19156216084957123,
+      "learning_rate": 0.0001999864023831175,
+      "loss": 12.2623,
+      "step": 619
+    },
+    {
+      "epoch": 0.10152284263959391,
+      "grad_norm": 0.24337586760520935,
+      "learning_rate": 0.0001999863169999252,
+      "loss": 12.2687,
+      "step": 620
+    },
+    {
+      "epoch": 0.10168658915998036,
+      "grad_norm": 0.2112797200679779,
+      "learning_rate": 0.00019998623134951727,
+      "loss": 12.2371,
+      "step": 621
+    },
+    {
+      "epoch": 0.10185033568036679,
+      "grad_norm": 0.3540230095386505,
+      "learning_rate": 0.00019998614543189394,
+      "loss": 12.2661,
+      "step": 622
+    },
+    {
+      "epoch": 0.10201408220075324,
+      "grad_norm": 0.21008434891700745,
+      "learning_rate": 0.00019998605924705546,
+      "loss": 12.2417,
+      "step": 623
+    },
+    {
+      "epoch": 0.10217782872113967,
+      "grad_norm": 0.3391290307044983,
+      "learning_rate": 0.000199985972795002,
+      "loss": 12.2789,
+      "step": 624
+    },
+    {
+      "epoch": 0.10234157524152612,
+      "grad_norm": 0.22646580636501312,
+      "learning_rate": 0.0001999858860757339,
+      "loss": 12.2695,
+      "step": 625
+    },
+    {
+      "epoch": 0.10250532176191256,
+      "grad_norm": 0.20563830435276031,
+      "learning_rate": 0.0001999857990892513,
+      "loss": 12.2392,
+      "step": 626
+    },
+    {
+      "epoch": 0.102669068282299,
+      "grad_norm": 0.14008118212223053,
+      "learning_rate": 0.00019998571183555447,
+      "loss": 12.2677,
+      "step": 627
+    },
+    {
+      "epoch": 0.10283281480268544,
+      "grad_norm": 0.18549852073192596,
+      "learning_rate": 0.00019998562431464365,
+      "loss": 12.2683,
+      "step": 628
+    },
+    {
+      "epoch": 0.10299656132307189,
+      "grad_norm": 0.1716676652431488,
+      "learning_rate": 0.00019998553652651903,
+      "loss": 12.2592,
+      "step": 629
+    },
+    {
+      "epoch": 0.10316030784345832,
+      "grad_norm": 0.18004333972930908,
+      "learning_rate": 0.0001999854484711809,
+      "loss": 12.2507,
+      "step": 630
+    },
+    {
+      "epoch": 0.10332405436384477,
+      "grad_norm": 0.1485266536474228,
+      "learning_rate": 0.00019998536014862944,
+      "loss": 12.2692,
+      "step": 631
+    },
+    {
+      "epoch": 0.1034878008842312,
+      "grad_norm": 0.16121861338615417,
+      "learning_rate": 0.00019998527155886496,
+      "loss": 12.2907,
+      "step": 632
+    },
+    {
+      "epoch": 0.10365154740461766,
+      "grad_norm": 0.2284204214811325,
+      "learning_rate": 0.00019998518270188763,
+      "loss": 12.2752,
+      "step": 633
+    },
+    {
+      "epoch": 0.10381529392500409,
+      "grad_norm": 0.21021804213523865,
+      "learning_rate": 0.0001999850935776977,
+      "loss": 12.2439,
+      "step": 634
+    },
+    {
+      "epoch": 0.10397904044539054,
+      "grad_norm": 0.18460194766521454,
+      "learning_rate": 0.0001999850041862954,
+      "loss": 12.2636,
+      "step": 635
+    },
+    {
+      "epoch": 0.10414278696577697,
+      "grad_norm": 0.235703706741333,
+      "learning_rate": 0.00019998491452768102,
+      "loss": 12.2853,
+      "step": 636
+    },
+    {
+      "epoch": 0.10430653348616342,
+      "grad_norm": 0.1810152679681778,
+      "learning_rate": 0.00019998482460185474,
+      "loss": 12.273,
+      "step": 637
+    },
+    {
+      "epoch": 0.10447028000654986,
+      "grad_norm": 0.26445272564888,
+      "learning_rate": 0.00019998473440881686,
+      "loss": 12.2545,
+      "step": 638
+    },
+    {
+      "epoch": 0.1046340265269363,
+      "grad_norm": 0.47541436553001404,
+      "learning_rate": 0.00019998464394856757,
+      "loss": 12.3134,
+      "step": 639
+    },
+    {
+      "epoch": 0.10479777304732274,
+      "grad_norm": 0.20061855018138885,
+      "learning_rate": 0.00019998455322110714,
+      "loss": 12.259,
+      "step": 640
+    },
+    {
+      "epoch": 0.10496151956770919,
+      "grad_norm": 0.19952458143234253,
+      "learning_rate": 0.00019998446222643579,
+      "loss": 12.2633,
+      "step": 641
+    },
+    {
+      "epoch": 0.10512526608809562,
+      "grad_norm": 0.18077236413955688,
+      "learning_rate": 0.00019998437096455375,
+      "loss": 12.2649,
+      "step": 642
+    },
+    {
+      "epoch": 0.10528901260848207,
+      "grad_norm": 0.19510169327259064,
+      "learning_rate": 0.00019998427943546134,
+      "loss": 12.2537,
+      "step": 643
+    },
+    {
+      "epoch": 0.1054527591288685,
+      "grad_norm": 0.1354212760925293,
+      "learning_rate": 0.00019998418763915872,
+      "loss": 12.2671,
+      "step": 644
+    },
+    {
+      "epoch": 0.10561650564925495,
+      "grad_norm": 0.15665219724178314,
+      "learning_rate": 0.0001999840955756462,
+      "loss": 12.2936,
+      "step": 645
+    },
+    {
+      "epoch": 0.10578025216964139,
+      "grad_norm": 0.17888222634792328,
+      "learning_rate": 0.00019998400324492394,
+      "loss": 12.2707,
+      "step": 646
+    },
+    {
+      "epoch": 0.10594399869002784,
+      "grad_norm": 0.20793989300727844,
+      "learning_rate": 0.0001999839106469923,
+      "loss": 12.263,
+      "step": 647
+    },
+    {
+      "epoch": 0.10610774521041427,
+      "grad_norm": 0.2504526674747467,
+      "learning_rate": 0.00019998381778185142,
+      "loss": 12.2566,
+      "step": 648
+    },
+    {
+      "epoch": 0.10627149173080072,
+      "grad_norm": 0.2541908621788025,
+      "learning_rate": 0.00019998372464950163,
+      "loss": 12.2473,
+      "step": 649
+    },
+    {
+      "epoch": 0.10643523825118716,
+      "grad_norm": 0.1839291751384735,
+      "learning_rate": 0.00019998363124994314,
+      "loss": 12.2588,
+      "step": 650
+    },
+    {
+      "epoch": 0.1065989847715736,
+      "grad_norm": 0.1613001674413681,
+      "learning_rate": 0.00019998353758317618,
+      "loss": 12.2363,
+      "step": 651
+    },
+    {
+      "epoch": 0.10676273129196004,
+      "grad_norm": 0.29961979389190674,
+      "learning_rate": 0.00019998344364920108,
+      "loss": 12.2315,
+      "step": 652
+    },
+    {
+      "epoch": 0.10692647781234649,
+      "grad_norm": 0.1734955608844757,
+      "learning_rate": 0.000199983349448018,
+      "loss": 12.2566,
+      "step": 653
+    },
+    {
+      "epoch": 0.10709022433273294,
+      "grad_norm": 0.22714467346668243,
+      "learning_rate": 0.00019998325497962721,
+      "loss": 12.2773,
+      "step": 654
+    },
+    {
+      "epoch": 0.10725397085311937,
+      "grad_norm": 0.18775632977485657,
+      "learning_rate": 0.00019998316024402902,
+      "loss": 12.2499,
+      "step": 655
+    },
+    {
+      "epoch": 0.10741771737350582,
+      "grad_norm": 0.14911091327667236,
+      "learning_rate": 0.00019998306524122365,
+      "loss": 12.2585,
+      "step": 656
+    },
+    {
+      "epoch": 0.10758146389389225,
+      "grad_norm": 0.153729647397995,
+      "learning_rate": 0.00019998296997121134,
+      "loss": 12.2517,
+      "step": 657
+    },
+    {
+      "epoch": 0.1077452104142787,
+      "grad_norm": 0.12172205001115799,
+      "learning_rate": 0.00019998287443399233,
+      "loss": 12.2633,
+      "step": 658
+    },
+    {
+      "epoch": 0.10790895693466514,
+      "grad_norm": 0.2324371337890625,
+      "learning_rate": 0.0001999827786295669,
+      "loss": 12.2705,
+      "step": 659
+    },
+    {
+      "epoch": 0.10807270345505159,
+      "grad_norm": 0.14951297640800476,
+      "learning_rate": 0.00019998268255793533,
+      "loss": 12.2622,
+      "step": 660
+    },
+    {
+      "epoch": 0.10823644997543802,
+      "grad_norm": 0.2147216498851776,
+      "learning_rate": 0.0001999825862190978,
+      "loss": 12.2956,
+      "step": 661
+    },
+    {
+      "epoch": 0.10840019649582447,
+      "grad_norm": 0.3348814845085144,
+      "learning_rate": 0.00019998248961305471,
+      "loss": 12.2714,
+      "step": 662
+    },
+    {
+      "epoch": 0.1085639430162109,
+      "grad_norm": 0.2494417130947113,
+      "learning_rate": 0.00019998239273980617,
+      "loss": 12.2685,
+      "step": 663
+    },
+    {
+      "epoch": 0.10872768953659735,
+      "grad_norm": 0.20005349814891815,
+      "learning_rate": 0.00019998229559935247,
+      "loss": 12.2377,
+      "step": 664
+    },
+    {
+      "epoch": 0.10889143605698379,
+      "grad_norm": 0.2329307198524475,
+      "learning_rate": 0.00019998219819169396,
+      "loss": 12.2541,
+      "step": 665
+    },
+    {
+      "epoch": 0.10905518257737024,
+      "grad_norm": 0.3226028382778168,
+      "learning_rate": 0.0001999821005168308,
+      "loss": 12.2794,
+      "step": 666
+    },
+    {
+      "epoch": 0.10921892909775667,
+      "grad_norm": 0.21378913521766663,
+      "learning_rate": 0.00019998200257476326,
+      "loss": 12.2466,
+      "step": 667
+    },
+    {
+      "epoch": 0.10938267561814312,
+      "grad_norm": 0.18209829926490784,
+      "learning_rate": 0.00019998190436549166,
+      "loss": 12.246,
+      "step": 668
+    },
+    {
+      "epoch": 0.10954642213852955,
+      "grad_norm": 0.1942809522151947,
+      "learning_rate": 0.00019998180588901625,
+      "loss": 12.2976,
+      "step": 669
+    },
+    {
+      "epoch": 0.109710168658916,
+      "grad_norm": 0.2823975384235382,
+      "learning_rate": 0.00019998170714533725,
+      "loss": 12.2609,
+      "step": 670
+    },
+    {
+      "epoch": 0.10987391517930244,
+      "grad_norm": 0.1592366099357605,
+      "learning_rate": 0.00019998160813445494,
+      "loss": 12.2825,
+      "step": 671
+    },
+    {
+      "epoch": 0.11003766169968889,
+      "grad_norm": 0.16642484068870544,
+      "learning_rate": 0.0001999815088563696,
+      "loss": 12.2322,
+      "step": 672
+    },
+    {
+      "epoch": 0.11020140822007532,
+      "grad_norm": 0.3897826373577118,
+      "learning_rate": 0.00019998140931108148,
+      "loss": 12.2343,
+      "step": 673
+    },
+    {
+      "epoch": 0.11036515474046177,
+      "grad_norm": 0.14695951342582703,
+      "learning_rate": 0.00019998130949859088,
+      "loss": 12.2712,
+      "step": 674
+    },
+    {
+      "epoch": 0.1105289012608482,
+      "grad_norm": 0.28502780199050903,
+      "learning_rate": 0.00019998120941889803,
+      "loss": 12.2653,
+      "step": 675
+    },
+    {
+      "epoch": 0.11069264778123465,
+      "grad_norm": 0.20605143904685974,
+      "learning_rate": 0.0001999811090720032,
+      "loss": 12.2505,
+      "step": 676
+    },
+    {
+      "epoch": 0.11085639430162109,
+      "grad_norm": 0.17322997748851776,
+      "learning_rate": 0.00019998100845790667,
+      "loss": 12.2404,
+      "step": 677
+    },
+    {
+      "epoch": 0.11102014082200753,
+      "grad_norm": 0.22566843032836914,
+      "learning_rate": 0.00019998090757660872,
+      "loss": 12.2536,
+      "step": 678
+    },
+    {
+      "epoch": 0.11118388734239397,
+      "grad_norm": 0.2020934671163559,
+      "learning_rate": 0.00019998080642810959,
+      "loss": 12.2919,
+      "step": 679
+    },
+    {
+      "epoch": 0.11134763386278042,
+      "grad_norm": 0.20571619272232056,
+      "learning_rate": 0.00019998070501240958,
+      "loss": 12.2496,
+      "step": 680
+    },
+    {
+      "epoch": 0.11151138038316685,
+      "grad_norm": 0.23394832015037537,
+      "learning_rate": 0.0001999806033295089,
+      "loss": 12.3042,
+      "step": 681
+    },
+    {
+      "epoch": 0.1116751269035533,
+      "grad_norm": 0.37446317076683044,
+      "learning_rate": 0.00019998050137940793,
+      "loss": 12.2232,
+      "step": 682
+    },
+    {
+      "epoch": 0.11183887342393974,
+      "grad_norm": 0.24774238467216492,
+      "learning_rate": 0.00019998039916210684,
+      "loss": 12.2569,
+      "step": 683
+    },
+    {
+      "epoch": 0.11200261994432618,
+      "grad_norm": 0.18902882933616638,
+      "learning_rate": 0.00019998029667760595,
+      "loss": 12.2678,
+      "step": 684
+    },
+    {
+      "epoch": 0.11216636646471262,
+      "grad_norm": 0.24386021494865417,
+      "learning_rate": 0.00019998019392590552,
+      "loss": 12.2515,
+      "step": 685
+    },
+    {
+      "epoch": 0.11233011298509907,
+      "grad_norm": 0.1919771432876587,
+      "learning_rate": 0.00019998009090700585,
+      "loss": 12.2585,
+      "step": 686
+    },
+    {
+      "epoch": 0.1124938595054855,
+      "grad_norm": 0.2406018227338791,
+      "learning_rate": 0.0001999799876209072,
+      "loss": 12.2547,
+      "step": 687
+    },
+    {
+      "epoch": 0.11265760602587195,
+      "grad_norm": 0.18448154628276825,
+      "learning_rate": 0.00019997988406760984,
+      "loss": 12.2652,
+      "step": 688
+    },
+    {
+      "epoch": 0.11282135254625839,
+      "grad_norm": 0.26016390323638916,
+      "learning_rate": 0.00019997978024711405,
+      "loss": 12.2443,
+      "step": 689
+    },
+    {
+      "epoch": 0.11298509906664483,
+      "grad_norm": 0.22282028198242188,
+      "learning_rate": 0.0001999796761594201,
+      "loss": 12.2521,
+      "step": 690
+    },
+    {
+      "epoch": 0.11314884558703127,
+      "grad_norm": 0.19451838731765747,
+      "learning_rate": 0.0001999795718045283,
+      "loss": 12.2511,
+      "step": 691
+    },
+    {
+      "epoch": 0.11331259210741772,
+      "grad_norm": 0.13449028134346008,
+      "learning_rate": 0.00019997946718243887,
+      "loss": 12.2649,
+      "step": 692
+    },
+    {
+      "epoch": 0.11347633862780417,
+      "grad_norm": 0.16171801090240479,
+      "learning_rate": 0.00019997936229315216,
+      "loss": 12.2522,
+      "step": 693
+    },
+    {
+      "epoch": 0.1136400851481906,
+      "grad_norm": 0.2671642601490021,
+      "learning_rate": 0.00019997925713666835,
+      "loss": 12.2789,
+      "step": 694
+    },
+    {
+      "epoch": 0.11380383166857705,
+      "grad_norm": 0.20152407884597778,
+      "learning_rate": 0.00019997915171298788,
+      "loss": 12.2889,
+      "step": 695
+    },
+    {
+      "epoch": 0.11396757818896348,
+      "grad_norm": 0.14468339085578918,
+      "learning_rate": 0.00019997904602211085,
+      "loss": 12.2395,
+      "step": 696
+    },
+    {
+      "epoch": 0.11413132470934993,
+      "grad_norm": 0.12371987104415894,
+      "learning_rate": 0.00019997894006403767,
+      "loss": 12.2567,
+      "step": 697
+    },
+    {
+      "epoch": 0.11429507122973637,
+      "grad_norm": 0.2297051101922989,
+      "learning_rate": 0.00019997883383876858,
+      "loss": 12.2663,
+      "step": 698
+    },
+    {
+      "epoch": 0.11445881775012282,
+      "grad_norm": 0.24164938926696777,
+      "learning_rate": 0.0001999787273463039,
+      "loss": 12.2613,
+      "step": 699
+    },
+    {
+      "epoch": 0.11462256427050925,
+      "grad_norm": 0.12142845243215561,
+      "learning_rate": 0.00019997862058664383,
+      "loss": 12.2554,
+      "step": 700
+    },
+    {
+      "epoch": 0.1147863107908957,
+      "grad_norm": 0.27234968543052673,
+      "learning_rate": 0.00019997851355978873,
+      "loss": 12.2556,
+      "step": 701
+    },
+    {
+      "epoch": 0.11495005731128213,
+      "grad_norm": 0.2992333173751831,
+      "learning_rate": 0.00019997840626573887,
+      "loss": 12.2827,
+      "step": 702
+    },
+    {
+      "epoch": 0.11511380383166858,
+      "grad_norm": 0.16893671452999115,
+      "learning_rate": 0.0001999782987044945,
+      "loss": 12.2516,
+      "step": 703
+    },
+    {
+      "epoch": 0.11527755035205502,
+      "grad_norm": 0.15710364282131195,
+      "learning_rate": 0.00019997819087605597,
+      "loss": 12.2315,
+      "step": 704
+    },
+    {
+      "epoch": 0.11544129687244147,
+      "grad_norm": 0.1487736850976944,
+      "learning_rate": 0.0001999780827804235,
+      "loss": 12.2508,
+      "step": 705
+    },
+    {
+      "epoch": 0.1156050433928279,
+      "grad_norm": 0.19448354840278625,
+      "learning_rate": 0.00019997797441759745,
+      "loss": 12.2404,
+      "step": 706
+    },
+    {
+      "epoch": 0.11576878991321435,
+      "grad_norm": 0.2013469785451889,
+      "learning_rate": 0.00019997786578757808,
+      "loss": 12.2744,
+      "step": 707
+    },
+    {
+      "epoch": 0.11593253643360078,
+      "grad_norm": 0.2695065438747406,
+      "learning_rate": 0.00019997775689036565,
+      "loss": 12.2469,
+      "step": 708
+    },
+    {
+      "epoch": 0.11609628295398723,
+      "grad_norm": 0.1316147893667221,
+      "learning_rate": 0.00019997764772596046,
+      "loss": 12.2832,
+      "step": 709
+    },
+    {
+      "epoch": 0.11626002947437367,
+      "grad_norm": 0.22008389234542847,
+      "learning_rate": 0.00019997753829436286,
+      "loss": 12.2447,
+      "step": 710
+    },
+    {
+      "epoch": 0.11642377599476011,
+      "grad_norm": 0.14515216648578644,
+      "learning_rate": 0.00019997742859557307,
+      "loss": 12.2219,
+      "step": 711
+    },
+    {
+      "epoch": 0.11658752251514655,
+      "grad_norm": 0.25737878680229187,
+      "learning_rate": 0.0001999773186295914,
+      "loss": 12.2293,
+      "step": 712
+    },
+    {
+      "epoch": 0.116751269035533,
+      "grad_norm": 0.22486746311187744,
+      "learning_rate": 0.0001999772083964182,
+      "loss": 12.2661,
+      "step": 713
+    },
+    {
+      "epoch": 0.11691501555591943,
+      "grad_norm": 0.17334704101085663,
+      "learning_rate": 0.0001999770978960537,
+      "loss": 12.2547,
+      "step": 714
+    },
+    {
+      "epoch": 0.11707876207630588,
+      "grad_norm": 0.1596781462430954,
+      "learning_rate": 0.00019997698712849823,
+      "loss": 12.2811,
+      "step": 715
+    },
+    {
+      "epoch": 0.11724250859669232,
+      "grad_norm": 0.2053455412387848,
+      "learning_rate": 0.00019997687609375203,
+      "loss": 12.2605,
+      "step": 716
+    },
+    {
+      "epoch": 0.11740625511707876,
+      "grad_norm": 0.15111824870109558,
+      "learning_rate": 0.00019997676479181547,
+      "loss": 12.2554,
+      "step": 717
+    },
+    {
+      "epoch": 0.1175700016374652,
+      "grad_norm": 0.14962074160575867,
+      "learning_rate": 0.00019997665322268881,
+      "loss": 12.2361,
+      "step": 718
+    },
+    {
+      "epoch": 0.11773374815785165,
+      "grad_norm": 0.18393626809120178,
+      "learning_rate": 0.00019997654138637238,
+      "loss": 12.237,
+      "step": 719
+    },
+    {
+      "epoch": 0.11789749467823808,
+      "grad_norm": 0.22599177062511444,
+      "learning_rate": 0.0001999764292828664,
+      "loss": 12.2619,
+      "step": 720
+    },
+    {
+      "epoch": 0.11806124119862453,
+      "grad_norm": 0.20265071094036102,
+      "learning_rate": 0.00019997631691217127,
+      "loss": 12.2589,
+      "step": 721
+    },
+    {
+      "epoch": 0.11822498771901097,
+      "grad_norm": 0.2219390720129013,
+      "learning_rate": 0.00019997620427428722,
+      "loss": 12.244,
+      "step": 722
+    },
+    {
+      "epoch": 0.11838873423939741,
+      "grad_norm": 0.2106860876083374,
+      "learning_rate": 0.0001999760913692146,
+      "loss": 12.2522,
+      "step": 723
+    },
+    {
+      "epoch": 0.11855248075978385,
+      "grad_norm": 0.19955146312713623,
+      "learning_rate": 0.00019997597819695364,
+      "loss": 12.2614,
+      "step": 724
+    },
+    {
+      "epoch": 0.1187162272801703,
+      "grad_norm": 0.24155764281749725,
+      "learning_rate": 0.00019997586475750475,
+      "loss": 12.2692,
+      "step": 725
+    },
+    {
+      "epoch": 0.11887997380055673,
+      "grad_norm": 0.18328611552715302,
+      "learning_rate": 0.0001999757510508681,
+      "loss": 12.2532,
+      "step": 726
+    },
+    {
+      "epoch": 0.11904372032094318,
+      "grad_norm": 0.28729379177093506,
+      "learning_rate": 0.0001999756370770441,
+      "loss": 12.2833,
+      "step": 727
+    },
+    {
+      "epoch": 0.11920746684132962,
+      "grad_norm": 0.13738590478897095,
+      "learning_rate": 0.00019997552283603306,
+      "loss": 12.2574,
+      "step": 728
+    },
+    {
+      "epoch": 0.11937121336171606,
+      "grad_norm": 0.2414904534816742,
+      "learning_rate": 0.00019997540832783522,
+      "loss": 12.3013,
+      "step": 729
+    },
+    {
+      "epoch": 0.11953495988210251,
+      "grad_norm": 0.1890541911125183,
+      "learning_rate": 0.0001999752935524509,
+      "loss": 12.351,
+      "step": 730
+    },
+    {
+      "epoch": 0.11969870640248895,
+      "grad_norm": 0.2413313090801239,
+      "learning_rate": 0.0001999751785098804,
+      "loss": 12.2582,
+      "step": 731
+    },
+    {
+      "epoch": 0.1198624529228754,
+      "grad_norm": 0.18245859444141388,
+      "learning_rate": 0.00019997506320012408,
+      "loss": 12.2701,
+      "step": 732
+    },
+    {
+      "epoch": 0.12002619944326183,
+      "grad_norm": 0.17451536655426025,
+      "learning_rate": 0.00019997494762318221,
+      "loss": 12.2617,
+      "step": 733
+    },
+    {
+      "epoch": 0.12018994596364828,
+      "grad_norm": 0.31427887082099915,
+      "learning_rate": 0.0001999748317790551,
+      "loss": 12.2727,
+      "step": 734
+    },
+    {
+      "epoch": 0.12035369248403471,
+      "grad_norm": 0.22707721590995789,
+      "learning_rate": 0.00019997471566774304,
+      "loss": 12.2636,
+      "step": 735
+    },
+    {
+      "epoch": 0.12051743900442116,
+      "grad_norm": 0.16001681983470917,
+      "learning_rate": 0.00019997459928924638,
+      "loss": 12.2485,
+      "step": 736
+    },
+    {
+      "epoch": 0.1206811855248076,
+      "grad_norm": 0.2794734239578247,
+      "learning_rate": 0.00019997448264356545,
+      "loss": 12.2922,
+      "step": 737
+    },
+    {
+      "epoch": 0.12084493204519405,
+      "grad_norm": 0.18110767006874084,
+      "learning_rate": 0.00019997436573070048,
+      "loss": 12.2591,
+      "step": 738
+    },
+    {
+      "epoch": 0.12100867856558048,
+      "grad_norm": 0.20784614980220795,
+      "learning_rate": 0.00019997424855065183,
+      "loss": 12.2375,
+      "step": 739
+    },
+    {
+      "epoch": 0.12117242508596693,
+      "grad_norm": 0.3365764617919922,
+      "learning_rate": 0.00019997413110341982,
+      "loss": 12.297,
+      "step": 740
+    },
+    {
+      "epoch": 0.12133617160635336,
+      "grad_norm": 0.17696967720985413,
+      "learning_rate": 0.00019997401338900476,
+      "loss": 12.2253,
+      "step": 741
+    },
+    {
+      "epoch": 0.12149991812673981,
+      "grad_norm": 0.1352595090866089,
+      "learning_rate": 0.00019997389540740693,
+      "loss": 12.2659,
+      "step": 742
+    },
+    {
+      "epoch": 0.12166366464712625,
+      "grad_norm": 0.268904447555542,
+      "learning_rate": 0.00019997377715862672,
+      "loss": 12.2673,
+      "step": 743
+    },
+    {
+      "epoch": 0.1218274111675127,
+      "grad_norm": 0.22296397387981415,
+      "learning_rate": 0.00019997365864266438,
+      "loss": 12.2469,
+      "step": 744
+    },
+    {
+      "epoch": 0.12199115768789913,
+      "grad_norm": 0.22052326798439026,
+      "learning_rate": 0.00019997353985952025,
+      "loss": 12.2621,
+      "step": 745
+    },
+    {
+      "epoch": 0.12215490420828558,
+      "grad_norm": 0.3132556974887848,
+      "learning_rate": 0.00019997342080919466,
+      "loss": 12.2708,
+      "step": 746
+    },
+    {
+      "epoch": 0.12231865072867201,
+      "grad_norm": 0.1781475841999054,
+      "learning_rate": 0.0001999733014916879,
+      "loss": 12.22,
+      "step": 747
+    },
+    {
+      "epoch": 0.12248239724905846,
+      "grad_norm": 0.21517692506313324,
+      "learning_rate": 0.0001999731819070003,
+      "loss": 12.257,
+      "step": 748
+    },
+    {
+      "epoch": 0.1226461437694449,
+      "grad_norm": 0.22570882737636566,
+      "learning_rate": 0.00019997306205513218,
+      "loss": 12.2332,
+      "step": 749
+    },
+    {
+      "epoch": 0.12280989028983134,
+      "grad_norm": 0.17157399654388428,
+      "learning_rate": 0.00019997294193608388,
+      "loss": 12.2591,
+      "step": 750
+    },
+    {
+      "epoch": 0.12297363681021778,
+      "grad_norm": 0.2372014820575714,
+      "learning_rate": 0.00019997282154985568,
+      "loss": 12.2546,
+      "step": 751
+    },
+    {
+      "epoch": 0.12313738333060423,
+      "grad_norm": 0.2063087373971939,
+      "learning_rate": 0.00019997270089644792,
+      "loss": 12.2637,
+      "step": 752
+    },
+    {
+      "epoch": 0.12330112985099066,
+      "grad_norm": 0.18634669482707977,
+      "learning_rate": 0.00019997257997586093,
+      "loss": 12.2695,
+      "step": 753
+    },
+    {
+      "epoch": 0.12346487637137711,
+      "grad_norm": 0.231937438249588,
+      "learning_rate": 0.00019997245878809508,
+      "loss": 12.2457,
+      "step": 754
+    },
+    {
+      "epoch": 0.12362862289176355,
+      "grad_norm": 0.17607882618904114,
+      "learning_rate": 0.0001999723373331506,
+      "loss": 12.2468,
+      "step": 755
+    },
+    {
+      "epoch": 0.12379236941215,
+      "grad_norm": 0.2614947259426117,
+      "learning_rate": 0.00019997221561102787,
+      "loss": 12.2423,
+      "step": 756
+    },
+    {
+      "epoch": 0.12395611593253643,
+      "grad_norm": 0.30709508061408997,
+      "learning_rate": 0.0001999720936217272,
+      "loss": 12.3144,
+      "step": 757
+    },
+    {
+      "epoch": 0.12411986245292288,
+      "grad_norm": 0.13760733604431152,
+      "learning_rate": 0.0001999719713652489,
+      "loss": 12.2767,
+      "step": 758
+    },
+    {
+      "epoch": 0.12428360897330931,
+      "grad_norm": 0.19334371387958527,
+      "learning_rate": 0.00019997184884159336,
+      "loss": 12.2511,
+      "step": 759
+    },
+    {
+      "epoch": 0.12444735549369576,
+      "grad_norm": 0.24581773579120636,
+      "learning_rate": 0.00019997172605076084,
+      "loss": 12.2743,
+      "step": 760
+    },
+    {
+      "epoch": 0.1246111020140822,
+      "grad_norm": 0.19460995495319366,
+      "learning_rate": 0.0001999716029927517,
+      "loss": 12.2359,
+      "step": 761
+    },
+    {
+      "epoch": 0.12477484853446864,
+      "grad_norm": 0.25555893778800964,
+      "learning_rate": 0.00019997147966756623,
+      "loss": 12.2551,
+      "step": 762
+    },
+    {
+      "epoch": 0.12493859505485508,
+      "grad_norm": 0.2933256924152374,
+      "learning_rate": 0.00019997135607520482,
+      "loss": 12.2914,
+      "step": 763
+    },
+    {
+      "epoch": 0.12510234157524153,
+      "grad_norm": 0.14119243621826172,
+      "learning_rate": 0.0001999712322156678,
+      "loss": 12.2373,
+      "step": 764
+    },
+    {
+      "epoch": 0.12526608809562798,
+      "grad_norm": 0.18366780877113342,
+      "learning_rate": 0.00019997110808895542,
+      "loss": 12.2386,
+      "step": 765
+    },
+    {
+      "epoch": 0.1254298346160144,
+      "grad_norm": 0.19770678877830505,
+      "learning_rate": 0.0001999709836950681,
+      "loss": 12.3011,
+      "step": 766
+    },
+    {
+      "epoch": 0.12559358113640084,
+      "grad_norm": 0.17909152805805206,
+      "learning_rate": 0.00019997085903400614,
+      "loss": 12.2668,
+      "step": 767
+    },
+    {
+      "epoch": 0.1257573276567873,
+      "grad_norm": 0.1801517903804779,
+      "learning_rate": 0.00019997073410576985,
+      "loss": 12.2533,
+      "step": 768
+    },
+    {
+      "epoch": 0.12592107417717374,
+      "grad_norm": 0.1824531853199005,
+      "learning_rate": 0.0001999706089103596,
+      "loss": 12.2619,
+      "step": 769
+    },
+    {
+      "epoch": 0.1260848206975602,
+      "grad_norm": 0.2011738419532776,
+      "learning_rate": 0.00019997048344777568,
+      "loss": 12.2424,
+      "step": 770
+    },
+    {
+      "epoch": 0.1262485672179466,
+      "grad_norm": 0.17008745670318604,
+      "learning_rate": 0.00019997035771801848,
+      "loss": 12.2381,
+      "step": 771
+    },
+    {
+      "epoch": 0.12641231373833306,
+      "grad_norm": 0.13051459193229675,
+      "learning_rate": 0.00019997023172108828,
+      "loss": 12.2417,
+      "step": 772
+    },
+    {
+      "epoch": 0.1265760602587195,
+      "grad_norm": 0.18956008553504944,
+      "learning_rate": 0.00019997010545698548,
+      "loss": 12.2951,
+      "step": 773
+    },
+    {
+      "epoch": 0.12673980677910596,
+      "grad_norm": 0.17199958860874176,
+      "learning_rate": 0.00019996997892571036,
+      "loss": 12.2622,
+      "step": 774
+    },
+    {
+      "epoch": 0.12690355329949238,
+      "grad_norm": 0.23833072185516357,
+      "learning_rate": 0.00019996985212726332,
+      "loss": 12.2614,
+      "step": 775
+    },
+    {
+      "epoch": 0.12706729981987883,
+      "grad_norm": 0.1904534250497818,
+      "learning_rate": 0.00019996972506164463,
+      "loss": 12.2211,
+      "step": 776
+    },
+    {
+      "epoch": 0.12723104634026527,
+      "grad_norm": 0.22543828189373016,
+      "learning_rate": 0.00019996959772885468,
+      "loss": 12.2184,
+      "step": 777
+    },
+    {
+      "epoch": 0.12739479286065172,
+      "grad_norm": 0.15488769114017487,
+      "learning_rate": 0.00019996947012889375,
+      "loss": 12.249,
+      "step": 778
+    },
+    {
+      "epoch": 0.12755853938103814,
+      "grad_norm": 0.21749526262283325,
+      "learning_rate": 0.00019996934226176226,
+      "loss": 12.2726,
+      "step": 779
+    },
+    {
+      "epoch": 0.1277222859014246,
+      "grad_norm": 0.2531960904598236,
+      "learning_rate": 0.00019996921412746048,
+      "loss": 12.2587,
+      "step": 780
+    },
+    {
+      "epoch": 0.12788603242181104,
+      "grad_norm": 0.17422235012054443,
+      "learning_rate": 0.0001999690857259888,
+      "loss": 12.2422,
+      "step": 781
+    },
+    {
+      "epoch": 0.1280497789421975,
+      "grad_norm": 0.3590430021286011,
+      "learning_rate": 0.00019996895705734756,
+      "loss": 12.2198,
+      "step": 782
+    },
+    {
+      "epoch": 0.1282135254625839,
+      "grad_norm": 0.24437008798122406,
+      "learning_rate": 0.0001999688281215371,
+      "loss": 12.2169,
+      "step": 783
+    },
+    {
+      "epoch": 0.12837727198297036,
+      "grad_norm": 0.23471295833587646,
+      "learning_rate": 0.00019996869891855773,
+      "loss": 12.2544,
+      "step": 784
+    },
+    {
+      "epoch": 0.1285410185033568,
+      "grad_norm": 0.1604817807674408,
+      "learning_rate": 0.00019996856944840986,
+      "loss": 12.2623,
+      "step": 785
+    },
+    {
+      "epoch": 0.12870476502374326,
+      "grad_norm": 0.3521434962749481,
+      "learning_rate": 0.00019996843971109378,
+      "loss": 12.2066,
+      "step": 786
+    },
+    {
+      "epoch": 0.12886851154412968,
+      "grad_norm": 0.1714937835931778,
+      "learning_rate": 0.00019996830970660985,
+      "loss": 12.306,
+      "step": 787
+    },
+    {
+      "epoch": 0.12903225806451613,
+      "grad_norm": 0.19912664592266083,
+      "learning_rate": 0.00019996817943495842,
+      "loss": 12.2221,
+      "step": 788
+    },
+    {
+      "epoch": 0.12919600458490257,
+      "grad_norm": 0.16007201373577118,
+      "learning_rate": 0.00019996804889613984,
+      "loss": 12.2631,
+      "step": 789
+    },
+    {
+      "epoch": 0.12935975110528902,
+      "grad_norm": 0.18096835911273956,
+      "learning_rate": 0.00019996791809015447,
+      "loss": 12.2448,
+      "step": 790
+    },
+    {
+      "epoch": 0.12952349762567544,
+      "grad_norm": 0.11847981065511703,
+      "learning_rate": 0.00019996778701700265,
+      "loss": 12.2355,
+      "step": 791
+    },
+    {
+      "epoch": 0.1296872441460619,
+      "grad_norm": 0.20610485970973969,
+      "learning_rate": 0.00019996765567668475,
+      "loss": 12.2543,
+      "step": 792
+    },
+    {
+      "epoch": 0.12985099066644834,
+      "grad_norm": 0.18722973763942719,
+      "learning_rate": 0.00019996752406920107,
+      "loss": 12.2384,
+      "step": 793
+    },
+    {
+      "epoch": 0.1300147371868348,
+      "grad_norm": 0.11530886590480804,
+      "learning_rate": 0.00019996739219455199,
+      "loss": 12.2489,
+      "step": 794
+    },
+    {
+      "epoch": 0.1301784837072212,
+      "grad_norm": 0.20853202044963837,
+      "learning_rate": 0.00019996726005273787,
+      "loss": 12.2275,
+      "step": 795
+    },
+    {
+      "epoch": 0.13034223022760766,
+      "grad_norm": 0.16415412724018097,
+      "learning_rate": 0.00019996712764375907,
+      "loss": 12.2659,
+      "step": 796
+    },
+    {
+      "epoch": 0.1305059767479941,
+      "grad_norm": 0.2464807629585266,
+      "learning_rate": 0.00019996699496761594,
+      "loss": 12.278,
+      "step": 797
+    },
+    {
+      "epoch": 0.13066972326838056,
+      "grad_norm": 0.16332204639911652,
+      "learning_rate": 0.00019996686202430878,
+      "loss": 12.2765,
+      "step": 798
+    },
+    {
+      "epoch": 0.13083346978876698,
+      "grad_norm": 0.14092905819416046,
+      "learning_rate": 0.00019996672881383805,
+      "loss": 12.2648,
+      "step": 799
+    },
+    {
+      "epoch": 0.13099721630915342,
+      "grad_norm": 0.15404187142848969,
+      "learning_rate": 0.00019996659533620404,
+      "loss": 12.2341,
+      "step": 800
+    },
+    {
+      "epoch": 0.13116096282953987,
+      "grad_norm": 0.26723140478134155,
+      "learning_rate": 0.0001999664615914071,
+      "loss": 12.2548,
+      "step": 801
+    },
+    {
+      "epoch": 0.13132470934992632,
+      "grad_norm": 0.16581694781780243,
+      "learning_rate": 0.00019996632757944758,
+      "loss": 12.2801,
+      "step": 802
+    },
+    {
+      "epoch": 0.13148845587031274,
+      "grad_norm": 0.15285004675388336,
+      "learning_rate": 0.00019996619330032588,
+      "loss": 12.25,
+      "step": 803
+    },
+    {
+      "epoch": 0.1316522023906992,
+      "grad_norm": 0.25647279620170593,
+      "learning_rate": 0.00019996605875404234,
+      "loss": 12.2543,
+      "step": 804
+    },
+    {
+      "epoch": 0.13181594891108564,
+      "grad_norm": 0.17818157374858856,
+      "learning_rate": 0.00019996592394059732,
+      "loss": 12.269,
+      "step": 805
+    },
+    {
+      "epoch": 0.1319796954314721,
+      "grad_norm": 0.1777380108833313,
+      "learning_rate": 0.00019996578885999117,
+      "loss": 12.2547,
+      "step": 806
+    },
+    {
+      "epoch": 0.13214344195185854,
+      "grad_norm": 0.1474923938512802,
+      "learning_rate": 0.0001999656535122243,
+      "loss": 12.2643,
+      "step": 807
+    },
+    {
+      "epoch": 0.13230718847224496,
+      "grad_norm": 0.24534763395786285,
+      "learning_rate": 0.000199965517897297,
+      "loss": 12.2275,
+      "step": 808
+    },
+    {
+      "epoch": 0.1324709349926314,
+      "grad_norm": 0.148993581533432,
+      "learning_rate": 0.00019996538201520964,
+      "loss": 12.244,
+      "step": 809
+    },
+    {
+      "epoch": 0.13263468151301785,
+      "grad_norm": 0.20542854070663452,
+      "learning_rate": 0.00019996524586596263,
+      "loss": 12.2345,
+      "step": 810
+    },
+    {
+      "epoch": 0.1327984280334043,
+      "grad_norm": 0.1916378140449524,
+      "learning_rate": 0.0001999651094495563,
+      "loss": 12.2586,
+      "step": 811
+    },
+    {
+      "epoch": 0.13296217455379072,
+      "grad_norm": 0.153681218624115,
+      "learning_rate": 0.00019996497276599103,
+      "loss": 12.2098,
+      "step": 812
+    },
+    {
+      "epoch": 0.13312592107417717,
+      "grad_norm": 0.19748173654079437,
+      "learning_rate": 0.00019996483581526722,
+      "loss": 12.2717,
+      "step": 813
+    },
+    {
+      "epoch": 0.13328966759456362,
+      "grad_norm": 0.1617303192615509,
+      "learning_rate": 0.0001999646985973852,
+      "loss": 12.2432,
+      "step": 814
+    },
+    {
+      "epoch": 0.13345341411495007,
+      "grad_norm": 0.16775915026664734,
+      "learning_rate": 0.00019996456111234527,
+      "loss": 12.261,
+      "step": 815
+    },
+    {
+      "epoch": 0.1336171606353365,
+      "grad_norm": 0.36850646138191223,
+      "learning_rate": 0.0001999644233601479,
+      "loss": 12.223,
+      "step": 816
+    },
+    {
+      "epoch": 0.13378090715572294,
+      "grad_norm": 0.3190401792526245,
+      "learning_rate": 0.00019996428534079338,
+      "loss": 12.2746,
+      "step": 817
+    },
+    {
+      "epoch": 0.1339446536761094,
+      "grad_norm": 0.16210894286632538,
+      "learning_rate": 0.00019996414705428217,
+      "loss": 12.2498,
+      "step": 818
+    },
+    {
+      "epoch": 0.13410840019649584,
+      "grad_norm": 0.21445512771606445,
+      "learning_rate": 0.00019996400850061456,
+      "loss": 12.2344,
+      "step": 819
+    },
+    {
+      "epoch": 0.13427214671688226,
+      "grad_norm": 0.3667347729206085,
+      "learning_rate": 0.00019996386967979096,
+      "loss": 12.2759,
+      "step": 820
+    },
+    {
+      "epoch": 0.1344358932372687,
+      "grad_norm": 0.2724533975124359,
+      "learning_rate": 0.00019996373059181174,
+      "loss": 12.2529,
+      "step": 821
+    },
+    {
+      "epoch": 0.13459963975765515,
+      "grad_norm": 0.15931159257888794,
+      "learning_rate": 0.00019996359123667726,
+      "loss": 12.2872,
+      "step": 822
+    },
+    {
+      "epoch": 0.1347633862780416,
+      "grad_norm": 0.12415755540132523,
+      "learning_rate": 0.00019996345161438786,
+      "loss": 12.2219,
+      "step": 823
+    },
+    {
+      "epoch": 0.13492713279842802,
+      "grad_norm": 0.16354034841060638,
+      "learning_rate": 0.00019996331172494395,
+      "loss": 12.2623,
+      "step": 824
+    },
+    {
+      "epoch": 0.13509087931881447,
+      "grad_norm": 0.17267750203609467,
+      "learning_rate": 0.00019996317156834593,
+      "loss": 12.2492,
+      "step": 825
+    },
+    {
+      "epoch": 0.13525462583920092,
+      "grad_norm": 0.1510535180568695,
+      "learning_rate": 0.00019996303114459414,
+      "loss": 12.2511,
+      "step": 826
+    },
+    {
+      "epoch": 0.13541837235958737,
+      "grad_norm": 0.13352897763252258,
+      "learning_rate": 0.00019996289045368895,
+      "loss": 12.2484,
+      "step": 827
+    },
+    {
+      "epoch": 0.1355821188799738,
+      "grad_norm": 0.2510572075843811,
+      "learning_rate": 0.00019996274949563074,
+      "loss": 12.2706,
+      "step": 828
+    },
+    {
+      "epoch": 0.13574586540036024,
+      "grad_norm": 0.1663379818201065,
+      "learning_rate": 0.0001999626082704199,
+      "loss": 12.2767,
+      "step": 829
+    },
+    {
+      "epoch": 0.1359096119207467,
+      "grad_norm": 0.22475042939186096,
+      "learning_rate": 0.0001999624667780568,
+      "loss": 12.2486,
+      "step": 830
+    },
+    {
+      "epoch": 0.13607335844113314,
+      "grad_norm": 0.17449508607387543,
+      "learning_rate": 0.00019996232501854182,
+      "loss": 12.2641,
+      "step": 831
+    },
+    {
+      "epoch": 0.13623710496151956,
+      "grad_norm": 0.19512225687503815,
+      "learning_rate": 0.00019996218299187532,
+      "loss": 12.2233,
+      "step": 832
+    },
+    {
+      "epoch": 0.136400851481906,
+      "grad_norm": 0.2455431967973709,
+      "learning_rate": 0.00019996204069805772,
+      "loss": 12.2716,
+      "step": 833
+    },
+    {
+      "epoch": 0.13656459800229245,
+      "grad_norm": 0.18555228412151337,
+      "learning_rate": 0.00019996189813708938,
+      "loss": 12.2554,
+      "step": 834
+    },
+    {
+      "epoch": 0.1367283445226789,
+      "grad_norm": 0.17858745157718658,
+      "learning_rate": 0.0001999617553089707,
+      "loss": 12.2599,
+      "step": 835
+    },
+    {
+      "epoch": 0.13689209104306532,
+      "grad_norm": 0.23524503409862518,
+      "learning_rate": 0.00019996161221370198,
+      "loss": 12.2692,
+      "step": 836
+    },
+    {
+      "epoch": 0.13705583756345177,
+      "grad_norm": 0.1710844486951828,
+      "learning_rate": 0.0001999614688512837,
+      "loss": 12.2646,
+      "step": 837
+    },
+    {
+      "epoch": 0.13721958408383822,
+      "grad_norm": 0.35525500774383545,
+      "learning_rate": 0.00019996132522171617,
+      "loss": 12.3043,
+      "step": 838
+    },
+    {
+      "epoch": 0.13738333060422467,
+      "grad_norm": 0.13505268096923828,
+      "learning_rate": 0.00019996118132499985,
+      "loss": 12.2324,
+      "step": 839
+    },
+    {
+      "epoch": 0.1375470771246111,
+      "grad_norm": 0.23420777916908264,
+      "learning_rate": 0.00019996103716113508,
+      "loss": 12.262,
+      "step": 840
+    },
+    {
+      "epoch": 0.13771082364499754,
+      "grad_norm": 0.22859814763069153,
+      "learning_rate": 0.00019996089273012225,
+      "loss": 12.2815,
+      "step": 841
+    },
+    {
+      "epoch": 0.137874570165384,
+      "grad_norm": 0.23992104828357697,
+      "learning_rate": 0.0001999607480319617,
+      "loss": 12.2568,
+      "step": 842
+    },
+    {
+      "epoch": 0.13803831668577043,
+      "grad_norm": 0.2638041377067566,
+      "learning_rate": 0.0001999606030666539,
+      "loss": 12.2447,
+      "step": 843
+    },
+    {
+      "epoch": 0.13820206320615686,
+      "grad_norm": 0.15055051445960999,
+      "learning_rate": 0.00019996045783419919,
+      "loss": 12.2772,
+      "step": 844
+    },
+    {
+      "epoch": 0.1383658097265433,
+      "grad_norm": 0.31120771169662476,
+      "learning_rate": 0.00019996031233459797,
+      "loss": 12.2348,
+      "step": 845
+    },
+    {
+      "epoch": 0.13852955624692975,
+      "grad_norm": 0.14593814313411713,
+      "learning_rate": 0.00019996016656785063,
+      "loss": 12.2528,
+      "step": 846
+    },
+    {
+      "epoch": 0.1386933027673162,
+      "grad_norm": 0.28545406460762024,
+      "learning_rate": 0.00019996002053395757,
+      "loss": 12.2506,
+      "step": 847
+    },
+    {
+      "epoch": 0.13885704928770265,
+      "grad_norm": 0.1730160117149353,
+      "learning_rate": 0.00019995987423291914,
+      "loss": 12.2506,
+      "step": 848
+    },
+    {
+      "epoch": 0.13902079580808907,
+      "grad_norm": 0.17055395245552063,
+      "learning_rate": 0.00019995972766473577,
+      "loss": 12.246,
+      "step": 849
+    },
+    {
+      "epoch": 0.13918454232847552,
+      "grad_norm": 0.14731362462043762,
+      "learning_rate": 0.00019995958082940783,
+      "loss": 12.2518,
+      "step": 850
+    },
+    {
+      "epoch": 0.13934828884886197,
+      "grad_norm": 0.3664889633655548,
+      "learning_rate": 0.00019995943372693573,
+      "loss": 12.291,
+      "step": 851
+    },
+    {
+      "epoch": 0.13951203536924842,
+      "grad_norm": 0.23178958892822266,
+      "learning_rate": 0.00019995928635731987,
+      "loss": 12.2564,
+      "step": 852
+    },
+    {
+      "epoch": 0.13967578188963484,
+      "grad_norm": 0.2134544551372528,
+      "learning_rate": 0.0001999591387205606,
+      "loss": 12.2453,
+      "step": 853
+    },
+    {
+      "epoch": 0.13983952841002129,
+      "grad_norm": 0.24148419499397278,
+      "learning_rate": 0.00019995899081665835,
+      "loss": 12.2928,
+      "step": 854
+    },
+    {
+      "epoch": 0.14000327493040773,
+      "grad_norm": 0.18369217216968536,
+      "learning_rate": 0.00019995884264561352,
+      "loss": 12.2615,
+      "step": 855
+    },
+    {
+      "epoch": 0.14016702145079418,
+      "grad_norm": 0.1529937982559204,
+      "learning_rate": 0.0001999586942074265,
+      "loss": 12.2502,
+      "step": 856
+    },
+    {
+      "epoch": 0.1403307679711806,
+      "grad_norm": 0.21245932579040527,
+      "learning_rate": 0.00019995854550209762,
+      "loss": 12.2318,
+      "step": 857
+    },
+    {
+      "epoch": 0.14049451449156705,
+      "grad_norm": 0.19340169429779053,
+      "learning_rate": 0.0001999583965296274,
+      "loss": 12.2613,
+      "step": 858
+    },
+    {
+      "epoch": 0.1406582610119535,
+      "grad_norm": 0.16757652163505554,
+      "learning_rate": 0.00019995824729001615,
+      "loss": 12.2642,
+      "step": 859
+    },
+    {
+      "epoch": 0.14082200753233995,
+      "grad_norm": 0.1938764750957489,
+      "learning_rate": 0.0001999580977832643,
+      "loss": 12.263,
+      "step": 860
+    },
+    {
+      "epoch": 0.14098575405272637,
+      "grad_norm": 0.15432967245578766,
+      "learning_rate": 0.00019995794800937227,
+      "loss": 12.2581,
+      "step": 861
+    },
+    {
+      "epoch": 0.14114950057311282,
+      "grad_norm": 0.17737895250320435,
+      "learning_rate": 0.00019995779796834042,
+      "loss": 12.2488,
+      "step": 862
+    },
+    {
+      "epoch": 0.14131324709349927,
+      "grad_norm": 0.21752485632896423,
+      "learning_rate": 0.00019995764766016913,
+      "loss": 12.2289,
+      "step": 863
+    },
+    {
+      "epoch": 0.14147699361388572,
+      "grad_norm": 0.21756303310394287,
+      "learning_rate": 0.00019995749708485888,
+      "loss": 12.2532,
+      "step": 864
+    },
+    {
+      "epoch": 0.14164074013427214,
+      "grad_norm": 0.18429669737815857,
+      "learning_rate": 0.00019995734624241002,
+      "loss": 12.2623,
+      "step": 865
+    },
+    {
+      "epoch": 0.14180448665465858,
+      "grad_norm": 0.26883766055107117,
+      "learning_rate": 0.00019995719513282297,
+      "loss": 12.246,
+      "step": 866
+    },
+    {
+      "epoch": 0.14196823317504503,
+      "grad_norm": 0.16113273799419403,
+      "learning_rate": 0.00019995704375609812,
+      "loss": 12.2727,
+      "step": 867
+    },
+    {
+      "epoch": 0.14213197969543148,
+      "grad_norm": 0.18849468231201172,
+      "learning_rate": 0.00019995689211223589,
+      "loss": 12.2752,
+      "step": 868
+    },
+    {
+      "epoch": 0.1422957262158179,
+      "grad_norm": 0.18139471113681793,
+      "learning_rate": 0.00019995674020123664,
+      "loss": 12.2799,
+      "step": 869
+    },
+    {
+      "epoch": 0.14245947273620435,
+      "grad_norm": 0.1637876331806183,
+      "learning_rate": 0.00019995658802310085,
+      "loss": 12.2431,
+      "step": 870
+    },
+    {
+      "epoch": 0.1426232192565908,
+      "grad_norm": 0.2694747745990753,
+      "learning_rate": 0.00019995643557782887,
+      "loss": 12.2353,
+      "step": 871
+    },
+    {
+      "epoch": 0.14278696577697725,
+      "grad_norm": 0.18601356446743011,
+      "learning_rate": 0.00019995628286542113,
+      "loss": 12.2567,
+      "step": 872
+    },
+    {
+      "epoch": 0.14295071229736367,
+      "grad_norm": 0.15186317265033722,
+      "learning_rate": 0.00019995612988587803,
+      "loss": 12.2468,
+      "step": 873
+    },
+    {
+      "epoch": 0.14311445881775012,
+      "grad_norm": 0.1920851171016693,
+      "learning_rate": 0.00019995597663920002,
+      "loss": 12.2523,
+      "step": 874
+    },
+    {
+      "epoch": 0.14327820533813657,
+      "grad_norm": 0.16891133785247803,
+      "learning_rate": 0.00019995582312538744,
+      "loss": 12.258,
+      "step": 875
+    },
+    {
+      "epoch": 0.14344195185852301,
+      "grad_norm": 0.20723503828048706,
+      "learning_rate": 0.00019995566934444075,
+      "loss": 12.261,
+      "step": 876
+    },
+    {
+      "epoch": 0.14360569837890944,
+      "grad_norm": 0.1818312555551529,
+      "learning_rate": 0.00019995551529636033,
+      "loss": 12.2568,
+      "step": 877
+    },
+    {
+      "epoch": 0.14376944489929588,
+      "grad_norm": 0.2277580350637436,
+      "learning_rate": 0.0001999553609811466,
+      "loss": 12.2225,
+      "step": 878
+    },
+    {
+      "epoch": 0.14393319141968233,
+      "grad_norm": 0.2139032483100891,
+      "learning_rate": 0.00019995520639880002,
+      "loss": 12.2652,
+      "step": 879
+    },
+    {
+      "epoch": 0.14409693794006878,
+      "grad_norm": 0.14729733765125275,
+      "learning_rate": 0.00019995505154932092,
+      "loss": 12.2582,
+      "step": 880
+    },
+    {
+      "epoch": 0.1442606844604552,
+      "grad_norm": 0.2961966395378113,
+      "learning_rate": 0.00019995489643270976,
+      "loss": 12.256,
+      "step": 881
+    },
+    {
+      "epoch": 0.14442443098084165,
+      "grad_norm": 0.25167980790138245,
+      "learning_rate": 0.00019995474104896697,
+      "loss": 12.2299,
+      "step": 882
+    },
+    {
+      "epoch": 0.1445881775012281,
+      "grad_norm": 0.19484156370162964,
+      "learning_rate": 0.00019995458539809292,
+      "loss": 12.2448,
+      "step": 883
+    },
+    {
+      "epoch": 0.14475192402161455,
+      "grad_norm": 0.28314441442489624,
+      "learning_rate": 0.0001999544294800881,
+      "loss": 12.2692,
+      "step": 884
+    },
+    {
+      "epoch": 0.144915670542001,
+      "grad_norm": 0.2943701446056366,
+      "learning_rate": 0.00019995427329495282,
+      "loss": 12.2277,
+      "step": 885
+    },
+    {
+      "epoch": 0.14507941706238742,
+      "grad_norm": 0.30075713992118835,
+      "learning_rate": 0.0001999541168426876,
+      "loss": 12.2477,
+      "step": 886
+    },
+    {
+      "epoch": 0.14524316358277387,
+      "grad_norm": 0.19734449684619904,
+      "learning_rate": 0.0001999539601232928,
+      "loss": 12.249,
+      "step": 887
+    },
+    {
+      "epoch": 0.14540691010316031,
+      "grad_norm": 0.23548541963100433,
+      "learning_rate": 0.00019995380313676884,
+      "loss": 12.267,
+      "step": 888
+    },
+    {
+      "epoch": 0.14557065662354676,
+      "grad_norm": 0.12990887463092804,
+      "learning_rate": 0.00019995364588311617,
+      "loss": 12.2397,
+      "step": 889
+    },
+    {
+      "epoch": 0.14573440314393318,
+      "grad_norm": 0.1529313325881958,
+      "learning_rate": 0.00019995348836233516,
+      "loss": 12.2598,
+      "step": 890
+    },
+    {
+      "epoch": 0.14589814966431963,
+      "grad_norm": 0.17717255651950836,
+      "learning_rate": 0.0001999533305744263,
+      "loss": 12.2697,
+      "step": 891
+    },
+    {
+      "epoch": 0.14606189618470608,
+      "grad_norm": 0.1448763757944107,
+      "learning_rate": 0.00019995317251938994,
+      "loss": 12.2429,
+      "step": 892
+    },
+    {
+      "epoch": 0.14622564270509253,
+      "grad_norm": 0.18021667003631592,
+      "learning_rate": 0.00019995301419722657,
+      "loss": 12.2432,
+      "step": 893
+    },
+    {
+      "epoch": 0.14638938922547895,
+      "grad_norm": 0.2443927526473999,
+      "learning_rate": 0.00019995285560793656,
+      "loss": 12.2451,
+      "step": 894
+    },
+    {
+      "epoch": 0.1465531357458654,
+      "grad_norm": 0.2532816231250763,
+      "learning_rate": 0.00019995269675152037,
+      "loss": 12.2565,
+      "step": 895
+    },
+    {
+      "epoch": 0.14671688226625185,
+      "grad_norm": 0.24490387737751007,
+      "learning_rate": 0.00019995253762797842,
+      "loss": 12.2618,
+      "step": 896
+    },
+    {
+      "epoch": 0.1468806287866383,
+      "grad_norm": 0.16988995671272278,
+      "learning_rate": 0.00019995237823731109,
+      "loss": 12.2267,
+      "step": 897
+    },
+    {
+      "epoch": 0.14704437530702472,
+      "grad_norm": 0.1722571700811386,
+      "learning_rate": 0.00019995221857951884,
+      "loss": 12.2419,
+      "step": 898
+    },
+    {
+      "epoch": 0.14720812182741116,
+      "grad_norm": 0.26967161893844604,
+      "learning_rate": 0.00019995205865460213,
+      "loss": 12.2608,
+      "step": 899
+    },
+    {
+      "epoch": 0.1473718683477976,
+      "grad_norm": 0.16842836141586304,
+      "learning_rate": 0.00019995189846256132,
+      "loss": 12.2694,
+      "step": 900
+    },
+    {
+      "epoch": 0.14753561486818406,
+      "grad_norm": 0.16113752126693726,
+      "learning_rate": 0.00019995173800339692,
+      "loss": 12.2416,
+      "step": 901
+    },
+    {
+      "epoch": 0.14769936138857048,
+      "grad_norm": 0.23641365766525269,
+      "learning_rate": 0.00019995157727710928,
+      "loss": 12.2583,
+      "step": 902
+    },
+    {
+      "epoch": 0.14786310790895693,
+      "grad_norm": 0.15737037360668182,
+      "learning_rate": 0.00019995141628369883,
+      "loss": 12.2499,
+      "step": 903
+    },
+    {
+      "epoch": 0.14802685442934338,
+      "grad_norm": 0.17378929257392883,
+      "learning_rate": 0.0001999512550231661,
+      "loss": 12.2601,
+      "step": 904
+    },
+    {
+      "epoch": 0.14819060094972983,
+      "grad_norm": 0.16688664257526398,
+      "learning_rate": 0.00019995109349551143,
+      "loss": 12.2777,
+      "step": 905
+    },
+    {
+      "epoch": 0.14835434747011625,
+      "grad_norm": 0.1975651979446411,
+      "learning_rate": 0.00019995093170073523,
+      "loss": 12.2414,
+      "step": 906
+    },
+    {
+      "epoch": 0.1485180939905027,
+      "grad_norm": 0.16175058484077454,
+      "learning_rate": 0.00019995076963883802,
+      "loss": 12.2685,
+      "step": 907
+    },
+    {
+      "epoch": 0.14868184051088915,
+      "grad_norm": 0.21172209084033966,
+      "learning_rate": 0.00019995060730982017,
+      "loss": 12.2506,
+      "step": 908
+    },
+    {
+      "epoch": 0.1488455870312756,
+      "grad_norm": 0.16677246987819672,
+      "learning_rate": 0.00019995044471368215,
+      "loss": 12.2602,
+      "step": 909
+    },
+    {
+      "epoch": 0.14900933355166202,
+      "grad_norm": 0.1518072932958603,
+      "learning_rate": 0.00019995028185042438,
+      "loss": 12.2631,
+      "step": 910
+    },
+    {
+      "epoch": 0.14917308007204846,
+      "grad_norm": 0.17491476237773895,
+      "learning_rate": 0.00019995011872004726,
+      "loss": 12.2561,
+      "step": 911
+    },
+    {
+      "epoch": 0.1493368265924349,
+      "grad_norm": 0.2581184208393097,
+      "learning_rate": 0.00019994995532255128,
+      "loss": 12.252,
+      "step": 912
+    },
+    {
+      "epoch": 0.14950057311282136,
+      "grad_norm": 0.15219415724277496,
+      "learning_rate": 0.00019994979165793684,
+      "loss": 12.2523,
+      "step": 913
+    },
+    {
+      "epoch": 0.14966431963320778,
+      "grad_norm": 0.16342870891094208,
+      "learning_rate": 0.00019994962772620442,
+      "loss": 12.2635,
+      "step": 914
+    },
+    {
+      "epoch": 0.14982806615359423,
+      "grad_norm": 0.25702103972435,
+      "learning_rate": 0.00019994946352735443,
+      "loss": 12.2455,
+      "step": 915
+    },
+    {
+      "epoch": 0.14999181267398068,
+      "grad_norm": 0.27089154720306396,
+      "learning_rate": 0.00019994929906138727,
+      "loss": 12.2748,
+      "step": 916
+    },
+    {
+      "epoch": 0.15015555919436713,
+      "grad_norm": 0.22161947190761566,
+      "learning_rate": 0.00019994913432830344,
+      "loss": 12.2711,
+      "step": 917
+    },
+    {
+      "epoch": 0.15031930571475355,
+      "grad_norm": 0.21250496804714203,
+      "learning_rate": 0.00019994896932810338,
+      "loss": 12.2435,
+      "step": 918
+    },
+    {
+      "epoch": 0.15048305223514,
+      "grad_norm": 0.21829836070537567,
+      "learning_rate": 0.00019994880406078752,
+      "loss": 12.2737,
+      "step": 919
+    },
+    {
+      "epoch": 0.15064679875552645,
+      "grad_norm": 0.2482108175754547,
+      "learning_rate": 0.00019994863852635625,
+      "loss": 12.2545,
+      "step": 920
+    },
+    {
+      "epoch": 0.1508105452759129,
+      "grad_norm": 0.20951540768146515,
+      "learning_rate": 0.00019994847272481007,
+      "loss": 12.2355,
+      "step": 921
+    },
+    {
+      "epoch": 0.15097429179629934,
+      "grad_norm": 0.18365690112113953,
+      "learning_rate": 0.00019994830665614943,
+      "loss": 12.2399,
+      "step": 922
+    },
+    {
+      "epoch": 0.15113803831668576,
+      "grad_norm": 0.22276079654693604,
+      "learning_rate": 0.0001999481403203747,
+      "loss": 12.3016,
+      "step": 923
+    },
+    {
+      "epoch": 0.1513017848370722,
+      "grad_norm": 0.2162826955318451,
+      "learning_rate": 0.00019994797371748643,
+      "loss": 12.2524,
+      "step": 924
+    },
+    {
+      "epoch": 0.15146553135745866,
+      "grad_norm": 0.21627525985240936,
+      "learning_rate": 0.000199947806847485,
+      "loss": 12.2235,
+      "step": 925
+    },
+    {
+      "epoch": 0.1516292778778451,
+      "grad_norm": 0.1927967220544815,
+      "learning_rate": 0.00019994763971037087,
+      "loss": 12.2738,
+      "step": 926
+    },
+    {
+      "epoch": 0.15179302439823153,
+      "grad_norm": 0.16526252031326294,
+      "learning_rate": 0.0001999474723061445,
+      "loss": 12.2413,
+      "step": 927
+    },
+    {
+      "epoch": 0.15195677091861798,
+      "grad_norm": 0.23166383802890778,
+      "learning_rate": 0.00019994730463480625,
+      "loss": 12.2622,
+      "step": 928
+    },
+    {
+      "epoch": 0.15212051743900443,
+      "grad_norm": 0.14684942364692688,
+      "learning_rate": 0.0001999471366963567,
+      "loss": 12.3034,
+      "step": 929
+    },
+    {
+      "epoch": 0.15228426395939088,
+      "grad_norm": 0.20183612406253815,
+      "learning_rate": 0.0001999469684907962,
+      "loss": 12.2465,
+      "step": 930
+    },
+    {
+      "epoch": 0.1524480104797773,
+      "grad_norm": 0.2674098312854767,
+      "learning_rate": 0.00019994680001812526,
+      "loss": 12.2674,
+      "step": 931
+    },
+    {
+      "epoch": 0.15261175700016374,
+      "grad_norm": 0.25609856843948364,
+      "learning_rate": 0.00019994663127834432,
+      "loss": 12.2579,
+      "step": 932
+    },
+    {
+      "epoch": 0.1527755035205502,
+      "grad_norm": 0.208163782954216,
+      "learning_rate": 0.0001999464622714538,
+      "loss": 12.2191,
+      "step": 933
+    },
+    {
+      "epoch": 0.15293925004093664,
+      "grad_norm": 0.23667578399181366,
+      "learning_rate": 0.0001999462929974542,
+      "loss": 12.2729,
+      "step": 934
+    },
+    {
+      "epoch": 0.15310299656132306,
+      "grad_norm": 0.1899147778749466,
+      "learning_rate": 0.00019994612345634592,
+      "loss": 12.2478,
+      "step": 935
+    },
+    {
+      "epoch": 0.1532667430817095,
+      "grad_norm": 0.2460227757692337,
+      "learning_rate": 0.00019994595364812944,
+      "loss": 12.2444,
+      "step": 936
+    },
+    {
+      "epoch": 0.15343048960209596,
+      "grad_norm": 0.27262449264526367,
+      "learning_rate": 0.0001999457835728052,
+      "loss": 12.239,
+      "step": 937
+    },
+    {
+      "epoch": 0.1535942361224824,
+      "grad_norm": 0.25000107288360596,
+      "learning_rate": 0.0001999456132303737,
+      "loss": 12.258,
+      "step": 938
+    },
+    {
+      "epoch": 0.15375798264286883,
+      "grad_norm": 0.21081086993217468,
+      "learning_rate": 0.0001999454426208353,
+      "loss": 12.2568,
+      "step": 939
+    },
+    {
+      "epoch": 0.15392172916325528,
+      "grad_norm": 0.27827292680740356,
+      "learning_rate": 0.00019994527174419056,
+      "loss": 12.2283,
+      "step": 940
+    },
+    {
+      "epoch": 0.15408547568364173,
+      "grad_norm": 0.26305484771728516,
+      "learning_rate": 0.00019994510060043988,
+      "loss": 12.2749,
+      "step": 941
+    },
+    {
+      "epoch": 0.15424922220402817,
+      "grad_norm": 0.21291232109069824,
+      "learning_rate": 0.00019994492918958372,
+      "loss": 12.2384,
+      "step": 942
+    },
+    {
+      "epoch": 0.1544129687244146,
+      "grad_norm": 0.17280226945877075,
+      "learning_rate": 0.00019994475751162257,
+      "loss": 12.2729,
+      "step": 943
+    },
+    {
+      "epoch": 0.15457671524480104,
+      "grad_norm": 0.16324912011623383,
+      "learning_rate": 0.00019994458556655685,
+      "loss": 12.2481,
+      "step": 944
+    },
+    {
+      "epoch": 0.1547404617651875,
+      "grad_norm": 0.18047533929347992,
+      "learning_rate": 0.00019994441335438704,
+      "loss": 12.2624,
+      "step": 945
+    },
+    {
+      "epoch": 0.15490420828557394,
+      "grad_norm": 0.1873340606689453,
+      "learning_rate": 0.0001999442408751136,
+      "loss": 12.2531,
+      "step": 946
+    },
+    {
+      "epoch": 0.15506795480596036,
+      "grad_norm": 0.30801185965538025,
+      "learning_rate": 0.000199944068128737,
+      "loss": 12.3044,
+      "step": 947
+    },
+    {
+      "epoch": 0.1552317013263468,
+      "grad_norm": 0.23735474050045013,
+      "learning_rate": 0.0001999438951152577,
+      "loss": 12.2282,
+      "step": 948
+    },
+    {
+      "epoch": 0.15539544784673326,
+      "grad_norm": 0.18847191333770752,
+      "learning_rate": 0.0001999437218346761,
+      "loss": 12.2607,
+      "step": 949
+    },
+    {
+      "epoch": 0.1555591943671197,
+      "grad_norm": 0.2875593900680542,
+      "learning_rate": 0.00019994354828699275,
+      "loss": 12.2203,
+      "step": 950
+    },
+    {
+      "epoch": 0.15572294088750613,
+      "grad_norm": 0.20091582834720612,
+      "learning_rate": 0.00019994337447220808,
+      "loss": 12.2733,
+      "step": 951
+    },
+    {
+      "epoch": 0.15588668740789258,
+      "grad_norm": 0.20508666336536407,
+      "learning_rate": 0.00019994320039032253,
+      "loss": 12.2486,
+      "step": 952
+    },
+    {
+      "epoch": 0.15605043392827903,
+      "grad_norm": 0.20233626663684845,
+      "learning_rate": 0.0001999430260413366,
+      "loss": 12.2954,
+      "step": 953
+    },
+    {
+      "epoch": 0.15621418044866547,
+      "grad_norm": 0.15181776881217957,
+      "learning_rate": 0.00019994285142525074,
+      "loss": 12.239,
+      "step": 954
+    },
+    {
+      "epoch": 0.1563779269690519,
+      "grad_norm": 0.1466662883758545,
+      "learning_rate": 0.00019994267654206544,
+      "loss": 12.2513,
+      "step": 955
+    },
+    {
+      "epoch": 0.15654167348943834,
+      "grad_norm": 0.3135926127433777,
+      "learning_rate": 0.00019994250139178114,
+      "loss": 12.2292,
+      "step": 956
+    },
+    {
+      "epoch": 0.1567054200098248,
+      "grad_norm": 0.23405830562114716,
+      "learning_rate": 0.00019994232597439832,
+      "loss": 12.2669,
+      "step": 957
+    },
+    {
+      "epoch": 0.15686916653021124,
+      "grad_norm": 0.20362041890621185,
+      "learning_rate": 0.00019994215028991744,
+      "loss": 12.2616,
+      "step": 958
+    },
+    {
+      "epoch": 0.15703291305059766,
+      "grad_norm": 0.1632128357887268,
+      "learning_rate": 0.00019994197433833896,
+      "loss": 12.222,
+      "step": 959
+    },
+    {
+      "epoch": 0.1571966595709841,
+      "grad_norm": 0.1670769453048706,
+      "learning_rate": 0.00019994179811966335,
+      "loss": 12.2788,
+      "step": 960
+    },
+    {
+      "epoch": 0.15736040609137056,
+      "grad_norm": 0.26559993624687195,
+      "learning_rate": 0.00019994162163389115,
+      "loss": 12.2452,
+      "step": 961
+    },
+    {
+      "epoch": 0.157524152611757,
+      "grad_norm": 0.26576483249664307,
+      "learning_rate": 0.00019994144488102276,
+      "loss": 12.2494,
+      "step": 962
+    },
+    {
+      "epoch": 0.15768789913214346,
+      "grad_norm": 0.13807259500026703,
+      "learning_rate": 0.00019994126786105864,
+      "loss": 12.2752,
+      "step": 963
+    },
+    {
+      "epoch": 0.15785164565252988,
+      "grad_norm": 0.155136376619339,
+      "learning_rate": 0.0001999410905739993,
+      "loss": 12.3317,
+      "step": 964
+    },
+    {
+      "epoch": 0.15801539217291632,
+      "grad_norm": 0.19782254099845886,
+      "learning_rate": 0.00019994091301984526,
+      "loss": 12.2419,
+      "step": 965
+    },
+    {
+      "epoch": 0.15817913869330277,
+      "grad_norm": 0.2084280252456665,
+      "learning_rate": 0.0001999407351985969,
+      "loss": 12.2583,
+      "step": 966
+    },
+    {
+      "epoch": 0.15834288521368922,
+      "grad_norm": 0.21387748420238495,
+      "learning_rate": 0.00019994055711025472,
+      "loss": 12.2722,
+      "step": 967
+    },
+    {
+      "epoch": 0.15850663173407564,
+      "grad_norm": 0.45252978801727295,
+      "learning_rate": 0.00019994037875481924,
+      "loss": 12.2957,
+      "step": 968
+    },
+    {
+      "epoch": 0.1586703782544621,
+      "grad_norm": 0.19610193371772766,
+      "learning_rate": 0.00019994020013229088,
+      "loss": 12.2351,
+      "step": 969
+    },
+    {
+      "epoch": 0.15883412477484854,
+      "grad_norm": 0.12732064723968506,
+      "learning_rate": 0.00019994002124267018,
+      "loss": 12.2691,
+      "step": 970
+    },
+    {
+      "epoch": 0.158997871295235,
+      "grad_norm": 0.1697031557559967,
+      "learning_rate": 0.00019993984208595758,
+      "loss": 12.2162,
+      "step": 971
+    },
+    {
+      "epoch": 0.1591616178156214,
+      "grad_norm": 0.3649829030036926,
+      "learning_rate": 0.00019993966266215355,
+      "loss": 12.265,
+      "step": 972
+    },
+    {
+      "epoch": 0.15932536433600786,
+      "grad_norm": 0.26839733123779297,
+      "learning_rate": 0.0001999394829712586,
+      "loss": 12.2342,
+      "step": 973
+    },
+    {
+      "epoch": 0.1594891108563943,
+      "grad_norm": 0.22787222266197205,
+      "learning_rate": 0.00019993930301327316,
+      "loss": 12.2668,
+      "step": 974
+    },
+    {
+      "epoch": 0.15965285737678075,
+      "grad_norm": 0.16727082431316376,
+      "learning_rate": 0.00019993912278819777,
+      "loss": 12.25,
+      "step": 975
+    },
+    {
+      "epoch": 0.15981660389716718,
+      "grad_norm": 0.17583709955215454,
+      "learning_rate": 0.00019993894229603288,
+      "loss": 12.2442,
+      "step": 976
+    },
+    {
+      "epoch": 0.15998035041755362,
+      "grad_norm": 0.16142424941062927,
+      "learning_rate": 0.00019993876153677899,
+      "loss": 12.2763,
+      "step": 977
+    },
+    {
+      "epoch": 0.16014409693794007,
+      "grad_norm": 0.17407581210136414,
+      "learning_rate": 0.00019993858051043655,
+      "loss": 12.2548,
+      "step": 978
+    },
+    {
+      "epoch": 0.16030784345832652,
+      "grad_norm": 0.27629950642585754,
+      "learning_rate": 0.0001999383992170061,
+      "loss": 12.2643,
+      "step": 979
+    },
+    {
+      "epoch": 0.16047158997871294,
+      "grad_norm": 0.2096289098262787,
+      "learning_rate": 0.00019993821765648803,
+      "loss": 12.2526,
+      "step": 980
+    },
+    {
+      "epoch": 0.1606353364990994,
+      "grad_norm": 0.2241704910993576,
+      "learning_rate": 0.00019993803582888292,
+      "loss": 12.283,
+      "step": 981
+    },
+    {
+      "epoch": 0.16079908301948584,
+      "grad_norm": 0.1575346291065216,
+      "learning_rate": 0.00019993785373419122,
+      "loss": 12.2503,
+      "step": 982
+    },
+    {
+      "epoch": 0.1609628295398723,
+      "grad_norm": 0.3286789655685425,
+      "learning_rate": 0.0001999376713724134,
+      "loss": 12.2153,
+      "step": 983
+    },
+    {
+      "epoch": 0.1611265760602587,
+      "grad_norm": 0.17322884500026703,
+      "learning_rate": 0.00019993748874355,
+      "loss": 12.2841,
+      "step": 984
+    },
+    {
+      "epoch": 0.16129032258064516,
+      "grad_norm": 0.18421022593975067,
+      "learning_rate": 0.00019993730584760147,
+      "loss": 12.2642,
+      "step": 985
+    },
+    {
+      "epoch": 0.1614540691010316,
+      "grad_norm": 0.2500661611557007,
+      "learning_rate": 0.00019993712268456828,
+      "loss": 12.2554,
+      "step": 986
+    },
+    {
+      "epoch": 0.16161781562141805,
+      "grad_norm": 0.13105358183383942,
+      "learning_rate": 0.00019993693925445094,
+      "loss": 12.261,
+      "step": 987
+    },
+    {
+      "epoch": 0.16178156214180447,
+      "grad_norm": 0.1763312965631485,
+      "learning_rate": 0.00019993675555724996,
+      "loss": 12.2281,
+      "step": 988
+    },
+    {
+      "epoch": 0.16194530866219092,
+      "grad_norm": 0.14316731691360474,
+      "learning_rate": 0.00019993657159296578,
+      "loss": 12.2557,
+      "step": 989
+    },
+    {
+      "epoch": 0.16210905518257737,
+      "grad_norm": 0.19045111536979675,
+      "learning_rate": 0.00019993638736159896,
+      "loss": 12.2593,
+      "step": 990
+    },
+    {
+      "epoch": 0.16227280170296382,
+      "grad_norm": 0.15188392996788025,
+      "learning_rate": 0.00019993620286314995,
+      "loss": 12.2705,
+      "step": 991
+    },
+    {
+      "epoch": 0.16243654822335024,
+      "grad_norm": 0.26927465200424194,
+      "learning_rate": 0.00019993601809761922,
+      "loss": 12.3048,
+      "step": 992
+    },
+    {
+      "epoch": 0.1626002947437367,
+      "grad_norm": 0.2550506591796875,
+      "learning_rate": 0.00019993583306500732,
+      "loss": 12.2609,
+      "step": 993
+    },
+    {
+      "epoch": 0.16276404126412314,
+      "grad_norm": 0.27006834745407104,
+      "learning_rate": 0.0001999356477653147,
+      "loss": 12.2305,
+      "step": 994
+    },
+    {
+      "epoch": 0.1629277877845096,
+      "grad_norm": 0.1495426744222641,
+      "learning_rate": 0.00019993546219854188,
+      "loss": 12.2397,
+      "step": 995
+    },
+    {
+      "epoch": 0.163091534304896,
+      "grad_norm": 0.1674230992794037,
+      "learning_rate": 0.00019993527636468937,
+      "loss": 12.2468,
+      "step": 996
+    },
+    {
+      "epoch": 0.16325528082528246,
+      "grad_norm": 0.1745332032442093,
+      "learning_rate": 0.0001999350902637576,
+      "loss": 12.2406,
+      "step": 997
+    },
+    {
+      "epoch": 0.1634190273456689,
+      "grad_norm": 0.1422467827796936,
+      "learning_rate": 0.00019993490389574715,
+      "loss": 12.2305,
+      "step": 998
+    },
+    {
+      "epoch": 0.16358277386605535,
+      "grad_norm": 0.21741586923599243,
+      "learning_rate": 0.0001999347172606585,
+      "loss": 12.2306,
+      "step": 999
+    },
+    {
+      "epoch": 0.1637465203864418,
+      "grad_norm": 0.15799422562122345,
+      "learning_rate": 0.00019993453035849207,
+      "loss": 12.2591,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1637465203864418,
+      "eval_loss": 12.25197696685791,
+      "eval_runtime": 7.2766,
+      "eval_samples_per_second": 33.807,
+      "eval_steps_per_second": 16.903,
+      "step": 1000
     }
   ],
   "logging_steps": 1,
-  "max_steps": 18321,
+  "max_steps": 61070,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 3,
-  "save_steps": 100,
+  "num_train_epochs": 10,
+  "save_steps": 1000,
   "stateful_callbacks": {
     "EarlyStoppingCallback": {
       "args": {
-        "early_stopping_patience": 4,
+        "early_stopping_patience": 3,
         "early_stopping_threshold": 0.0
       },
       "attributes": {
@@ -3583,7 +7051,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 320946634752.0,
+  "total_flos": 641772748800.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null