diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1000/trainer_state.json"
@@ -0,0 +1,7050 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6916825177243645,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0006916825177243646,
+      "grad_norm": 0.21880632638931274,
+      "learning_rate": 0.0,
+      "loss": 2.5104,
+      "step": 1
+    },
+    {
+      "epoch": 0.0013833650354487291,
+      "grad_norm": 0.2225637435913086,
+      "learning_rate": 1.36986301369863e-05,
+      "loss": 2.7879,
+      "step": 2
+    },
+    {
+      "epoch": 0.0020750475531730937,
+      "grad_norm": 0.21454782783985138,
+      "learning_rate": 2.73972602739726e-05,
+      "loss": 2.8515,
+      "step": 3
+    },
+    {
+      "epoch": 0.0027667300708974583,
+      "grad_norm": 0.285408616065979,
+      "learning_rate": 4.1095890410958905e-05,
+      "loss": 2.8123,
+      "step": 4
+    },
+    {
+      "epoch": 0.0034584125886218224,
+      "grad_norm": 0.231473907828331,
+      "learning_rate": 5.47945205479452e-05,
+      "loss": 2.5181,
+      "step": 5
+    },
+    {
+      "epoch": 0.004150095106346187,
+      "grad_norm": 0.20668023824691772,
+      "learning_rate": 6.84931506849315e-05,
+      "loss": 2.066,
+      "step": 6
+    },
+    {
+      "epoch": 0.0048417776240705515,
+      "grad_norm": 0.3103766441345215,
+      "learning_rate": 8.219178082191781e-05,
+      "loss": 2.0647,
+      "step": 7
+    },
+    {
+      "epoch": 0.0055334601417949165,
+      "grad_norm": 0.3536206781864166,
+      "learning_rate": 9.58904109589041e-05,
+      "loss": 2.1267,
+      "step": 8
+    },
+    {
+      "epoch": 0.006225142659519281,
+      "grad_norm": 0.2078174203634262,
+      "learning_rate": 0.0001095890410958904,
+      "loss": 2.5859,
+      "step": 9
+    },
+    {
+      "epoch": 0.006916825177243645,
+      "grad_norm": 0.2702614367008209,
+      "learning_rate": 0.0001232876712328767,
+      "loss": 2.7732,
+      "step": 10
+    },
+    {
+      "epoch": 0.00760850769496801,
+      "grad_norm": 0.348145067691803,
+      "learning_rate": 0.000136986301369863,
+      "loss": 2.6776,
+      "step": 11
+    },
+    {
+      "epoch": 0.008300190212692375,
+      "grad_norm": 0.32872459292411804,
+      "learning_rate": 0.0001506849315068493,
+      "loss": 1.5802,
+      "step": 12
+    },
+    {
+      "epoch": 0.008991872730416739,
+      "grad_norm": 0.7062669992446899,
+      "learning_rate": 0.00016438356164383562,
+      "loss": 1.9955,
+      "step": 13
+    },
+    {
+      "epoch": 0.009683555248141103,
+      "grad_norm": 0.252165287733078,
+      "learning_rate": 0.00017808219178082192,
+      "loss": 2.9662,
+      "step": 14
+    },
+    {
+      "epoch": 0.010375237765865467,
+      "grad_norm": 0.29550454020500183,
+      "learning_rate": 0.0001917808219178082,
+      "loss": 2.3048,
+      "step": 15
+    },
+    {
+      "epoch": 0.011066920283589833,
+      "grad_norm": 0.3947546184062958,
+      "learning_rate": 0.0002054794520547945,
+      "loss": 2.8061,
+      "step": 16
+    },
+    {
+      "epoch": 0.011758602801314197,
+      "grad_norm": 0.26285308599472046,
+      "learning_rate": 0.0002191780821917808,
+      "loss": 2.7137,
+      "step": 17
+    },
+    {
+      "epoch": 0.012450285319038561,
+      "grad_norm": 0.3446462154388428,
+      "learning_rate": 0.00023287671232876712,
+      "loss": 2.3531,
+      "step": 18
+    },
+    {
+      "epoch": 0.013141967836762926,
+      "grad_norm": 0.2948848307132721,
+      "learning_rate": 0.0002465753424657534,
+      "loss": 2.2846,
+      "step": 19
+    },
+    {
+      "epoch": 0.01383365035448729,
+      "grad_norm": 0.3657473027706146,
+      "learning_rate": 0.0002602739726027397,
+      "loss": 2.4455,
+      "step": 20
+    },
+    {
+      "epoch": 0.014525332872211656,
+      "grad_norm": 0.2797200381755829,
+      "learning_rate": 0.000273972602739726,
+      "loss": 2.6308,
+      "step": 21
+    },
+    {
+      "epoch": 0.01521701538993602,
+      "grad_norm": 0.32914993166923523,
+      "learning_rate": 0.0002876712328767123,
+      "loss": 2.3918,
+      "step": 22
+    },
+    {
+      "epoch": 0.015908697907660384,
+      "grad_norm": 0.3232629895210266,
+      "learning_rate": 0.0003013698630136986,
+      "loss": 1.714,
+      "step": 23
+    },
+    {
+      "epoch": 0.01660038042538475,
+      "grad_norm": 0.3496573269367218,
+      "learning_rate": 0.00031506849315068495,
+      "loss": 2.634,
+      "step": 24
+    },
+    {
+      "epoch": 0.017292062943109112,
+      "grad_norm": 0.34902870655059814,
+      "learning_rate": 0.00032876712328767124,
+      "loss": 2.6269,
+      "step": 25
+    },
+    {
+      "epoch": 0.017983745460833478,
+      "grad_norm": 0.3312171399593353,
+      "learning_rate": 0.00034246575342465754,
+      "loss": 2.5056,
+      "step": 26
+    },
+    {
+      "epoch": 0.01867542797855784,
+      "grad_norm": 0.35412952303886414,
+      "learning_rate": 0.00035616438356164383,
+      "loss": 2.0546,
+      "step": 27
+    },
+    {
+      "epoch": 0.019367110496282206,
+      "grad_norm": 0.3497133255004883,
+      "learning_rate": 0.0003698630136986301,
+      "loss": 2.5338,
+      "step": 28
+    },
+    {
+      "epoch": 0.020058793014006572,
+      "grad_norm": 0.4878860116004944,
+      "learning_rate": 0.0003835616438356164,
+      "loss": 2.4302,
+      "step": 29
+    },
+    {
+      "epoch": 0.020750475531730934,
+      "grad_norm": 0.6170843243598938,
+      "learning_rate": 0.0003972602739726027,
+      "loss": 2.4316,
+      "step": 30
+    },
+    {
+      "epoch": 0.0214421580494553,
+      "grad_norm": 0.5822828412055969,
+      "learning_rate": 0.000410958904109589,
+      "loss": 2.2286,
+      "step": 31
+    },
+    {
+      "epoch": 0.022133840567179666,
+      "grad_norm": 0.3742135167121887,
+      "learning_rate": 0.0004246575342465753,
+      "loss": 2.145,
+      "step": 32
+    },
+    {
+      "epoch": 0.02282552308490403,
+      "grad_norm": 0.9055424332618713,
+      "learning_rate": 0.0004383561643835616,
+      "loss": 1.6547,
+      "step": 33
+    },
+    {
+      "epoch": 0.023517205602628394,
+      "grad_norm": 0.5974112153053284,
+      "learning_rate": 0.00045205479452054795,
+      "loss": 1.3416,
+      "step": 34
+    },
+    {
+      "epoch": 0.024208888120352757,
+      "grad_norm": 0.5048322677612305,
+      "learning_rate": 0.00046575342465753425,
+      "loss": 1.9358,
+      "step": 35
+    },
+    {
+      "epoch": 0.024900570638077123,
+      "grad_norm": 0.6585083603858948,
+      "learning_rate": 0.00047945205479452054,
+      "loss": 2.5502,
+      "step": 36
+    },
+    {
+      "epoch": 0.02559225315580149,
+      "grad_norm": 0.5043433904647827,
+      "learning_rate": 0.0004931506849315068,
+      "loss": 1.9789,
+      "step": 37
+    },
+    {
+      "epoch": 0.02628393567352585,
+      "grad_norm": 0.7396597266197205,
+      "learning_rate": 0.0005068493150684932,
+      "loss": 1.228,
+      "step": 38
+    },
+    {
+      "epoch": 0.026975618191250217,
+      "grad_norm": 0.6552051901817322,
+      "learning_rate": 0.0005205479452054794,
+      "loss": 2.2174,
+      "step": 39
+    },
+    {
+      "epoch": 0.02766730070897458,
+      "grad_norm": 0.5566487908363342,
+      "learning_rate": 0.0005342465753424658,
+      "loss": 1.4923,
+      "step": 40
+    },
+    {
+      "epoch": 0.028358983226698945,
+      "grad_norm": 0.8825723528862,
+      "learning_rate": 0.000547945205479452,
+      "loss": 1.1997,
+      "step": 41
+    },
+    {
+      "epoch": 0.02905066574442331,
+      "grad_norm": 0.6939167380332947,
+      "learning_rate": 0.0005616438356164384,
+      "loss": 0.9761,
+      "step": 42
+    },
+    {
+      "epoch": 0.029742348262147673,
+      "grad_norm": 0.6930441856384277,
+      "learning_rate": 0.0005753424657534246,
+      "loss": 1.859,
+      "step": 43
+    },
+    {
+      "epoch": 0.03043403077987204,
+      "grad_norm": 0.5989459753036499,
+      "learning_rate": 0.000589041095890411,
+      "loss": 2.1215,
+      "step": 44
+    },
+    {
+      "epoch": 0.0311257132975964,
+      "grad_norm": 0.5096371173858643,
+      "learning_rate": 0.0006027397260273972,
+      "loss": 1.6978,
+      "step": 45
+    },
+    {
+      "epoch": 0.03181739581532077,
+      "grad_norm": 0.505210816860199,
+      "learning_rate": 0.0006164383561643835,
+      "loss": 2.2416,
+      "step": 46
+    },
+    {
+      "epoch": 0.03250907833304513,
+      "grad_norm": 0.4391973316669464,
+      "learning_rate": 0.0006301369863013699,
+      "loss": 2.0748,
+      "step": 47
+    },
+    {
+      "epoch": 0.0332007608507695,
+      "grad_norm": 0.5823452472686768,
+      "learning_rate": 0.0006438356164383562,
+      "loss": 1.9801,
+      "step": 48
+    },
+    {
+      "epoch": 0.03389244336849386,
+      "grad_norm": 0.4575086236000061,
+      "learning_rate": 0.0006575342465753425,
+      "loss": 1.7658,
+      "step": 49
+    },
+    {
+      "epoch": 0.034584125886218224,
+      "grad_norm": 1.1978178024291992,
+      "learning_rate": 0.0006712328767123288,
+      "loss": 2.0922,
+      "step": 50
+    },
+    {
+      "epoch": 0.03527580840394259,
+      "grad_norm": 0.8242403864860535,
+      "learning_rate": 0.0006849315068493151,
+      "loss": 1.4001,
+      "step": 51
+    },
+    {
+      "epoch": 0.035967490921666956,
+      "grad_norm": 0.7775748372077942,
+      "learning_rate": 0.0006986301369863014,
+      "loss": 2.2013,
+      "step": 52
+    },
+    {
+      "epoch": 0.03665917343939132,
+      "grad_norm": 0.6356838941574097,
+      "learning_rate": 0.0007123287671232877,
+      "loss": 2.1857,
+      "step": 53
+    },
+    {
+      "epoch": 0.03735085595711568,
+      "grad_norm": 0.5681482553482056,
+      "learning_rate": 0.000726027397260274,
+      "loss": 1.8066,
+      "step": 54
+    },
+    {
+      "epoch": 0.03804253847484005,
+      "grad_norm": 0.5019308924674988,
+      "learning_rate": 0.0007397260273972603,
+      "loss": 2.2206,
+      "step": 55
+    },
+    {
+      "epoch": 0.03873422099256441,
+      "grad_norm": 0.5633329749107361,
+      "learning_rate": 0.0007534246575342466,
+      "loss": 1.8777,
+      "step": 56
+    },
+    {
+      "epoch": 0.039425903510288775,
+      "grad_norm": 0.7095340490341187,
+      "learning_rate": 0.0007671232876712328,
+      "loss": 1.547,
+      "step": 57
+    },
+    {
+      "epoch": 0.040117586028013144,
+      "grad_norm": 0.5671369433403015,
+      "learning_rate": 0.0007808219178082192,
+      "loss": 1.621,
+      "step": 58
+    },
+    {
+      "epoch": 0.040809268545737507,
+      "grad_norm": 0.6320775747299194,
+      "learning_rate": 0.0007945205479452054,
+      "loss": 1.6643,
+      "step": 59
+    },
+    {
+      "epoch": 0.04150095106346187,
+      "grad_norm": 0.5812399387359619,
+      "learning_rate": 0.0008082191780821918,
+      "loss": 2.1796,
+      "step": 60
+    },
+    {
+      "epoch": 0.04219263358118624,
+      "grad_norm": 0.6970055103302002,
+      "learning_rate": 0.000821917808219178,
+      "loss": 1.3168,
+      "step": 61
+    },
+    {
+      "epoch": 0.0428843160989106,
+      "grad_norm": 0.5469310879707336,
+      "learning_rate": 0.0008356164383561644,
+      "loss": 1.9548,
+      "step": 62
+    },
+    {
+      "epoch": 0.04357599861663496,
+      "grad_norm": 0.6035028100013733,
+      "learning_rate": 0.0008493150684931506,
+      "loss": 2.0096,
+      "step": 63
+    },
+    {
+      "epoch": 0.04426768113435933,
+      "grad_norm": 0.5294916033744812,
+      "learning_rate": 0.000863013698630137,
+      "loss": 1.9096,
+      "step": 64
+    },
+    {
+      "epoch": 0.044959363652083695,
+      "grad_norm": 0.5908515453338623,
+      "learning_rate": 0.0008767123287671232,
+      "loss": 1.7588,
+      "step": 65
+    },
+    {
+      "epoch": 0.04565104616980806,
+      "grad_norm": 0.6100364327430725,
+      "learning_rate": 0.0008904109589041097,
+      "loss": 2.354,
+      "step": 66
+    },
+    {
+      "epoch": 0.04634272868753242,
+      "grad_norm": 0.4925366938114166,
+      "learning_rate": 0.0009041095890410959,
+      "loss": 1.9282,
+      "step": 67
+    },
+    {
+      "epoch": 0.04703441120525679,
+      "grad_norm": 0.6262894868850708,
+      "learning_rate": 0.0009178082191780823,
+      "loss": 1.2392,
+      "step": 68
+    },
+    {
+      "epoch": 0.04772609372298115,
+      "grad_norm": 0.55129474401474,
+      "learning_rate": 0.0009315068493150685,
+      "loss": 2.4297,
+      "step": 69
+    },
+    {
+      "epoch": 0.048417776240705514,
+      "grad_norm": 0.5773240327835083,
+      "learning_rate": 0.0009452054794520548,
+      "loss": 1.4238,
+      "step": 70
+    },
+    {
+      "epoch": 0.04910945875842988,
+      "grad_norm": 0.3298526704311371,
+      "learning_rate": 0.0009589041095890411,
+      "loss": 1.5375,
+      "step": 71
+    },
+    {
+      "epoch": 0.049801141276154245,
+      "grad_norm": 0.9887644648551941,
+      "learning_rate": 0.0009726027397260274,
+      "loss": 2.037,
+      "step": 72
+    },
+    {
+      "epoch": 0.05049282379387861,
+      "grad_norm": 0.7895487546920776,
+      "learning_rate": 0.0009863013698630137,
+      "loss": 1.8359,
+      "step": 73
+    },
+    {
+      "epoch": 0.05118450631160298,
+      "grad_norm": 0.5635783076286316,
+      "learning_rate": 0.001,
+      "loss": 1.2403,
+      "step": 74
+    },
+    {
+      "epoch": 0.05187618882932734,
+      "grad_norm": 0.5721316933631897,
+      "learning_rate": 0.0009992716678805535,
+      "loss": 1.9278,
+      "step": 75
+    },
+    {
+      "epoch": 0.0525678713470517,
+      "grad_norm": 0.4850369095802307,
+      "learning_rate": 0.000998543335761107,
+      "loss": 1.9441,
+      "step": 76
+    },
+    {
+      "epoch": 0.053259553864776064,
+      "grad_norm": 0.5316908955574036,
+      "learning_rate": 0.0009978150036416607,
+      "loss": 1.593,
+      "step": 77
+    },
+    {
+      "epoch": 0.053951236382500434,
+      "grad_norm": 0.4999512732028961,
+      "learning_rate": 0.000997086671522214,
+      "loss": 1.8264,
+      "step": 78
+    },
+    {
+      "epoch": 0.054642918900224796,
+      "grad_norm": 0.4769350588321686,
+      "learning_rate": 0.0009963583394027677,
+      "loss": 1.6991,
+      "step": 79
+    },
+    {
+      "epoch": 0.05533460141794916,
+      "grad_norm": 0.4839954078197479,
+      "learning_rate": 0.0009956300072833213,
+      "loss": 1.0442,
+      "step": 80
+    },
+    {
+      "epoch": 0.05602628393567353,
+      "grad_norm": 0.7724981307983398,
+      "learning_rate": 0.0009949016751638748,
+      "loss": 1.5422,
+      "step": 81
+    },
+    {
+      "epoch": 0.05671796645339789,
+      "grad_norm": 0.7546667456626892,
+      "learning_rate": 0.0009941733430444283,
+      "loss": 1.7832,
+      "step": 82
+    },
+    {
+      "epoch": 0.05740964897112225,
+      "grad_norm": 0.5036157369613647,
+      "learning_rate": 0.0009934450109249818,
+      "loss": 1.9083,
+      "step": 83
+    },
+    {
+      "epoch": 0.05810133148884662,
+      "grad_norm": 0.5091835260391235,
+      "learning_rate": 0.0009927166788055353,
+      "loss": 1.5611,
+      "step": 84
+    },
+    {
+      "epoch": 0.058793014006570984,
+      "grad_norm": 0.5591360926628113,
+      "learning_rate": 0.0009919883466860888,
+      "loss": 1.517,
+      "step": 85
+    },
+    {
+      "epoch": 0.05948469652429535,
+      "grad_norm": 0.5279435515403748,
+      "learning_rate": 0.0009912600145666425,
+      "loss": 1.541,
+      "step": 86
+    },
+    {
+      "epoch": 0.060176379042019716,
+      "grad_norm": 0.8345214128494263,
+      "learning_rate": 0.000990531682447196,
+      "loss": 0.8178,
+      "step": 87
+    },
+    {
+      "epoch": 0.06086806155974408,
+      "grad_norm": 0.7917139530181885,
+      "learning_rate": 0.0009898033503277495,
+      "loss": 2.0548,
+      "step": 88
+    },
+    {
+      "epoch": 0.06155974407746844,
+      "grad_norm": 0.46465063095092773,
+      "learning_rate": 0.000989075018208303,
+      "loss": 2.045,
+      "step": 89
+    },
+    {
+      "epoch": 0.0622514265951928,
+      "grad_norm": 0.6480844020843506,
+      "learning_rate": 0.0009883466860888565,
+      "loss": 2.1368,
+      "step": 90
+    },
+    {
+      "epoch": 0.06294310911291717,
+      "grad_norm": 0.5167698264122009,
+      "learning_rate": 0.00098761835396941,
+      "loss": 2.2606,
+      "step": 91
+    },
+    {
+      "epoch": 0.06363479163064154,
+      "grad_norm": 0.6514157652854919,
+      "learning_rate": 0.0009868900218499635,
+      "loss": 2.1443,
+      "step": 92
+    },
+    {
+      "epoch": 0.0643264741483659,
+      "grad_norm": 0.9793212413787842,
+      "learning_rate": 0.0009861616897305172,
+      "loss": 1.7448,
+      "step": 93
+    },
+    {
+      "epoch": 0.06501815666609026,
+      "grad_norm": 0.5196186900138855,
+      "learning_rate": 0.0009854333576110707,
+      "loss": 1.6717,
+      "step": 94
+    },
+    {
+      "epoch": 0.06570983918381462,
+      "grad_norm": 0.4875952899456024,
+      "learning_rate": 0.0009847050254916242,
+      "loss": 1.8078,
+      "step": 95
+    },
+    {
+      "epoch": 0.066401521701539,
+      "grad_norm": 0.5111953020095825,
+      "learning_rate": 0.0009839766933721777,
+      "loss": 2.263,
+      "step": 96
+    },
+    {
+      "epoch": 0.06709320421926336,
+      "grad_norm": 0.6604788303375244,
+      "learning_rate": 0.0009832483612527312,
+      "loss": 1.6966,
+      "step": 97
+    },
+    {
+      "epoch": 0.06778488673698772,
+      "grad_norm": 0.5474271774291992,
+      "learning_rate": 0.0009825200291332847,
+      "loss": 2.3985,
+      "step": 98
+    },
+    {
+      "epoch": 0.06847656925471209,
+      "grad_norm": 0.47275879979133606,
+      "learning_rate": 0.0009817916970138382,
+      "loss": 1.7751,
+      "step": 99
+    },
+    {
+      "epoch": 0.06916825177243645,
+      "grad_norm": 0.5738961696624756,
+      "learning_rate": 0.000981063364894392,
+      "loss": 1.5479,
+      "step": 100
+    },
+    {
+      "epoch": 0.06985993429016081,
+      "grad_norm": 0.5046308636665344,
+      "learning_rate": 0.0009803350327749454,
+      "loss": 1.4889,
+      "step": 101
+    },
+    {
+      "epoch": 0.07055161680788519,
+      "grad_norm": 0.45390692353248596,
+      "learning_rate": 0.000979606700655499,
+      "loss": 1.3446,
+      "step": 102
+    },
+    {
+      "epoch": 0.07124329932560955,
+      "grad_norm": 0.4701155126094818,
+      "learning_rate": 0.0009788783685360525,
+      "loss": 1.4525,
+      "step": 103
+    },
+    {
+      "epoch": 0.07193498184333391,
+      "grad_norm": 0.6199256181716919,
+      "learning_rate": 0.000978150036416606,
+      "loss": 1.5779,
+      "step": 104
+    },
+    {
+      "epoch": 0.07262666436105827,
+      "grad_norm": 0.6306092143058777,
+      "learning_rate": 0.0009774217042971595,
+      "loss": 2.0143,
+      "step": 105
+    },
+    {
+      "epoch": 0.07331834687878264,
+      "grad_norm": 0.5837789177894592,
+      "learning_rate": 0.000976693372177713,
+      "loss": 1.2003,
+      "step": 106
+    },
+    {
+      "epoch": 0.074010029396507,
+      "grad_norm": 0.9713156223297119,
+      "learning_rate": 0.0009759650400582666,
+      "loss": 1.5721,
+      "step": 107
+    },
+    {
+      "epoch": 0.07470171191423136,
+      "grad_norm": 0.694187343120575,
+      "learning_rate": 0.0009752367079388202,
+      "loss": 2.2187,
+      "step": 108
+    },
+    {
+      "epoch": 0.07539339443195574,
+      "grad_norm": 0.465781033039093,
+      "learning_rate": 0.0009745083758193737,
+      "loss": 1.768,
+      "step": 109
+    },
+    {
+      "epoch": 0.0760850769496801,
+      "grad_norm": 0.5198079347610474,
+      "learning_rate": 0.0009737800436999272,
+      "loss": 2.1921,
+      "step": 110
+    },
+    {
+      "epoch": 0.07677675946740446,
+      "grad_norm": 0.7641897201538086,
+      "learning_rate": 0.0009730517115804807,
+      "loss": 1.7574,
+      "step": 111
+    },
+    {
+      "epoch": 0.07746844198512882,
+      "grad_norm": 0.4864037334918976,
+      "learning_rate": 0.0009723233794610342,
+      "loss": 1.9409,
+      "step": 112
+    },
+    {
+      "epoch": 0.07816012450285319,
+      "grad_norm": 1.0721259117126465,
+      "learning_rate": 0.0009715950473415878,
+      "loss": 1.2796,
+      "step": 113
+    },
+    {
+      "epoch": 0.07885180702057755,
+      "grad_norm": 0.6161507964134216,
+      "learning_rate": 0.0009708667152221413,
+      "loss": 1.646,
+      "step": 114
+    },
+    {
+      "epoch": 0.07954348953830193,
+      "grad_norm": 0.6296889185905457,
+      "learning_rate": 0.0009701383831026949,
+      "loss": 1.2842,
+      "step": 115
+    },
+    {
+      "epoch": 0.08023517205602629,
+      "grad_norm": 0.6511496901512146,
+      "learning_rate": 0.0009694100509832484,
+      "loss": 2.0559,
+      "step": 116
+    },
+    {
+      "epoch": 0.08092685457375065,
+      "grad_norm": 0.5697126984596252,
+      "learning_rate": 0.0009686817188638019,
+      "loss": 1.5121,
+      "step": 117
+    },
+    {
+      "epoch": 0.08161853709147501,
+      "grad_norm": 0.506841242313385,
+      "learning_rate": 0.0009679533867443554,
+      "loss": 1.6908,
+      "step": 118
+    },
+    {
+      "epoch": 0.08231021960919938,
+      "grad_norm": 1.1525691747665405,
+      "learning_rate": 0.0009672250546249089,
+      "loss": 2.0817,
+      "step": 119
+    },
+    {
+      "epoch": 0.08300190212692374,
+      "grad_norm": 0.6273766756057739,
+      "learning_rate": 0.0009664967225054625,
+      "loss": 1.783,
+      "step": 120
+    },
+    {
+      "epoch": 0.0836935846446481,
+      "grad_norm": 0.8089930415153503,
+      "learning_rate": 0.000965768390386016,
+      "loss": 1.9208,
+      "step": 121
+    },
+    {
+      "epoch": 0.08438526716237248,
+      "grad_norm": 0.6257967948913574,
+      "learning_rate": 0.0009650400582665696,
+      "loss": 1.8797,
+      "step": 122
+    },
+    {
+      "epoch": 0.08507694968009684,
+      "grad_norm": 0.6704832911491394,
+      "learning_rate": 0.0009643117261471231,
+      "loss": 1.6969,
+      "step": 123
+    },
+    {
+      "epoch": 0.0857686321978212,
+      "grad_norm": 0.8226727843284607,
+      "learning_rate": 0.0009635833940276765,
+      "loss": 1.291,
+      "step": 124
+    },
+    {
+      "epoch": 0.08646031471554556,
+      "grad_norm": 0.45218008756637573,
+      "learning_rate": 0.0009628550619082302,
+      "loss": 1.6168,
+      "step": 125
+    },
+    {
+      "epoch": 0.08715199723326993,
+      "grad_norm": 0.9265746474266052,
+      "learning_rate": 0.0009621267297887837,
+      "loss": 1.5465,
+      "step": 126
+    },
+    {
+      "epoch": 0.08784367975099429,
+      "grad_norm": 0.7594870924949646,
+      "learning_rate": 0.0009613983976693373,
+      "loss": 1.2877,
+      "step": 127
+    },
+    {
+      "epoch": 0.08853536226871866,
+      "grad_norm": 0.5055251121520996,
+      "learning_rate": 0.0009606700655498908,
+      "loss": 1.8451,
+      "step": 128
+    },
+    {
+      "epoch": 0.08922704478644303,
+      "grad_norm": 0.5842559337615967,
+      "learning_rate": 0.0009599417334304444,
+      "loss": 2.4324,
+      "step": 129
+    },
+    {
+      "epoch": 0.08991872730416739,
+      "grad_norm": 0.42893463373184204,
+      "learning_rate": 0.0009592134013109979,
+      "loss": 1.8858,
+      "step": 130
+    },
+    {
+      "epoch": 0.09061040982189175,
+      "grad_norm": 0.5879374146461487,
+      "learning_rate": 0.0009584850691915513,
+      "loss": 2.1098,
+      "step": 131
+    },
+    {
+      "epoch": 0.09130209233961611,
+      "grad_norm": 1.0884597301483154,
+      "learning_rate": 0.0009577567370721049,
+      "loss": 1.1548,
+      "step": 132
+    },
+    {
+      "epoch": 0.09199377485734048,
+      "grad_norm": 0.4452207684516907,
+      "learning_rate": 0.0009570284049526584,
+      "loss": 1.5784,
+      "step": 133
+    },
+    {
+      "epoch": 0.09268545737506484,
+      "grad_norm": 0.5032292604446411,
+      "learning_rate": 0.000956300072833212,
+      "loss": 1.8767,
+      "step": 134
+    },
+    {
+      "epoch": 0.09337713989278922,
+      "grad_norm": 0.6190866827964783,
+      "learning_rate": 0.0009555717407137655,
+      "loss": 1.7698,
+      "step": 135
+    },
+    {
+      "epoch": 0.09406882241051358,
+      "grad_norm": 0.559252142906189,
+      "learning_rate": 0.0009548434085943191,
+      "loss": 2.063,
+      "step": 136
+    },
+    {
+      "epoch": 0.09476050492823794,
+      "grad_norm": 0.7464174032211304,
+      "learning_rate": 0.0009541150764748726,
+      "loss": 1.334,
+      "step": 137
+    },
+    {
+      "epoch": 0.0954521874459623,
+      "grad_norm": 0.5302634835243225,
+      "learning_rate": 0.000953386744355426,
+      "loss": 1.838,
+      "step": 138
+    },
+    {
+      "epoch": 0.09614386996368667,
+      "grad_norm": 0.5212066173553467,
+      "learning_rate": 0.0009526584122359796,
+      "loss": 1.9793,
+      "step": 139
+    },
+    {
+      "epoch": 0.09683555248141103,
+      "grad_norm": 0.7148857116699219,
+      "learning_rate": 0.0009519300801165331,
+      "loss": 1.6045,
+      "step": 140
+    },
+    {
+      "epoch": 0.0975272349991354,
+      "grad_norm": 1.0729445219039917,
+      "learning_rate": 0.0009512017479970867,
+      "loss": 1.5728,
+      "step": 141
+    },
+    {
+      "epoch": 0.09821891751685977,
+      "grad_norm": 0.438503235578537,
+      "learning_rate": 0.0009504734158776402,
+      "loss": 1.5064,
+      "step": 142
+    },
+    {
+      "epoch": 0.09891060003458413,
+      "grad_norm": 0.6026888489723206,
+      "learning_rate": 0.0009497450837581938,
+      "loss": 1.5297,
+      "step": 143
+    },
+    {
+      "epoch": 0.09960228255230849,
+      "grad_norm": 0.4339958727359772,
+      "learning_rate": 0.0009490167516387472,
+      "loss": 1.2461,
+      "step": 144
+    },
+    {
+      "epoch": 0.10029396507003285,
+      "grad_norm": 0.8123407363891602,
+      "learning_rate": 0.0009482884195193007,
+      "loss": 1.7749,
+      "step": 145
+    },
+    {
+      "epoch": 0.10098564758775722,
+      "grad_norm": 0.938025951385498,
+      "learning_rate": 0.0009475600873998543,
+      "loss": 1.2157,
+      "step": 146
+    },
+    {
+      "epoch": 0.10167733010548158,
+      "grad_norm": 0.8118213415145874,
+      "learning_rate": 0.0009468317552804079,
+      "loss": 1.3722,
+      "step": 147
+    },
+    {
+      "epoch": 0.10236901262320595,
+      "grad_norm": 0.6156368851661682,
+      "learning_rate": 0.0009461034231609615,
+      "loss": 1.9435,
+      "step": 148
+    },
+    {
+      "epoch": 0.10306069514093032,
+      "grad_norm": 0.43706831336021423,
+      "learning_rate": 0.000945375091041515,
+      "loss": 1.7467,
+      "step": 149
+    },
+    {
+      "epoch": 0.10375237765865468,
+      "grad_norm": 0.5463519096374512,
+      "learning_rate": 0.0009446467589220686,
+      "loss": 1.8991,
+      "step": 150
+    },
+    {
+      "epoch": 0.10444406017637904,
+      "grad_norm": 0.4798230826854706,
+      "learning_rate": 0.000943918426802622,
+      "loss": 2.1618,
+      "step": 151
+    },
+    {
+      "epoch": 0.1051357426941034,
+      "grad_norm": 0.4733302891254425,
+      "learning_rate": 0.0009431900946831755,
+      "loss": 1.8547,
+      "step": 152
+    },
+    {
+      "epoch": 0.10582742521182777,
+      "grad_norm": 0.558428168296814,
+      "learning_rate": 0.0009424617625637291,
+      "loss": 2.314,
+      "step": 153
+    },
+    {
+      "epoch": 0.10651910772955213,
+      "grad_norm": 0.5310361385345459,
+      "learning_rate": 0.0009417334304442826,
+      "loss": 1.9073,
+      "step": 154
+    },
+    {
+      "epoch": 0.1072107902472765,
+      "grad_norm": 0.4204038679599762,
+      "learning_rate": 0.0009410050983248362,
+      "loss": 1.9635,
+      "step": 155
+    },
+    {
+      "epoch": 0.10790247276500087,
+      "grad_norm": 0.5052216649055481,
+      "learning_rate": 0.0009402767662053897,
+      "loss": 1.0511,
+      "step": 156
+    },
+    {
+      "epoch": 0.10859415528272523,
+      "grad_norm": 0.5589479804039001,
+      "learning_rate": 0.0009395484340859433,
+      "loss": 1.4608,
+      "step": 157
+    },
+    {
+      "epoch": 0.10928583780044959,
+      "grad_norm": 0.7388360500335693,
+      "learning_rate": 0.0009388201019664967,
+      "loss": 1.9785,
+      "step": 158
+    },
+    {
+      "epoch": 0.10997752031817395,
+      "grad_norm": 0.5995668172836304,
+      "learning_rate": 0.0009380917698470502,
+      "loss": 1.4633,
+      "step": 159
+    },
+    {
+      "epoch": 0.11066920283589832,
+      "grad_norm": 0.8107509613037109,
+      "learning_rate": 0.0009373634377276038,
+      "loss": 1.6685,
+      "step": 160
+    },
+    {
+      "epoch": 0.1113608853536227,
+      "grad_norm": 0.6110396981239319,
+      "learning_rate": 0.0009366351056081573,
+      "loss": 1.8113,
+      "step": 161
+    },
+    {
+      "epoch": 0.11205256787134706,
+      "grad_norm": 0.5032293796539307,
+      "learning_rate": 0.0009359067734887109,
+      "loss": 1.7344,
+      "step": 162
+    },
+    {
+      "epoch": 0.11274425038907142,
+      "grad_norm": 1.456254243850708,
+      "learning_rate": 0.0009351784413692644,
+      "loss": 1.2546,
+      "step": 163
+    },
+    {
+      "epoch": 0.11343593290679578,
+      "grad_norm": 0.8283969163894653,
+      "learning_rate": 0.0009344501092498179,
+      "loss": 1.8466,
+      "step": 164
+    },
+    {
+      "epoch": 0.11412761542452014,
+      "grad_norm": 0.8178532123565674,
+      "learning_rate": 0.0009337217771303714,
+      "loss": 1.7566,
+      "step": 165
+    },
+    {
+      "epoch": 0.1148192979422445,
+      "grad_norm": 0.5897772908210754,
+      "learning_rate": 0.0009329934450109249,
+      "loss": 1.9611,
+      "step": 166
+    },
+    {
+      "epoch": 0.11551098045996887,
+      "grad_norm": 0.4763628840446472,
+      "learning_rate": 0.0009322651128914785,
+      "loss": 1.6854,
+      "step": 167
+    },
+    {
+      "epoch": 0.11620266297769324,
+      "grad_norm": 0.5219669938087463,
+      "learning_rate": 0.000931536780772032,
+      "loss": 1.5156,
+      "step": 168
+    },
+    {
+      "epoch": 0.1168943454954176,
+      "grad_norm": 0.7750780582427979,
+      "learning_rate": 0.0009308084486525857,
+      "loss": 1.2845,
+      "step": 169
+    },
+    {
+      "epoch": 0.11758602801314197,
+      "grad_norm": 0.5357050895690918,
+      "learning_rate": 0.0009300801165331392,
+      "loss": 1.1729,
+      "step": 170
+    },
+    {
+      "epoch": 0.11827771053086633,
+      "grad_norm": 0.5962219834327698,
+      "learning_rate": 0.0009293517844136927,
+      "loss": 1.4915,
+      "step": 171
+    },
+    {
+      "epoch": 0.1189693930485907,
+      "grad_norm": 0.4935504198074341,
+      "learning_rate": 0.0009286234522942462,
+      "loss": 1.7863,
+      "step": 172
+    },
+    {
+      "epoch": 0.11966107556631506,
+      "grad_norm": 0.5719547867774963,
+      "learning_rate": 0.0009278951201747997,
+      "loss": 1.7086,
+      "step": 173
+    },
+    {
+      "epoch": 0.12035275808403943,
+      "grad_norm": 0.614291787147522,
+      "learning_rate": 0.0009271667880553533,
+      "loss": 2.0141,
+      "step": 174
+    },
+    {
+      "epoch": 0.1210444406017638,
+      "grad_norm": 0.4415907859802246,
+      "learning_rate": 0.0009264384559359068,
+      "loss": 1.4772,
+      "step": 175
+    },
+    {
+      "epoch": 0.12173612311948816,
+      "grad_norm": 0.518036961555481,
+      "learning_rate": 0.0009257101238164604,
+      "loss": 1.1856,
+      "step": 176
+    },
+    {
+      "epoch": 0.12242780563721252,
+      "grad_norm": 0.39714357256889343,
+      "learning_rate": 0.0009249817916970139,
+      "loss": 1.1254,
+      "step": 177
+    },
+    {
+      "epoch": 0.12311948815493688,
+      "grad_norm": 0.5234679579734802,
+      "learning_rate": 0.0009242534595775674,
+      "loss": 2.0139,
+      "step": 178
+    },
+    {
+      "epoch": 0.12381117067266124,
+      "grad_norm": 0.548357367515564,
+      "learning_rate": 0.0009235251274581209,
+      "loss": 1.5552,
+      "step": 179
+    },
+    {
+      "epoch": 0.1245028531903856,
+      "grad_norm": 0.6111085414886475,
+      "learning_rate": 0.0009227967953386744,
+      "loss": 2.1376,
+      "step": 180
+    },
+    {
+      "epoch": 0.12519453570810998,
+      "grad_norm": 11.656793594360352,
+      "learning_rate": 0.000922068463219228,
+      "loss": 2.0174,
+      "step": 181
+    },
+    {
+      "epoch": 0.12588621822583435,
+      "grad_norm": 0.7396730184555054,
+      "learning_rate": 0.0009213401310997815,
+      "loss": 2.2195,
+      "step": 182
+    },
+    {
+      "epoch": 0.1265779007435587,
+      "grad_norm": 0.7623037099838257,
+      "learning_rate": 0.0009206117989803351,
+      "loss": 1.8883,
+      "step": 183
+    },
+    {
+      "epoch": 0.12726958326128307,
+      "grad_norm": 4.827798366546631,
+      "learning_rate": 0.0009198834668608885,
+      "loss": 1.4435,
+      "step": 184
+    },
+    {
+      "epoch": 0.12796126577900743,
+      "grad_norm": 6.8200836181640625,
+      "learning_rate": 0.0009191551347414421,
+      "loss": 2.4153,
+      "step": 185
+    },
+    {
+      "epoch": 0.1286529482967318,
+      "grad_norm": 10.740931510925293,
+      "learning_rate": 0.0009184268026219956,
+      "loss": 2.1309,
+      "step": 186
+    },
+    {
+      "epoch": 0.12934463081445616,
+      "grad_norm": 31.872066497802734,
+      "learning_rate": 0.0009176984705025491,
+      "loss": 2.2398,
+      "step": 187
+    },
+    {
+      "epoch": 0.13003631333218052,
+      "grad_norm": 31.492610931396484,
+      "learning_rate": 0.0009169701383831027,
+      "loss": 1.7102,
+      "step": 188
+    },
+    {
+      "epoch": 0.13072799584990488,
+      "grad_norm": 14.984453201293945,
+      "learning_rate": 0.0009162418062636562,
+      "loss": 1.4899,
+      "step": 189
+    },
+    {
+      "epoch": 0.13141967836762924,
+      "grad_norm": 60.037567138671875,
+      "learning_rate": 0.0009155134741442099,
+      "loss": 1.4512,
+      "step": 190
+    },
+    {
+      "epoch": 0.13211136088535363,
+      "grad_norm": 13.009904861450195,
+      "learning_rate": 0.0009147851420247633,
+      "loss": 1.7607,
+      "step": 191
+    },
+    {
+      "epoch": 0.132803043403078,
+      "grad_norm": 37.90861511230469,
+      "learning_rate": 0.0009140568099053169,
+      "loss": 1.9484,
+      "step": 192
+    },
+    {
+      "epoch": 0.13349472592080236,
+      "grad_norm": 25.40981674194336,
+      "learning_rate": 0.0009133284777858704,
+      "loss": 2.4415,
+      "step": 193
+    },
+    {
+      "epoch": 0.13418640843852672,
+      "grad_norm": 6.186267375946045,
+      "learning_rate": 0.0009126001456664239,
+      "loss": 1.1091,
+      "step": 194
+    },
+    {
+      "epoch": 0.13487809095625108,
+      "grad_norm": 0.7662860155105591,
+      "learning_rate": 0.0009118718135469775,
+      "loss": 1.8422,
+      "step": 195
+    },
+    {
+      "epoch": 0.13556977347397545,
+      "grad_norm": 0.6533941626548767,
+      "learning_rate": 0.000911143481427531,
+      "loss": 1.9951,
+      "step": 196
+    },
+    {
+      "epoch": 0.1362614559916998,
+      "grad_norm": 0.6851759552955627,
+      "learning_rate": 0.0009104151493080846,
+      "loss": 2.2665,
+      "step": 197
+    },
+    {
+      "epoch": 0.13695313850942417,
+      "grad_norm": 0.49062949419021606,
+      "learning_rate": 0.000909686817188638,
+      "loss": 1.1709,
+      "step": 198
+    },
+    {
+      "epoch": 0.13764482102714853,
+      "grad_norm": 0.5005449056625366,
+      "learning_rate": 0.0009089584850691916,
+      "loss": 1.6217,
+      "step": 199
+    },
+    {
+      "epoch": 0.1383365035448729,
+      "grad_norm": 0.5429890751838684,
+      "learning_rate": 0.0009082301529497451,
+      "loss": 2.0511,
+      "step": 200
+    },
+    {
+      "epoch": 0.13902818606259726,
+      "grad_norm": 0.652536153793335,
+      "learning_rate": 0.0009075018208302986,
+      "loss": 1.2122,
+      "step": 201
+    },
+    {
+      "epoch": 0.13971986858032162,
+      "grad_norm": 0.4541880488395691,
+      "learning_rate": 0.0009067734887108522,
+      "loss": 1.1111,
+      "step": 202
+    },
+    {
+      "epoch": 0.14041155109804598,
+      "grad_norm": 0.5066574811935425,
+      "learning_rate": 0.0009060451565914057,
+      "loss": 1.0966,
+      "step": 203
+    },
+    {
+      "epoch": 0.14110323361577037,
+      "grad_norm": 0.5900403261184692,
+      "learning_rate": 0.0009053168244719592,
+      "loss": 1.9502,
+      "step": 204
+    },
+    {
+      "epoch": 0.14179491613349474,
+      "grad_norm": 0.5873029828071594,
+      "learning_rate": 0.0009045884923525127,
+      "loss": 2.1726,
+      "step": 205
+    },
+    {
+      "epoch": 0.1424865986512191,
+      "grad_norm": 0.46297940611839294,
+      "learning_rate": 0.0009038601602330663,
+      "loss": 1.5529,
+      "step": 206
+    },
+    {
+      "epoch": 0.14317828116894346,
+      "grad_norm": 0.6434882283210754,
+      "learning_rate": 0.0009031318281136198,
+      "loss": 1.1717,
+      "step": 207
+    },
+    {
+      "epoch": 0.14386996368666782,
+      "grad_norm": 0.5225998163223267,
+      "learning_rate": 0.0009024034959941733,
+      "loss": 1.5854,
+      "step": 208
+    },
+    {
+      "epoch": 0.14456164620439219,
+      "grad_norm": 0.5846410989761353,
+      "learning_rate": 0.0009016751638747269,
+      "loss": 1.5399,
+      "step": 209
+    },
+    {
+      "epoch": 0.14525332872211655,
+      "grad_norm": 0.6395654082298279,
+      "learning_rate": 0.0009009468317552804,
+      "loss": 1.7706,
+      "step": 210
+    },
+    {
+      "epoch": 0.1459450112398409,
+      "grad_norm": 4.408266067504883,
+      "learning_rate": 0.000900218499635834,
+      "loss": 2.841,
+      "step": 211
+    },
+    {
+      "epoch": 0.14663669375756527,
+      "grad_norm": 0.5043503642082214,
+      "learning_rate": 0.0008994901675163874,
+      "loss": 1.7936,
+      "step": 212
+    },
+    {
+      "epoch": 0.14732837627528964,
+      "grad_norm": 0.4562769830226898,
+      "learning_rate": 0.0008987618353969411,
+      "loss": 1.8362,
+      "step": 213
+    },
+    {
+      "epoch": 0.148020058793014,
+      "grad_norm": 0.7404221296310425,
+      "learning_rate": 0.0008980335032774946,
+      "loss": 1.6168,
+      "step": 214
+    },
+    {
+      "epoch": 0.14871174131073836,
+      "grad_norm": 0.7720257043838501,
+      "learning_rate": 0.0008973051711580481,
+      "loss": 1.2365,
+      "step": 215
+    },
+    {
+      "epoch": 0.14940342382846272,
+      "grad_norm": 0.9425879716873169,
+      "learning_rate": 0.0008965768390386017,
+      "loss": 1.5786,
+      "step": 216
+    },
+    {
+      "epoch": 0.1500951063461871,
+      "grad_norm": 0.5764768719673157,
+      "learning_rate": 0.0008958485069191552,
+      "loss": 2.0833,
+      "step": 217
+    },
+    {
+      "epoch": 0.15078678886391148,
+      "grad_norm": 0.49806153774261475,
+      "learning_rate": 0.0008951201747997087,
+      "loss": 1.1311,
+      "step": 218
+    },
+    {
+      "epoch": 0.15147847138163584,
+      "grad_norm": 0.5747334361076355,
+      "learning_rate": 0.0008943918426802622,
+      "loss": 2.2558,
+      "step": 219
+    },
+    {
+      "epoch": 0.1521701538993602,
+      "grad_norm": 1.0881627798080444,
+      "learning_rate": 0.0008936635105608158,
+      "loss": 1.7196,
+      "step": 220
+    },
+    {
+      "epoch": 0.15286183641708456,
+      "grad_norm": 0.6077120900154114,
+      "learning_rate": 0.0008929351784413693,
+      "loss": 1.8436,
+      "step": 221
+    },
+    {
+      "epoch": 0.15355351893480892,
+      "grad_norm": 1.4011138677597046,
+      "learning_rate": 0.0008922068463219228,
+      "loss": 1.4092,
+      "step": 222
+    },
+    {
+      "epoch": 0.1542452014525333,
+      "grad_norm": 0.6316831707954407,
+      "learning_rate": 0.0008914785142024764,
+      "loss": 1.6801,
+      "step": 223
+    },
+    {
+      "epoch": 0.15493688397025765,
+      "grad_norm": 0.6225351691246033,
+      "learning_rate": 0.0008907501820830298,
+      "loss": 1.575,
+      "step": 224
+    },
+    {
+      "epoch": 0.155628566487982,
+      "grad_norm": 0.45079120993614197,
+      "learning_rate": 0.0008900218499635834,
+      "loss": 1.7441,
+      "step": 225
+    },
+    {
+      "epoch": 0.15632024900570637,
+      "grad_norm": 0.5602415204048157,
+      "learning_rate": 0.0008892935178441369,
+      "loss": 1.8509,
+      "step": 226
+    },
+    {
+      "epoch": 0.15701193152343074,
+      "grad_norm": 0.43019142746925354,
+      "learning_rate": 0.0008885651857246905,
+      "loss": 2.0136,
+      "step": 227
+    },
+    {
+      "epoch": 0.1577036140411551,
+      "grad_norm": 0.48303139209747314,
+      "learning_rate": 0.000887836853605244,
+      "loss": 1.7679,
+      "step": 228
+    },
+    {
+      "epoch": 0.15839529655887946,
+      "grad_norm": 0.5987271666526794,
+      "learning_rate": 0.0008871085214857975,
+      "loss": 1.3894,
+      "step": 229
+    },
+    {
+      "epoch": 0.15908697907660385,
+      "grad_norm": 0.6672357320785522,
+      "learning_rate": 0.0008863801893663511,
+      "loss": 2.0173,
+      "step": 230
+    },
+    {
+      "epoch": 0.15977866159432821,
+      "grad_norm": 0.5140132904052734,
+      "learning_rate": 0.0008856518572469045,
+      "loss": 1.9006,
+      "step": 231
+    },
+    {
+      "epoch": 0.16047034411205258,
+      "grad_norm": 0.7984848022460938,
+      "learning_rate": 0.0008849235251274581,
+      "loss": 1.7669,
+      "step": 232
+    },
+    {
+      "epoch": 0.16116202662977694,
+      "grad_norm": 1.279133677482605,
+      "learning_rate": 0.0008841951930080116,
+      "loss": 1.3613,
+      "step": 233
+    },
+    {
+      "epoch": 0.1618537091475013,
+      "grad_norm": 0.37104475498199463,
+      "learning_rate": 0.0008834668608885653,
+      "loss": 1.118,
+      "step": 234
+    },
+    {
+      "epoch": 0.16254539166522566,
+      "grad_norm": 0.5247305631637573,
+      "learning_rate": 0.0008827385287691188,
+      "loss": 2.0458,
+      "step": 235
+    },
+    {
+      "epoch": 0.16323707418295003,
+      "grad_norm": 0.837685227394104,
+      "learning_rate": 0.0008820101966496723,
+      "loss": 2.1954,
+      "step": 236
+    },
+    {
+      "epoch": 0.1639287567006744,
+      "grad_norm": 0.5766549706459045,
+      "learning_rate": 0.0008812818645302259,
+      "loss": 1.4839,
+      "step": 237
+    },
+    {
+      "epoch": 0.16462043921839875,
+      "grad_norm": 0.9044421315193176,
+      "learning_rate": 0.0008805535324107793,
+      "loss": 1.6802,
+      "step": 238
+    },
+    {
+      "epoch": 0.1653121217361231,
+      "grad_norm": 0.6272666454315186,
+      "learning_rate": 0.0008798252002913329,
+      "loss": 1.9889,
+      "step": 239
+    },
+    {
+      "epoch": 0.16600380425384748,
+      "grad_norm": 0.5650503039360046,
+      "learning_rate": 0.0008790968681718864,
+      "loss": 1.9305,
+      "step": 240
+    },
+    {
+      "epoch": 0.16669548677157184,
+      "grad_norm": 0.605739176273346,
+      "learning_rate": 0.00087836853605244,
+      "loss": 1.4619,
+      "step": 241
+    },
+    {
+      "epoch": 0.1673871692892962,
+      "grad_norm": 0.654289186000824,
+      "learning_rate": 0.0008776402039329935,
+      "loss": 2.1283,
+      "step": 242
+    },
+    {
+      "epoch": 0.1680788518070206,
+      "grad_norm": 0.5998426079750061,
+      "learning_rate": 0.000876911871813547,
+      "loss": 2.0628,
+      "step": 243
+    },
+    {
+      "epoch": 0.16877053432474495,
+      "grad_norm": 0.5341598391532898,
+      "learning_rate": 0.0008761835396941005,
+      "loss": 1.8415,
+      "step": 244
+    },
+    {
+      "epoch": 0.16946221684246932,
+      "grad_norm": 0.9030768275260925,
+      "learning_rate": 0.000875455207574654,
+      "loss": 1.3423,
+      "step": 245
+    },
+    {
+      "epoch": 0.17015389936019368,
+      "grad_norm": 0.7384636998176575,
+      "learning_rate": 0.0008747268754552076,
+      "loss": 1.6916,
+      "step": 246
+    },
+    {
+      "epoch": 0.17084558187791804,
+      "grad_norm": 0.9748024940490723,
+      "learning_rate": 0.0008739985433357611,
+      "loss": 1.1592,
+      "step": 247
+    },
+    {
+      "epoch": 0.1715372643956424,
+      "grad_norm": 0.49209123849868774,
+      "learning_rate": 0.0008732702112163147,
+      "loss": 1.5281,
+      "step": 248
+    },
+    {
+      "epoch": 0.17222894691336676,
+      "grad_norm": 0.6235657930374146,
+      "learning_rate": 0.0008725418790968682,
+      "loss": 1.6423,
+      "step": 249
+    },
+    {
+      "epoch": 0.17292062943109113,
+      "grad_norm": 0.8116986751556396,
+      "learning_rate": 0.0008718135469774217,
+      "loss": 1.6386,
+      "step": 250
+    },
+    {
+      "epoch": 0.1736123119488155,
+      "grad_norm": 0.643518328666687,
+      "learning_rate": 0.0008710852148579752,
+      "loss": 1.1341,
+      "step": 251
+    },
+    {
+      "epoch": 0.17430399446653985,
+      "grad_norm": 0.826726496219635,
+      "learning_rate": 0.0008703568827385287,
+      "loss": 1.4637,
+      "step": 252
+    },
+    {
+      "epoch": 0.17499567698426421,
+      "grad_norm": 0.6371028423309326,
+      "learning_rate": 0.0008696285506190823,
+      "loss": 1.8159,
+      "step": 253
+    },
+    {
+      "epoch": 0.17568735950198858,
+      "grad_norm": 0.7354971766471863,
+      "learning_rate": 0.0008689002184996358,
+      "loss": 1.5482,
+      "step": 254
+    },
+    {
+      "epoch": 0.17637904201971294,
+      "grad_norm": 0.5614224672317505,
+      "learning_rate": 0.0008681718863801895,
+      "loss": 1.6824,
+      "step": 255
+    },
+    {
+      "epoch": 0.17707072453743733,
+      "grad_norm": 0.7730950117111206,
+      "learning_rate": 0.000867443554260743,
+      "loss": 1.6996,
+      "step": 256
+    },
+    {
+      "epoch": 0.1777624070551617,
+      "grad_norm": 0.5419211983680725,
+      "learning_rate": 0.0008667152221412965,
+      "loss": 1.8659,
+      "step": 257
+    },
+    {
+      "epoch": 0.17845408957288605,
+      "grad_norm": 0.5566856861114502,
+      "learning_rate": 0.00086598689002185,
+      "loss": 1.1103,
+      "step": 258
+    },
+    {
+      "epoch": 0.17914577209061042,
+      "grad_norm": 0.773952841758728,
+      "learning_rate": 0.0008652585579024035,
+      "loss": 1.2759,
+      "step": 259
+    },
+    {
+      "epoch": 0.17983745460833478,
+      "grad_norm": 0.49450692534446716,
+      "learning_rate": 0.0008645302257829571,
+      "loss": 1.913,
+      "step": 260
+    },
+    {
+      "epoch": 0.18052913712605914,
+      "grad_norm": 0.565629243850708,
+      "learning_rate": 0.0008638018936635106,
+      "loss": 1.0691,
+      "step": 261
+    },
+    {
+      "epoch": 0.1812208196437835,
+      "grad_norm": 0.5907365679740906,
+      "learning_rate": 0.0008630735615440642,
+      "loss": 2.1622,
+      "step": 262
+    },
+    {
+      "epoch": 0.18191250216150787,
+      "grad_norm": 0.6517736911773682,
+      "learning_rate": 0.0008623452294246177,
+      "loss": 1.9742,
+      "step": 263
+    },
+    {
+      "epoch": 0.18260418467923223,
+      "grad_norm": 0.7100114822387695,
+      "learning_rate": 0.0008616168973051711,
+      "loss": 1.6081,
+      "step": 264
+    },
+    {
+      "epoch": 0.1832958671969566,
+      "grad_norm": 0.5431230068206787,
+      "learning_rate": 0.0008608885651857247,
+      "loss": 2.0853,
+      "step": 265
+    },
+    {
+      "epoch": 0.18398754971468095,
+      "grad_norm": 0.4722400903701782,
+      "learning_rate": 0.0008601602330662782,
+      "loss": 1.1004,
+      "step": 266
+    },
+    {
+      "epoch": 0.18467923223240532,
+      "grad_norm": 0.6258965730667114,
+      "learning_rate": 0.0008594319009468318,
+      "loss": 1.9442,
+      "step": 267
+    },
+    {
+      "epoch": 0.18537091475012968,
+      "grad_norm": 0.6985493898391724,
+      "learning_rate": 0.0008587035688273853,
+      "loss": 1.5243,
+      "step": 268
+    },
+    {
+      "epoch": 0.18606259726785407,
+      "grad_norm": 0.6129814386367798,
+      "learning_rate": 0.0008579752367079389,
+      "loss": 1.5075,
+      "step": 269
+    },
+    {
+      "epoch": 0.18675427978557843,
+      "grad_norm": 0.49683645367622375,
+      "learning_rate": 0.0008572469045884924,
+      "loss": 2.0752,
+      "step": 270
+    },
+    {
+      "epoch": 0.1874459623033028,
+      "grad_norm": 0.48728471994400024,
+      "learning_rate": 0.0008565185724690458,
+      "loss": 2.2337,
+      "step": 271
+    },
+    {
+      "epoch": 0.18813764482102716,
+      "grad_norm": 0.8094476461410522,
+      "learning_rate": 0.0008557902403495994,
+      "loss": 2.007,
+      "step": 272
+    },
+    {
+      "epoch": 0.18882932733875152,
+      "grad_norm": 0.558074951171875,
+      "learning_rate": 0.0008550619082301529,
+      "loss": 1.4369,
+      "step": 273
+    },
+    {
+      "epoch": 0.18952100985647588,
+      "grad_norm": 0.6702684760093689,
+      "learning_rate": 0.0008543335761107065,
+      "loss": 1.8943,
+      "step": 274
+    },
+    {
+      "epoch": 0.19021269237420024,
+      "grad_norm": 0.7045763731002808,
+      "learning_rate": 0.00085360524399126,
+      "loss": 2.0621,
+      "step": 275
+    },
+    {
+      "epoch": 0.1909043748919246,
+      "grad_norm": 0.5553760528564453,
+      "learning_rate": 0.0008528769118718137,
+      "loss": 1.9012,
+      "step": 276
+    },
+    {
+      "epoch": 0.19159605740964897,
+      "grad_norm": 0.651685893535614,
+      "learning_rate": 0.0008521485797523672,
+      "loss": 1.826,
+      "step": 277
+    },
+    {
+      "epoch": 0.19228773992737333,
+      "grad_norm": 0.46926578879356384,
+      "learning_rate": 0.0008514202476329205,
+      "loss": 1.8451,
+      "step": 278
+    },
+    {
+      "epoch": 0.1929794224450977,
+      "grad_norm": 0.5306689739227295,
+      "learning_rate": 0.0008506919155134742,
+      "loss": 1.9559,
+      "step": 279
+    },
+    {
+      "epoch": 0.19367110496282205,
+      "grad_norm": 0.437308669090271,
+      "learning_rate": 0.0008499635833940277,
+      "loss": 1.1798,
+      "step": 280
+    },
+    {
+      "epoch": 0.19436278748054642,
+      "grad_norm": 0.5720314383506775,
+      "learning_rate": 0.0008492352512745813,
+      "loss": 1.8443,
+      "step": 281
+    },
+    {
+      "epoch": 0.1950544699982708,
+      "grad_norm": 0.6609981060028076,
+      "learning_rate": 0.0008485069191551348,
+      "loss": 2.1044,
+      "step": 282
+    },
+    {
+      "epoch": 0.19574615251599517,
+      "grad_norm": 0.7185072302818298,
+      "learning_rate": 0.0008477785870356884,
+      "loss": 1.309,
+      "step": 283
+    },
+    {
+      "epoch": 0.19643783503371953,
+      "grad_norm": 0.9821947813034058,
+      "learning_rate": 0.0008470502549162418,
+      "loss": 1.5823,
+      "step": 284
+    },
+    {
+      "epoch": 0.1971295175514439,
+      "grad_norm": 0.6811301112174988,
+      "learning_rate": 0.0008463219227967953,
+      "loss": 1.4655,
+      "step": 285
+    },
+    {
+      "epoch": 0.19782120006916826,
+      "grad_norm": 0.5955311059951782,
+      "learning_rate": 0.0008455935906773489,
+      "loss": 1.5187,
+      "step": 286
+    },
+    {
+      "epoch": 0.19851288258689262,
+      "grad_norm": 0.568804919719696,
+      "learning_rate": 0.0008448652585579024,
+      "loss": 1.3988,
+      "step": 287
+    },
+    {
+      "epoch": 0.19920456510461698,
+      "grad_norm": 0.7858214974403381,
+      "learning_rate": 0.000844136926438456,
+      "loss": 1.5478,
+      "step": 288
+    },
+    {
+      "epoch": 0.19989624762234134,
+      "grad_norm": 0.5844207406044006,
+      "learning_rate": 0.0008434085943190095,
+      "loss": 1.904,
+      "step": 289
+    },
+    {
+      "epoch": 0.2005879301400657,
+      "grad_norm": 0.7172948122024536,
+      "learning_rate": 0.0008426802621995631,
+      "loss": 1.4772,
+      "step": 290
+    },
+    {
+      "epoch": 0.20127961265779007,
+      "grad_norm": 0.6408190727233887,
+      "learning_rate": 0.0008419519300801165,
+      "loss": 1.4348,
+      "step": 291
+    },
+    {
+      "epoch": 0.20197129517551443,
+      "grad_norm": 0.9460310339927673,
+      "learning_rate": 0.00084122359796067,
+      "loss": 1.7674,
+      "step": 292
+    },
+    {
+      "epoch": 0.2026629776932388,
+      "grad_norm": 0.6002872586250305,
+      "learning_rate": 0.0008404952658412236,
+      "loss": 1.4278,
+      "step": 293
+    },
+    {
+      "epoch": 0.20335466021096316,
+      "grad_norm": 1.0076587200164795,
+      "learning_rate": 0.0008397669337217771,
+      "loss": 1.5417,
+      "step": 294
+    },
+    {
+      "epoch": 0.20404634272868752,
+      "grad_norm": 1.3005017042160034,
+      "learning_rate": 0.0008390386016023307,
+      "loss": 1.6232,
+      "step": 295
+    },
+    {
+      "epoch": 0.2047380252464119,
+      "grad_norm": 0.751641035079956,
+      "learning_rate": 0.0008383102694828842,
+      "loss": 1.9296,
+      "step": 296
+    },
+    {
+      "epoch": 0.20542970776413627,
+      "grad_norm": 0.6361163258552551,
+      "learning_rate": 0.0008375819373634378,
+      "loss": 2.0047,
+      "step": 297
+    },
+    {
+      "epoch": 0.20612139028186063,
+      "grad_norm": 0.9554282426834106,
+      "learning_rate": 0.0008368536052439912,
+      "loss": 1.7218,
+      "step": 298
+    },
+    {
+      "epoch": 0.206813072799585,
+      "grad_norm": 0.7240822911262512,
+      "learning_rate": 0.0008361252731245447,
+      "loss": 1.6413,
+      "step": 299
+    },
+    {
+      "epoch": 0.20750475531730936,
+      "grad_norm": 0.46834996342658997,
+      "learning_rate": 0.0008353969410050984,
+      "loss": 1.2228,
+      "step": 300
+    },
+    {
+      "epoch": 0.20819643783503372,
+      "grad_norm": 0.7188776731491089,
+      "learning_rate": 0.0008346686088856519,
+      "loss": 1.4258,
+      "step": 301
+    },
+    {
+      "epoch": 0.20888812035275808,
+      "grad_norm": 0.588649332523346,
+      "learning_rate": 0.0008339402767662055,
+      "loss": 1.7551,
+      "step": 302
+    },
+    {
+      "epoch": 0.20957980287048245,
+      "grad_norm": 0.6962491273880005,
+      "learning_rate": 0.000833211944646759,
+      "loss": 2.1571,
+      "step": 303
+    },
+    {
+      "epoch": 0.2102714853882068,
+      "grad_norm": 0.6146702170372009,
+      "learning_rate": 0.0008324836125273124,
+      "loss": 2.0106,
+      "step": 304
+    },
+    {
+      "epoch": 0.21096316790593117,
+      "grad_norm": 0.6004481315612793,
+      "learning_rate": 0.000831755280407866,
+      "loss": 1.3429,
+      "step": 305
+    },
+    {
+      "epoch": 0.21165485042365553,
+      "grad_norm": 0.6162500381469727,
+      "learning_rate": 0.0008310269482884195,
+      "loss": 1.8476,
+      "step": 306
+    },
+    {
+      "epoch": 0.2123465329413799,
+      "grad_norm": 0.5027235746383667,
+      "learning_rate": 0.0008302986161689731,
+      "loss": 1.7516,
+      "step": 307
+    },
+    {
+      "epoch": 0.21303821545910426,
+      "grad_norm": 0.5416428446769714,
+      "learning_rate": 0.0008295702840495266,
+      "loss": 1.8363,
+      "step": 308
+    },
+    {
+      "epoch": 0.21372989797682865,
+      "grad_norm": 0.6236619353294373,
+      "learning_rate": 0.0008288419519300802,
+      "loss": 1.6738,
+      "step": 309
+    },
+    {
+      "epoch": 0.214421580494553,
+      "grad_norm": 0.5952901244163513,
+      "learning_rate": 0.0008281136198106337,
+      "loss": 1.4978,
+      "step": 310
+    },
+    {
+      "epoch": 0.21511326301227737,
+      "grad_norm": 0.7139809131622314,
+      "learning_rate": 0.0008273852876911871,
+      "loss": 1.593,
+      "step": 311
+    },
+    {
+      "epoch": 0.21580494553000173,
+      "grad_norm": 0.6548435091972351,
+      "learning_rate": 0.0008266569555717407,
+      "loss": 1.7464,
+      "step": 312
+    },
+    {
+      "epoch": 0.2164966280477261,
+      "grad_norm": 0.6812461018562317,
+      "learning_rate": 0.0008259286234522942,
+      "loss": 1.9677,
+      "step": 313
+    },
+    {
+      "epoch": 0.21718831056545046,
+      "grad_norm": 0.7574117183685303,
+      "learning_rate": 0.0008252002913328478,
+      "loss": 1.6102,
+      "step": 314
+    },
+    {
+      "epoch": 0.21787999308317482,
+      "grad_norm": 0.5767763257026672,
+      "learning_rate": 0.0008244719592134013,
+      "loss": 2.0047,
+      "step": 315
+    },
+    {
+      "epoch": 0.21857167560089918,
+      "grad_norm": 0.864742636680603,
+      "learning_rate": 0.0008237436270939549,
+      "loss": 1.6079,
+      "step": 316
+    },
+    {
+      "epoch": 0.21926335811862355,
+      "grad_norm": 1.1354854106903076,
+      "learning_rate": 0.0008230152949745084,
+      "loss": 1.3049,
+      "step": 317
+    },
+    {
+      "epoch": 0.2199550406363479,
+      "grad_norm": 0.8098461031913757,
+      "learning_rate": 0.0008222869628550618,
+      "loss": 1.1503,
+      "step": 318
+    },
+    {
+      "epoch": 0.22064672315407227,
+      "grad_norm": 0.7209709286689758,
+      "learning_rate": 0.0008215586307356154,
+      "loss": 1.9659,
+      "step": 319
+    },
+    {
+      "epoch": 0.22133840567179663,
+      "grad_norm": 0.6464136838912964,
+      "learning_rate": 0.0008208302986161689,
+      "loss": 1.8093,
+      "step": 320
+    },
+    {
+      "epoch": 0.222030088189521,
+      "grad_norm": 2.2832796573638916,
+      "learning_rate": 0.0008201019664967226,
+      "loss": 1.2252,
+      "step": 321
+    },
+    {
+      "epoch": 0.2227217707072454,
+      "grad_norm": 0.6651481986045837,
+      "learning_rate": 0.0008193736343772761,
+      "loss": 1.8967,
+      "step": 322
+    },
+    {
+      "epoch": 0.22341345322496975,
+      "grad_norm": 0.5639248490333557,
+      "learning_rate": 0.0008186453022578297,
+      "loss": 1.1423,
+      "step": 323
+    },
+    {
+      "epoch": 0.2241051357426941,
+      "grad_norm": 0.6734063029289246,
+      "learning_rate": 0.0008179169701383831,
+      "loss": 1.7355,
+      "step": 324
+    },
+    {
+      "epoch": 0.22479681826041847,
+      "grad_norm": 0.8061289191246033,
+      "learning_rate": 0.0008171886380189366,
+      "loss": 0.8081,
+      "step": 325
+    },
+    {
+      "epoch": 0.22548850077814284,
+      "grad_norm": 0.584674060344696,
+      "learning_rate": 0.0008164603058994902,
+      "loss": 1.1589,
+      "step": 326
+    },
+    {
+      "epoch": 0.2261801832958672,
+      "grad_norm": 0.6683285236358643,
+      "learning_rate": 0.0008157319737800437,
+      "loss": 1.7779,
+      "step": 327
+    },
+    {
+      "epoch": 0.22687186581359156,
+      "grad_norm": 0.7037453055381775,
+      "learning_rate": 0.0008150036416605973,
+      "loss": 2.0579,
+      "step": 328
+    },
+    {
+      "epoch": 0.22756354833131592,
+      "grad_norm": 0.6727277636528015,
+      "learning_rate": 0.0008142753095411508,
+      "loss": 1.7619,
+      "step": 329
+    },
+    {
+      "epoch": 0.22825523084904029,
+      "grad_norm": 0.7048072218894958,
+      "learning_rate": 0.0008135469774217044,
+      "loss": 1.7445,
+      "step": 330
+    },
+    {
+      "epoch": 0.22894691336676465,
+      "grad_norm": 0.5675456523895264,
+      "learning_rate": 0.0008128186453022578,
+      "loss": 1.8732,
+      "step": 331
+    },
+    {
+      "epoch": 0.229638595884489,
+      "grad_norm": 0.5742422342300415,
+      "learning_rate": 0.0008120903131828113,
+      "loss": 1.9362,
+      "step": 332
+    },
+    {
+      "epoch": 0.23033027840221337,
+      "grad_norm": 0.612397313117981,
+      "learning_rate": 0.0008113619810633649,
+      "loss": 1.3922,
+      "step": 333
+    },
+    {
+      "epoch": 0.23102196091993774,
+      "grad_norm": 0.5459281802177429,
+      "learning_rate": 0.0008106336489439184,
+      "loss": 1.823,
+      "step": 334
+    },
+    {
+      "epoch": 0.23171364343766213,
+      "grad_norm": 0.6739487051963806,
+      "learning_rate": 0.000809905316824472,
+      "loss": 1.9681,
+      "step": 335
+    },
+    {
+      "epoch": 0.2324053259553865,
+      "grad_norm": 0.550207257270813,
+      "learning_rate": 0.0008091769847050255,
+      "loss": 1.8516,
+      "step": 336
+    },
+    {
+      "epoch": 0.23309700847311085,
+      "grad_norm": 0.45742911100387573,
+      "learning_rate": 0.0008084486525855791,
+      "loss": 1.6626,
+      "step": 337
+    },
+    {
+      "epoch": 0.2337886909908352,
+      "grad_norm": 0.764325737953186,
+      "learning_rate": 0.0008077203204661325,
+      "loss": 1.7487,
+      "step": 338
+    },
+    {
+      "epoch": 0.23448037350855958,
+      "grad_norm": 0.5911192297935486,
+      "learning_rate": 0.000806991988346686,
+      "loss": 1.2527,
+      "step": 339
+    },
+    {
+      "epoch": 0.23517205602628394,
+      "grad_norm": 0.5554788708686829,
+      "learning_rate": 0.0008062636562272396,
+      "loss": 1.2681,
+      "step": 340
+    },
+    {
+      "epoch": 0.2358637385440083,
+      "grad_norm": 0.5522503852844238,
+      "learning_rate": 0.0008055353241077931,
+      "loss": 1.4677,
+      "step": 341
+    },
+    {
+      "epoch": 0.23655542106173266,
+      "grad_norm": 0.5872853994369507,
+      "learning_rate": 0.0008048069919883468,
+      "loss": 1.7502,
+      "step": 342
+    },
+    {
+      "epoch": 0.23724710357945702,
+      "grad_norm": 0.6910445690155029,
+      "learning_rate": 0.0008040786598689003,
+      "loss": 2.1711,
+      "step": 343
+    },
+    {
+      "epoch": 0.2379387860971814,
+      "grad_norm": 0.7674363255500793,
+      "learning_rate": 0.0008033503277494538,
+      "loss": 1.0802,
+      "step": 344
+    },
+    {
+      "epoch": 0.23863046861490575,
+      "grad_norm": 0.8880471587181091,
+      "learning_rate": 0.0008026219956300073,
+      "loss": 1.3592,
+      "step": 345
+    },
+    {
+      "epoch": 0.2393221511326301,
+      "grad_norm": 0.6581029295921326,
+      "learning_rate": 0.0008018936635105608,
+      "loss": 1.6392,
+      "step": 346
+    },
+    {
+      "epoch": 0.24001383365035447,
+      "grad_norm": 1.1078494787216187,
+      "learning_rate": 0.0008011653313911144,
+      "loss": 1.5557,
+      "step": 347
+    },
+    {
+      "epoch": 0.24070551616807886,
+      "grad_norm": 0.5563998818397522,
+      "learning_rate": 0.0008004369992716679,
+      "loss": 1.3359,
+      "step": 348
+    },
+    {
+      "epoch": 0.24139719868580323,
+      "grad_norm": 0.6263977885246277,
+      "learning_rate": 0.0007997086671522215,
+      "loss": 2.1702,
+      "step": 349
+    },
+    {
+      "epoch": 0.2420888812035276,
+      "grad_norm": 0.5776513814926147,
+      "learning_rate": 0.000798980335032775,
+      "loss": 1.9447,
+      "step": 350
+    },
+    {
+      "epoch": 0.24278056372125195,
+      "grad_norm": 0.748920202255249,
+      "learning_rate": 0.0007982520029133285,
+      "loss": 1.4751,
+      "step": 351
+    },
+    {
+      "epoch": 0.24347224623897631,
+      "grad_norm": 1.3247708082199097,
+      "learning_rate": 0.000797523670793882,
+      "loss": 1.7557,
+      "step": 352
+    },
+    {
+      "epoch": 0.24416392875670068,
+      "grad_norm": 0.7095309495925903,
+      "learning_rate": 0.0007967953386744355,
+      "loss": 2.2349,
+      "step": 353
+    },
+    {
+      "epoch": 0.24485561127442504,
+      "grad_norm": 0.532289981842041,
+      "learning_rate": 0.0007960670065549891,
+      "loss": 1.1916,
+      "step": 354
+    },
+    {
+      "epoch": 0.2455472937921494,
+      "grad_norm": 0.6105953454971313,
+      "learning_rate": 0.0007953386744355426,
+      "loss": 1.6084,
+      "step": 355
+    },
+    {
+      "epoch": 0.24623897630987376,
+      "grad_norm": 0.6233397126197815,
+      "learning_rate": 0.0007946103423160962,
+      "loss": 1.7177,
+      "step": 356
+    },
+    {
+      "epoch": 0.24693065882759813,
+      "grad_norm": 10.080041885375977,
+      "learning_rate": 0.0007938820101966497,
+      "loss": 2.1116,
+      "step": 357
+    },
+    {
+      "epoch": 0.2476223413453225,
+      "grad_norm": 0.5390161275863647,
+      "learning_rate": 0.0007931536780772032,
+      "loss": 1.7296,
+      "step": 358
+    },
+    {
+      "epoch": 0.24831402386304685,
+      "grad_norm": 1.2583034038543701,
+      "learning_rate": 0.0007924253459577567,
+      "loss": 1.4504,
+      "step": 359
+    },
+    {
+      "epoch": 0.2490057063807712,
+      "grad_norm": 0.6620193719863892,
+      "learning_rate": 0.0007916970138383102,
+      "loss": 1.9403,
+      "step": 360
+    },
+    {
+      "epoch": 0.2496973888984956,
+      "grad_norm": 0.8169893622398376,
+      "learning_rate": 0.0007909686817188638,
+      "loss": 1.2534,
+      "step": 361
+    },
+    {
+      "epoch": 0.25038907141621997,
+      "grad_norm": 0.693074643611908,
+      "learning_rate": 0.0007902403495994173,
+      "loss": 2.112,
+      "step": 362
+    },
+    {
+      "epoch": 0.2510807539339443,
+      "grad_norm": 0.628724217414856,
+      "learning_rate": 0.000789512017479971,
+      "loss": 2.0005,
+      "step": 363
+    },
+    {
+      "epoch": 0.2517724364516687,
+      "grad_norm": 0.6025403141975403,
+      "learning_rate": 0.0007887836853605243,
+      "loss": 1.0489,
+      "step": 364
+    },
+    {
+      "epoch": 0.252464118969393,
+      "grad_norm": 0.6881316900253296,
+      "learning_rate": 0.000788055353241078,
+      "loss": 1.1542,
+      "step": 365
+    },
+    {
+      "epoch": 0.2531558014871174,
+      "grad_norm": 1.035561442375183,
+      "learning_rate": 0.0007873270211216315,
+      "loss": 1.5017,
+      "step": 366
+    },
+    {
+      "epoch": 0.25384748400484175,
+      "grad_norm": 0.5408887267112732,
+      "learning_rate": 0.000786598689002185,
+      "loss": 2.0277,
+      "step": 367
+    },
+    {
+      "epoch": 0.25453916652256614,
+      "grad_norm": 0.5508919358253479,
+      "learning_rate": 0.0007858703568827386,
+      "loss": 1.5144,
+      "step": 368
+    },
+    {
+      "epoch": 0.25523084904029053,
+      "grad_norm": 0.9890360236167908,
+      "learning_rate": 0.0007851420247632921,
+      "loss": 1.0548,
+      "step": 369
+    },
+    {
+      "epoch": 0.25592253155801487,
+      "grad_norm": 0.6218384504318237,
+      "learning_rate": 0.0007844136926438457,
+      "loss": 1.4775,
+      "step": 370
+    },
+    {
+      "epoch": 0.25661421407573926,
+      "grad_norm": 0.5427407622337341,
+      "learning_rate": 0.0007836853605243991,
+      "loss": 1.6648,
+      "step": 371
+    },
+    {
+      "epoch": 0.2573058965934636,
+      "grad_norm": 0.6376339793205261,
+      "learning_rate": 0.0007829570284049527,
+      "loss": 1.7319,
+      "step": 372
+    },
+    {
+      "epoch": 0.257997579111188,
+      "grad_norm": 0.5155366063117981,
+      "learning_rate": 0.0007822286962855062,
+      "loss": 1.7143,
+      "step": 373
+    },
+    {
+      "epoch": 0.2586892616289123,
+      "grad_norm": 1.0346859693527222,
+      "learning_rate": 0.0007815003641660597,
+      "loss": 1.96,
+      "step": 374
+    },
+    {
+      "epoch": 0.2593809441466367,
+      "grad_norm": 0.5473276376724243,
+      "learning_rate": 0.0007807720320466133,
+      "loss": 1.8269,
+      "step": 375
+    },
+    {
+      "epoch": 0.26007262666436104,
+      "grad_norm": 0.9501216411590576,
+      "learning_rate": 0.0007800436999271668,
+      "loss": 1.5286,
+      "step": 376
+    },
+    {
+      "epoch": 0.26076430918208543,
+      "grad_norm": 0.4338766634464264,
+      "learning_rate": 0.0007793153678077204,
+      "loss": 0.8967,
+      "step": 377
+    },
+    {
+      "epoch": 0.26145599169980976,
+      "grad_norm": 12.023887634277344,
+      "learning_rate": 0.0007785870356882738,
+      "loss": 1.8385,
+      "step": 378
+    },
+    {
+      "epoch": 0.26214767421753415,
+      "grad_norm": 0.5424131155014038,
+      "learning_rate": 0.0007778587035688274,
+      "loss": 1.7216,
+      "step": 379
+    },
+    {
+      "epoch": 0.2628393567352585,
+      "grad_norm": 0.6199079751968384,
+      "learning_rate": 0.0007771303714493809,
+      "loss": 1.9606,
+      "step": 380
+    },
+    {
+      "epoch": 0.2635310392529829,
+      "grad_norm": 0.6037024259567261,
+      "learning_rate": 0.0007764020393299344,
+      "loss": 1.4306,
+      "step": 381
+    },
+    {
+      "epoch": 0.26422272177070727,
+      "grad_norm": 0.6312823295593262,
+      "learning_rate": 0.000775673707210488,
+      "loss": 1.0085,
+      "step": 382
+    },
+    {
+      "epoch": 0.2649144042884316,
+      "grad_norm": 0.5497464537620544,
+      "learning_rate": 0.0007749453750910415,
+      "loss": 1.4443,
+      "step": 383
+    },
+    {
+      "epoch": 0.265606086806156,
+      "grad_norm": 0.9736106395721436,
+      "learning_rate": 0.000774217042971595,
+      "loss": 1.7324,
+      "step": 384
+    },
+    {
+      "epoch": 0.26629776932388033,
+      "grad_norm": 0.6415931582450867,
+      "learning_rate": 0.0007734887108521485,
+      "loss": 0.9575,
+      "step": 385
+    },
+    {
+      "epoch": 0.2669894518416047,
+      "grad_norm": 0.570580244064331,
+      "learning_rate": 0.0007727603787327021,
+      "loss": 1.7195,
+      "step": 386
+    },
+    {
+      "epoch": 0.26768113435932905,
+      "grad_norm": 0.7033479809761047,
+      "learning_rate": 0.0007720320466132557,
+      "loss": 2.0542,
+      "step": 387
+    },
+    {
+      "epoch": 0.26837281687705344,
+      "grad_norm": 0.7575972676277161,
+      "learning_rate": 0.0007713037144938092,
+      "loss": 1.7199,
+      "step": 388
+    },
+    {
+      "epoch": 0.2690644993947778,
+      "grad_norm": 0.5389835238456726,
+      "learning_rate": 0.0007705753823743628,
+      "loss": 1.5259,
+      "step": 389
+    },
+    {
+      "epoch": 0.26975618191250217,
+      "grad_norm": 0.574540913105011,
+      "learning_rate": 0.0007698470502549163,
+      "loss": 1.5391,
+      "step": 390
+    },
+    {
+      "epoch": 0.2704478644302265,
+      "grad_norm": 0.5298869013786316,
+      "learning_rate": 0.0007691187181354698,
+      "loss": 1.9361,
+      "step": 391
+    },
+    {
+      "epoch": 0.2711395469479509,
+      "grad_norm": 0.5654643177986145,
+      "learning_rate": 0.0007683903860160233,
+      "loss": 1.8952,
+      "step": 392
+    },
+    {
+      "epoch": 0.27183122946567523,
+      "grad_norm": 0.7499473094940186,
+      "learning_rate": 0.0007676620538965769,
+      "loss": 1.0416,
+      "step": 393
+    },
+    {
+      "epoch": 0.2725229119833996,
+      "grad_norm": 0.6296089887619019,
+      "learning_rate": 0.0007669337217771304,
+      "loss": 1.4852,
+      "step": 394
+    },
+    {
+      "epoch": 0.273214594501124,
+      "grad_norm": 0.5401056408882141,
+      "learning_rate": 0.0007662053896576839,
+      "loss": 1.8153,
+      "step": 395
+    },
+    {
+      "epoch": 0.27390627701884834,
+      "grad_norm": 0.5954565405845642,
+      "learning_rate": 0.0007654770575382375,
+      "loss": 1.7855,
+      "step": 396
+    },
+    {
+      "epoch": 0.27459795953657273,
+      "grad_norm": 0.9156423211097717,
+      "learning_rate": 0.000764748725418791,
+      "loss": 1.9173,
+      "step": 397
+    },
+    {
+      "epoch": 0.27528964205429707,
+      "grad_norm": 0.6210983395576477,
+      "learning_rate": 0.0007640203932993445,
+      "loss": 1.9079,
+      "step": 398
+    },
+    {
+      "epoch": 0.27598132457202146,
+      "grad_norm": 0.529227077960968,
+      "learning_rate": 0.000763292061179898,
+      "loss": 1.8594,
+      "step": 399
+    },
+    {
+      "epoch": 0.2766730070897458,
+      "grad_norm": 0.80283522605896,
+      "learning_rate": 0.0007625637290604516,
+      "loss": 1.5913,
+      "step": 400
+    },
+    {
+      "epoch": 0.2773646896074702,
+      "grad_norm": 0.5629950761795044,
+      "learning_rate": 0.0007618353969410051,
+      "loss": 1.5373,
+      "step": 401
+    },
+    {
+      "epoch": 0.2780563721251945,
+      "grad_norm": 0.6493797898292542,
+      "learning_rate": 0.0007611070648215586,
+      "loss": 2.2277,
+      "step": 402
+    },
+    {
+      "epoch": 0.2787480546429189,
+      "grad_norm": 0.5912362933158875,
+      "learning_rate": 0.0007603787327021122,
+      "loss": 1.4388,
+      "step": 403
+    },
+    {
+      "epoch": 0.27943973716064324,
+      "grad_norm": 0.7361041307449341,
+      "learning_rate": 0.0007596504005826656,
+      "loss": 0.9273,
+      "step": 404
+    },
+    {
+      "epoch": 0.28013141967836763,
+      "grad_norm": 0.8257749676704407,
+      "learning_rate": 0.0007589220684632192,
+      "loss": 0.7073,
+      "step": 405
+    },
+    {
+      "epoch": 0.28082310219609197,
+      "grad_norm": 0.8185616731643677,
+      "learning_rate": 0.0007581937363437727,
+      "loss": 1.9928,
+      "step": 406
+    },
+    {
+      "epoch": 0.28151478471381636,
+      "grad_norm": 0.5865523219108582,
+      "learning_rate": 0.0007574654042243263,
+      "loss": 1.1638,
+      "step": 407
+    },
+    {
+      "epoch": 0.28220646723154075,
+      "grad_norm": 0.5210615396499634,
+      "learning_rate": 0.0007567370721048798,
+      "loss": 1.1934,
+      "step": 408
+    },
+    {
+      "epoch": 0.2828981497492651,
+      "grad_norm": 0.6531309485435486,
+      "learning_rate": 0.0007560087399854334,
+      "loss": 1.8601,
+      "step": 409
+    },
+    {
+      "epoch": 0.28358983226698947,
+      "grad_norm": 0.7874279022216797,
+      "learning_rate": 0.000755280407865987,
+      "loss": 1.4043,
+      "step": 410
+    },
+    {
+      "epoch": 0.2842815147847138,
+      "grad_norm": 1.121983289718628,
+      "learning_rate": 0.0007545520757465404,
+      "loss": 1.7112,
+      "step": 411
+    },
+    {
+      "epoch": 0.2849731973024382,
+      "grad_norm": 0.5046870708465576,
+      "learning_rate": 0.000753823743627094,
+      "loss": 1.6255,
+      "step": 412
+    },
+    {
+      "epoch": 0.28566487982016253,
+      "grad_norm": 0.4254264831542969,
+      "learning_rate": 0.0007530954115076475,
+      "loss": 1.3181,
+      "step": 413
+    },
+    {
+      "epoch": 0.2863565623378869,
+      "grad_norm": 0.8146479725837708,
+      "learning_rate": 0.0007523670793882011,
+      "loss": 1.7557,
+      "step": 414
+    },
+    {
+      "epoch": 0.28704824485561126,
+      "grad_norm": 0.47856444120407104,
+      "learning_rate": 0.0007516387472687546,
+      "loss": 1.6217,
+      "step": 415
+    },
+    {
+      "epoch": 0.28773992737333565,
+      "grad_norm": 0.5287722945213318,
+      "learning_rate": 0.0007509104151493081,
+      "loss": 1.8146,
+      "step": 416
+    },
+    {
+      "epoch": 0.28843160989106,
+      "grad_norm": 0.8364676833152771,
+      "learning_rate": 0.0007501820830298617,
+      "loss": 1.1081,
+      "step": 417
+    },
+    {
+      "epoch": 0.28912329240878437,
+      "grad_norm": 0.6923417448997498,
+      "learning_rate": 0.0007494537509104151,
+      "loss": 1.0822,
+      "step": 418
+    },
+    {
+      "epoch": 0.2898149749265087,
+      "grad_norm": 0.7535339593887329,
+      "learning_rate": 0.0007487254187909687,
+      "loss": 1.8013,
+      "step": 419
+    },
+    {
+      "epoch": 0.2905066574442331,
+      "grad_norm": 1.226645588874817,
+      "learning_rate": 0.0007479970866715222,
+      "loss": 1.1891,
+      "step": 420
+    },
+    {
+      "epoch": 0.2911983399619575,
+      "grad_norm": 0.7388406991958618,
+      "learning_rate": 0.0007472687545520758,
+      "loss": 1.0448,
+      "step": 421
+    },
+    {
+      "epoch": 0.2918900224796818,
+      "grad_norm": 0.6585919260978699,
+      "learning_rate": 0.0007465404224326293,
+      "loss": 1.0951,
+      "step": 422
+    },
+    {
+      "epoch": 0.2925817049974062,
+      "grad_norm": 0.7637200355529785,
+      "learning_rate": 0.0007458120903131828,
+      "loss": 1.3508,
+      "step": 423
+    },
+    {
+      "epoch": 0.29327338751513055,
+      "grad_norm": 0.5754939913749695,
+      "learning_rate": 0.0007450837581937363,
+      "loss": 1.4937,
+      "step": 424
+    },
+    {
+      "epoch": 0.29396507003285494,
+      "grad_norm": 0.6434321999549866,
+      "learning_rate": 0.0007443554260742898,
+      "loss": 2.0318,
+      "step": 425
+    },
+    {
+      "epoch": 0.29465675255057927,
+      "grad_norm": 0.7063912749290466,
+      "learning_rate": 0.0007436270939548434,
+      "loss": 2.0036,
+      "step": 426
+    },
+    {
+      "epoch": 0.29534843506830366,
+      "grad_norm": 0.5120965242385864,
+      "learning_rate": 0.0007428987618353969,
+      "loss": 1.7687,
+      "step": 427
+    },
+    {
+      "epoch": 0.296040117586028,
+      "grad_norm": 0.7403333187103271,
+      "learning_rate": 0.0007421704297159505,
+      "loss": 1.5747,
+      "step": 428
+    },
+    {
+      "epoch": 0.2967318001037524,
+      "grad_norm": 0.5760396122932434,
+      "learning_rate": 0.000741442097596504,
+      "loss": 1.7351,
+      "step": 429
+    },
+    {
+      "epoch": 0.2974234826214767,
+      "grad_norm": 0.6725696325302124,
+      "learning_rate": 0.0007407137654770575,
+      "loss": 1.6643,
+      "step": 430
+    },
+    {
+      "epoch": 0.2981151651392011,
+      "grad_norm": 0.5612234473228455,
+      "learning_rate": 0.000739985433357611,
+      "loss": 1.47,
+      "step": 431
+    },
+    {
+      "epoch": 0.29880684765692545,
+      "grad_norm": 0.48072177171707153,
+      "learning_rate": 0.0007392571012381646,
+      "loss": 1.2806,
+      "step": 432
+    },
+    {
+      "epoch": 0.29949853017464984,
+      "grad_norm": 0.6465651988983154,
+      "learning_rate": 0.0007385287691187182,
+      "loss": 1.7875,
+      "step": 433
+    },
+    {
+      "epoch": 0.3001902126923742,
+      "grad_norm": 0.7127341628074646,
+      "learning_rate": 0.0007378004369992717,
+      "loss": 1.9259,
+      "step": 434
+    },
+    {
+      "epoch": 0.30088189521009856,
+      "grad_norm": 0.5775954127311707,
+      "learning_rate": 0.0007370721048798253,
+      "loss": 1.6652,
+      "step": 435
+    },
+    {
+      "epoch": 0.30157357772782295,
+      "grad_norm": 0.6212239861488342,
+      "learning_rate": 0.0007363437727603788,
+      "loss": 1.7045,
+      "step": 436
+    },
+    {
+      "epoch": 0.3022652602455473,
+      "grad_norm": 0.6443663835525513,
+      "learning_rate": 0.0007356154406409323,
+      "loss": 1.9416,
+      "step": 437
+    },
+    {
+      "epoch": 0.3029569427632717,
+      "grad_norm": 0.6374452710151672,
+      "learning_rate": 0.0007348871085214858,
+      "loss": 1.7767,
+      "step": 438
+    },
+    {
+      "epoch": 0.303648625280996,
+      "grad_norm": 0.7170910835266113,
+      "learning_rate": 0.0007341587764020393,
+      "loss": 1.1563,
+      "step": 439
+    },
+    {
+      "epoch": 0.3043403077987204,
+      "grad_norm": 1.249282717704773,
+      "learning_rate": 0.0007334304442825929,
+      "loss": 1.7667,
+      "step": 440
+    },
+    {
+      "epoch": 0.30503199031644473,
+      "grad_norm": 0.6763089895248413,
+      "learning_rate": 0.0007327021121631464,
+      "loss": 1.9292,
+      "step": 441
+    },
+    {
+      "epoch": 0.3057236728341691,
+      "grad_norm": 0.7364800572395325,
+      "learning_rate": 0.0007319737800437,
+      "loss": 1.6102,
+      "step": 442
+    },
+    {
+      "epoch": 0.30641535535189346,
+      "grad_norm": 0.8224323987960815,
+      "learning_rate": 0.0007312454479242535,
+      "loss": 1.0648,
+      "step": 443
+    },
+    {
+      "epoch": 0.30710703786961785,
+      "grad_norm": 0.752155065536499,
+      "learning_rate": 0.0007305171158048069,
+      "loss": 1.788,
+      "step": 444
+    },
+    {
+      "epoch": 0.3077987203873422,
+      "grad_norm": 0.5755220651626587,
+      "learning_rate": 0.0007297887836853605,
+      "loss": 1.2096,
+      "step": 445
+    },
+    {
+      "epoch": 0.3084904029050666,
+      "grad_norm": 0.8400484323501587,
+      "learning_rate": 0.000729060451565914,
+      "loss": 1.3676,
+      "step": 446
+    },
+    {
+      "epoch": 0.30918208542279096,
+      "grad_norm": 0.5796182155609131,
+      "learning_rate": 0.0007283321194464676,
+      "loss": 1.5668,
+      "step": 447
+    },
+    {
+      "epoch": 0.3098737679405153,
+      "grad_norm": 0.985273003578186,
+      "learning_rate": 0.0007276037873270211,
+      "loss": 1.4033,
+      "step": 448
+    },
+    {
+      "epoch": 0.3105654504582397,
+      "grad_norm": 0.6488866209983826,
+      "learning_rate": 0.0007268754552075747,
+      "loss": 1.5978,
+      "step": 449
+    },
+    {
+      "epoch": 0.311257132975964,
+      "grad_norm": 0.5811178088188171,
+      "learning_rate": 0.0007261471230881282,
+      "loss": 2.1722,
+      "step": 450
+    },
+    {
+      "epoch": 0.3119488154936884,
+      "grad_norm": 0.5769645571708679,
+      "learning_rate": 0.0007254187909686816,
+      "loss": 1.9041,
+      "step": 451
+    },
+    {
+      "epoch": 0.31264049801141275,
+      "grad_norm": 0.769631028175354,
+      "learning_rate": 0.0007246904588492352,
+      "loss": 1.3963,
+      "step": 452
+    },
+    {
+      "epoch": 0.31333218052913714,
+      "grad_norm": 0.8301665186882019,
+      "learning_rate": 0.0007239621267297888,
+      "loss": 1.623,
+      "step": 453
+    },
+    {
+      "epoch": 0.3140238630468615,
+      "grad_norm": 0.6046326756477356,
+      "learning_rate": 0.0007232337946103424,
+      "loss": 1.8703,
+      "step": 454
+    },
+    {
+      "epoch": 0.31471554556458586,
+      "grad_norm": 0.5623071193695068,
+      "learning_rate": 0.0007225054624908959,
+      "loss": 1.6112,
+      "step": 455
+    },
+    {
+      "epoch": 0.3154072280823102,
+      "grad_norm": 0.7813363671302795,
+      "learning_rate": 0.0007217771303714495,
+      "loss": 1.681,
+      "step": 456
+    },
+    {
+      "epoch": 0.3160989106000346,
+      "grad_norm": 0.6935021877288818,
+      "learning_rate": 0.000721048798252003,
+      "loss": 1.5276,
+      "step": 457
+    },
+    {
+      "epoch": 0.3167905931177589,
+      "grad_norm": 1.0678547620773315,
+      "learning_rate": 0.0007203204661325564,
+      "loss": 1.2066,
+      "step": 458
+    },
+    {
+      "epoch": 0.3174822756354833,
+      "grad_norm": 0.985817551612854,
+      "learning_rate": 0.00071959213401311,
+      "loss": 1.5923,
+      "step": 459
+    },
+    {
+      "epoch": 0.3181739581532077,
+      "grad_norm": 0.6185691356658936,
+      "learning_rate": 0.0007188638018936635,
+      "loss": 1.1597,
+      "step": 460
+    },
+    {
+      "epoch": 0.31886564067093204,
+      "grad_norm": 0.6517722010612488,
+      "learning_rate": 0.0007181354697742171,
+      "loss": 1.0909,
+      "step": 461
+    },
+    {
+      "epoch": 0.31955732318865643,
+      "grad_norm": 0.6660693883895874,
+      "learning_rate": 0.0007174071376547706,
+      "loss": 1.8785,
+      "step": 462
+    },
+    {
+      "epoch": 0.32024900570638076,
+      "grad_norm": 0.8916088938713074,
+      "learning_rate": 0.0007166788055353242,
+      "loss": 1.0246,
+      "step": 463
+    },
+    {
+      "epoch": 0.32094068822410515,
+      "grad_norm": 0.6262109875679016,
+      "learning_rate": 0.0007159504734158776,
+      "loss": 1.8117,
+      "step": 464
+    },
+    {
+      "epoch": 0.3216323707418295,
+      "grad_norm": 0.5265359878540039,
+      "learning_rate": 0.0007152221412964311,
+      "loss": 1.8935,
+      "step": 465
+    },
+    {
+      "epoch": 0.3223240532595539,
+      "grad_norm": 0.584057629108429,
+      "learning_rate": 0.0007144938091769847,
+      "loss": 2.1743,
+      "step": 466
+    },
+    {
+      "epoch": 0.3230157357772782,
+      "grad_norm": 0.6198194026947021,
+      "learning_rate": 0.0007137654770575382,
+      "loss": 1.0213,
+      "step": 467
+    },
+    {
+      "epoch": 0.3237074182950026,
+      "grad_norm": 0.8011792898178101,
+      "learning_rate": 0.0007130371449380918,
+      "loss": 1.3635,
+      "step": 468
+    },
+    {
+      "epoch": 0.32439910081272694,
+      "grad_norm": 0.6226928234100342,
+      "learning_rate": 0.0007123088128186453,
+      "loss": 1.5389,
+      "step": 469
+    },
+    {
+      "epoch": 0.3250907833304513,
+      "grad_norm": 0.6563382744789124,
+      "learning_rate": 0.0007115804806991989,
+      "loss": 1.6626,
+      "step": 470
+    },
+    {
+      "epoch": 0.32578246584817566,
+      "grad_norm": 0.6689289808273315,
+      "learning_rate": 0.0007108521485797523,
+      "loss": 0.2715,
+      "step": 471
+    },
+    {
+      "epoch": 0.32647414836590005,
+      "grad_norm": 0.7439026832580566,
+      "learning_rate": 0.0007101238164603058,
+      "loss": 1.9645,
+      "step": 472
+    },
+    {
+      "epoch": 0.32716583088362444,
+      "grad_norm": 0.6306619048118591,
+      "learning_rate": 0.0007093954843408594,
+      "loss": 2.026,
+      "step": 473
+    },
+    {
+      "epoch": 0.3278575134013488,
+      "grad_norm": 0.5557575225830078,
+      "learning_rate": 0.000708667152221413,
+      "loss": 1.7433,
+      "step": 474
+    },
+    {
+      "epoch": 0.32854919591907317,
+      "grad_norm": 0.6485971212387085,
+      "learning_rate": 0.0007079388201019666,
+      "loss": 1.8697,
+      "step": 475
+    },
+    {
+      "epoch": 0.3292408784367975,
+      "grad_norm": 0.6541025042533875,
+      "learning_rate": 0.0007072104879825201,
+      "loss": 1.8979,
+      "step": 476
+    },
+    {
+      "epoch": 0.3299325609545219,
+      "grad_norm": 0.617359459400177,
+      "learning_rate": 0.0007064821558630737,
+      "loss": 2.1617,
+      "step": 477
+    },
+    {
+      "epoch": 0.3306242434722462,
+      "grad_norm": 0.5855705142021179,
+      "learning_rate": 0.0007057538237436271,
+      "loss": 1.8079,
+      "step": 478
+    },
+    {
+      "epoch": 0.3313159259899706,
+      "grad_norm": 0.5983846187591553,
+      "learning_rate": 0.0007050254916241806,
+      "loss": 1.8796,
+      "step": 479
+    },
+    {
+      "epoch": 0.33200760850769495,
+      "grad_norm": 1.1877782344818115,
+      "learning_rate": 0.0007042971595047342,
+      "loss": 1.5439,
+      "step": 480
+    },
+    {
+      "epoch": 0.33269929102541934,
+      "grad_norm": 0.48594361543655396,
+      "learning_rate": 0.0007035688273852877,
+      "loss": 1.3418,
+      "step": 481
+    },
+    {
+      "epoch": 0.3333909735431437,
+      "grad_norm": 1.506446123123169,
+      "learning_rate": 0.0007028404952658413,
+      "loss": 1.2715,
+      "step": 482
+    },
+    {
+      "epoch": 0.33408265606086807,
+      "grad_norm": 1.1047416925430298,
+      "learning_rate": 0.0007021121631463948,
+      "loss": 1.6349,
+      "step": 483
+    },
+    {
+      "epoch": 0.3347743385785924,
+      "grad_norm": 0.6083149909973145,
+      "learning_rate": 0.0007013838310269483,
+      "loss": 1.9314,
+      "step": 484
+    },
+    {
+      "epoch": 0.3354660210963168,
+      "grad_norm": 1.35393226146698,
+      "learning_rate": 0.0007006554989075018,
+      "loss": 1.8499,
+      "step": 485
+    },
+    {
+      "epoch": 0.3361577036140412,
+      "grad_norm": 0.9483690857887268,
+      "learning_rate": 0.0006999271667880553,
+      "loss": 1.51,
+      "step": 486
+    },
+    {
+      "epoch": 0.3368493861317655,
+      "grad_norm": 0.5228135585784912,
+      "learning_rate": 0.0006991988346686089,
+      "loss": 1.9898,
+      "step": 487
+    },
+    {
+      "epoch": 0.3375410686494899,
+      "grad_norm": 0.5586177706718445,
+      "learning_rate": 0.0006984705025491624,
+      "loss": 1.3041,
+      "step": 488
+    },
+    {
+      "epoch": 0.33823275116721424,
+      "grad_norm": 0.5928153991699219,
+      "learning_rate": 0.000697742170429716,
+      "loss": 2.0132,
+      "step": 489
+    },
+    {
+      "epoch": 0.33892443368493863,
+      "grad_norm": 1.3419654369354248,
+      "learning_rate": 0.0006970138383102695,
+      "loss": 1.0039,
+      "step": 490
+    },
+    {
+      "epoch": 0.33961611620266297,
+      "grad_norm": 0.6874161958694458,
+      "learning_rate": 0.000696285506190823,
+      "loss": 1.2347,
+      "step": 491
+    },
+    {
+      "epoch": 0.34030779872038736,
+      "grad_norm": 0.6180602312088013,
+      "learning_rate": 0.0006955571740713765,
+      "loss": 1.1032,
+      "step": 492
+    },
+    {
+      "epoch": 0.3409994812381117,
+      "grad_norm": 0.6274152994155884,
+      "learning_rate": 0.00069482884195193,
+      "loss": 1.8178,
+      "step": 493
+    },
+    {
+      "epoch": 0.3416911637558361,
+      "grad_norm": 0.6605129837989807,
+      "learning_rate": 0.0006941005098324836,
+      "loss": 1.4469,
+      "step": 494
+    },
+    {
+      "epoch": 0.3423828462735604,
+      "grad_norm": 0.5513582229614258,
+      "learning_rate": 0.0006933721777130371,
+      "loss": 1.6028,
+      "step": 495
+    },
+    {
+      "epoch": 0.3430745287912848,
+      "grad_norm": 0.4937622845172882,
+      "learning_rate": 0.0006926438455935908,
+      "loss": 1.676,
+      "step": 496
+    },
+    {
+      "epoch": 0.34376621130900914,
+      "grad_norm": 0.6980156302452087,
+      "learning_rate": 0.0006919155134741443,
+      "loss": 1.8214,
+      "step": 497
+    },
+    {
+      "epoch": 0.34445789382673353,
+      "grad_norm": 0.589282214641571,
+      "learning_rate": 0.0006911871813546978,
+      "loss": 1.5903,
+      "step": 498
+    },
+    {
+      "epoch": 0.3451495763444579,
+      "grad_norm": 0.5733899474143982,
+      "learning_rate": 0.0006904588492352513,
+      "loss": 2.0085,
+      "step": 499
+    },
+    {
+      "epoch": 0.34584125886218225,
+      "grad_norm": 13.85383129119873,
+      "learning_rate": 0.0006897305171158048,
+      "loss": 1.3935,
+      "step": 500
+    },
+    {
+      "epoch": 0.34584125886218225,
+      "eval_loss": 1.4428730010986328,
+      "eval_runtime": 586.3776,
+      "eval_samples_per_second": 2.191,
+      "eval_steps_per_second": 1.097,
+      "step": 500
+    },
+    {
+      "epoch": 0.34653294137990664,
+      "grad_norm": 0.7078066468238831,
+      "learning_rate": 0.0006890021849963584,
+      "loss": 2.1292,
+      "step": 501
+    },
+    {
+      "epoch": 0.347224623897631,
+      "grad_norm": 0.5394365787506104,
+      "learning_rate": 0.0006882738528769119,
+      "loss": 1.754,
+      "step": 502
+    },
+    {
+      "epoch": 0.34791630641535537,
+      "grad_norm": 0.5652272701263428,
+      "learning_rate": 0.0006875455207574655,
+      "loss": 1.7321,
+      "step": 503
+    },
+    {
+      "epoch": 0.3486079889330797,
+      "grad_norm": 0.7590258717536926,
+      "learning_rate": 0.0006868171886380189,
+      "loss": 1.1914,
+      "step": 504
+    },
+    {
+      "epoch": 0.3492996714508041,
+      "grad_norm": 0.6021602153778076,
+      "learning_rate": 0.0006860888565185725,
+      "loss": 1.4885,
+      "step": 505
+    },
+    {
+      "epoch": 0.34999135396852843,
+      "grad_norm": 0.5702530741691589,
+      "learning_rate": 0.000685360524399126,
+      "loss": 1.7321,
+      "step": 506
+    },
+    {
+      "epoch": 0.3506830364862528,
+      "grad_norm": 0.6997826099395752,
+      "learning_rate": 0.0006846321922796795,
+      "loss": 1.2399,
+      "step": 507
+    },
+    {
+      "epoch": 0.35137471900397715,
+      "grad_norm": 0.5168879628181458,
+      "learning_rate": 0.0006839038601602331,
+      "loss": 1.1561,
+      "step": 508
+    },
+    {
+      "epoch": 0.35206640152170154,
+      "grad_norm": 0.7245921492576599,
+      "learning_rate": 0.0006831755280407866,
+      "loss": 1.3214,
+      "step": 509
+    },
+    {
+      "epoch": 0.3527580840394259,
+      "grad_norm": 0.5683633685112,
+      "learning_rate": 0.0006824471959213402,
+      "loss": 0.8284,
+      "step": 510
+    },
+    {
+      "epoch": 0.35344976655715027,
+      "grad_norm": 0.6745566725730896,
+      "learning_rate": 0.0006817188638018936,
+      "loss": 1.3734,
+      "step": 511
+    },
+    {
+      "epoch": 0.35414144907487466,
+      "grad_norm": 1.4038891792297363,
+      "learning_rate": 0.0006809905316824472,
+      "loss": 1.5278,
+      "step": 512
+    },
+    {
+      "epoch": 0.354833131592599,
+      "grad_norm": 0.633068323135376,
+      "learning_rate": 0.0006802621995630007,
+      "loss": 1.9671,
+      "step": 513
+    },
+    {
+      "epoch": 0.3555248141103234,
+      "grad_norm": 0.6043351888656616,
+      "learning_rate": 0.0006795338674435542,
+      "loss": 2.0149,
+      "step": 514
+    },
+    {
+      "epoch": 0.3562164966280477,
+      "grad_norm": 0.68370121717453,
+      "learning_rate": 0.0006788055353241078,
+      "loss": 2.1092,
+      "step": 515
+    },
+    {
+      "epoch": 0.3569081791457721,
+      "grad_norm": 0.8545238375663757,
+      "learning_rate": 0.0006780772032046613,
+      "loss": 1.8421,
+      "step": 516
+    },
+    {
+      "epoch": 0.35759986166349644,
+      "grad_norm": 0.5773653388023376,
+      "learning_rate": 0.000677348871085215,
+      "loss": 1.3504,
+      "step": 517
+    },
+    {
+      "epoch": 0.35829154418122083,
+      "grad_norm": 0.5424834489822388,
+      "learning_rate": 0.0006766205389657683,
+      "loss": 1.5214,
+      "step": 518
+    },
+    {
+      "epoch": 0.35898322669894517,
+      "grad_norm": 0.6430636644363403,
+      "learning_rate": 0.000675892206846322,
+      "loss": 1.8938,
+      "step": 519
+    },
+    {
+      "epoch": 0.35967490921666956,
+      "grad_norm": 0.7093544602394104,
+      "learning_rate": 0.0006751638747268755,
+      "loss": 1.2654,
+      "step": 520
+    },
+    {
+      "epoch": 0.3603665917343939,
+      "grad_norm": 0.6688354015350342,
+      "learning_rate": 0.000674435542607429,
+      "loss": 1.457,
+      "step": 521
+    },
+    {
+      "epoch": 0.3610582742521183,
+      "grad_norm": 0.6786370873451233,
+      "learning_rate": 0.0006737072104879826,
+      "loss": 1.3456,
+      "step": 522
+    },
+    {
+      "epoch": 0.3617499567698426,
+      "grad_norm": 0.5341997742652893,
+      "learning_rate": 0.0006729788783685361,
+      "loss": 1.9376,
+      "step": 523
+    },
+    {
+      "epoch": 0.362441639287567,
+      "grad_norm": 0.7012555003166199,
+      "learning_rate": 0.0006722505462490896,
+      "loss": 1.2591,
+      "step": 524
+    },
+    {
+      "epoch": 0.3631333218052914,
+      "grad_norm": 0.7682555317878723,
+      "learning_rate": 0.0006715222141296431,
+      "loss": 1.4887,
+      "step": 525
+    },
+    {
+      "epoch": 0.36382500432301573,
+      "grad_norm": 0.8778069615364075,
+      "learning_rate": 0.0006707938820101967,
+      "loss": 1.8948,
+      "step": 526
+    },
+    {
+      "epoch": 0.3645166868407401,
+      "grad_norm": 0.7078661322593689,
+      "learning_rate": 0.0006700655498907502,
+      "loss": 1.8194,
+      "step": 527
+    },
+    {
+      "epoch": 0.36520836935846446,
+      "grad_norm": 0.7950670719146729,
+      "learning_rate": 0.0006693372177713037,
+      "loss": 1.4127,
+      "step": 528
+    },
+    {
+      "epoch": 0.36590005187618885,
+      "grad_norm": 0.5888793468475342,
+      "learning_rate": 0.0006686088856518573,
+      "loss": 1.7251,
+      "step": 529
+    },
+    {
+      "epoch": 0.3665917343939132,
+      "grad_norm": 0.9127269387245178,
+      "learning_rate": 0.0006678805535324108,
+      "loss": 1.3327,
+      "step": 530
+    },
+    {
+      "epoch": 0.3672834169116376,
+      "grad_norm": 0.5952627062797546,
+      "learning_rate": 0.0006671522214129643,
+      "loss": 1.4203,
+      "step": 531
+    },
+    {
+      "epoch": 0.3679750994293619,
+      "grad_norm": 0.4815312922000885,
+      "learning_rate": 0.0006664238892935178,
+      "loss": 0.9024,
+      "step": 532
+    },
+    {
+      "epoch": 0.3686667819470863,
+      "grad_norm": 0.652438759803772,
+      "learning_rate": 0.0006656955571740714,
+      "loss": 1.146,
+      "step": 533
+    },
+    {
+      "epoch": 0.36935846446481063,
+      "grad_norm": 0.7467165589332581,
+      "learning_rate": 0.0006649672250546249,
+      "loss": 1.2516,
+      "step": 534
+    },
+    {
+      "epoch": 0.370050146982535,
+      "grad_norm": 0.5682463049888611,
+      "learning_rate": 0.0006642388929351784,
+      "loss": 1.9983,
+      "step": 535
+    },
+    {
+      "epoch": 0.37074182950025936,
+      "grad_norm": 1.3564231395721436,
+      "learning_rate": 0.000663510560815732,
+      "loss": 1.6168,
+      "step": 536
+    },
+    {
+      "epoch": 0.37143351201798375,
+      "grad_norm": 0.5406346321105957,
+      "learning_rate": 0.0006627822286962855,
+      "loss": 1.8084,
+      "step": 537
+    },
+    {
+      "epoch": 0.37212519453570814,
+      "grad_norm": 0.7050182223320007,
+      "learning_rate": 0.000662053896576839,
+      "loss": 1.8881,
+      "step": 538
+    },
+    {
+      "epoch": 0.37281687705343247,
+      "grad_norm": 0.645821213722229,
+      "learning_rate": 0.0006613255644573925,
+      "loss": 1.9676,
+      "step": 539
+    },
+    {
+      "epoch": 0.37350855957115686,
+      "grad_norm": 0.5666909217834473,
+      "learning_rate": 0.0006605972323379462,
+      "loss": 1.4184,
+      "step": 540
+    },
+    {
+      "epoch": 0.3742002420888812,
+      "grad_norm": 0.8641100525856018,
+      "learning_rate": 0.0006598689002184997,
+      "loss": 2.3462,
+      "step": 541
+    },
+    {
+      "epoch": 0.3748919246066056,
+      "grad_norm": 1.3686286211013794,
+      "learning_rate": 0.0006591405680990532,
+      "loss": 2.1734,
+      "step": 542
+    },
+    {
+      "epoch": 0.3755836071243299,
+      "grad_norm": 0.946014404296875,
+      "learning_rate": 0.0006584122359796068,
+      "loss": 1.5707,
+      "step": 543
+    },
+    {
+      "epoch": 0.3762752896420543,
+      "grad_norm": 0.7506546378135681,
+      "learning_rate": 0.0006576839038601602,
+      "loss": 1.9043,
+      "step": 544
+    },
+    {
+      "epoch": 0.37696697215977865,
+      "grad_norm": 0.5678505301475525,
+      "learning_rate": 0.0006569555717407138,
+      "loss": 1.8435,
+      "step": 545
+    },
+    {
+      "epoch": 0.37765865467750304,
+      "grad_norm": 0.616298496723175,
+      "learning_rate": 0.0006562272396212673,
+      "loss": 1.6757,
+      "step": 546
+    },
+    {
+      "epoch": 0.37835033719522737,
+      "grad_norm": 0.6611154675483704,
+      "learning_rate": 0.0006554989075018209,
+      "loss": 1.997,
+      "step": 547
+    },
+    {
+      "epoch": 0.37904201971295176,
+      "grad_norm": 0.6710254549980164,
+      "learning_rate": 0.0006547705753823744,
+      "loss": 2.0473,
+      "step": 548
+    },
+    {
+      "epoch": 0.3797337022306761,
+      "grad_norm": 0.6638564467430115,
+      "learning_rate": 0.0006540422432629279,
+      "loss": 1.5527,
+      "step": 549
+    },
+    {
+      "epoch": 0.3804253847484005,
+      "grad_norm": 0.5560479164123535,
+      "learning_rate": 0.0006533139111434815,
+      "loss": 1.1808,
+      "step": 550
+    },
+    {
+      "epoch": 0.3811170672661249,
+      "grad_norm": 0.8038564920425415,
+      "learning_rate": 0.0006525855790240349,
+      "loss": 2.1025,
+      "step": 551
+    },
+    {
+      "epoch": 0.3818087497838492,
+      "grad_norm": 0.5988295078277588,
+      "learning_rate": 0.0006518572469045885,
+      "loss": 1.5031,
+      "step": 552
+    },
+    {
+      "epoch": 0.3825004323015736,
+      "grad_norm": 0.6957617402076721,
+      "learning_rate": 0.000651128914785142,
+      "loss": 1.8805,
+      "step": 553
+    },
+    {
+      "epoch": 0.38319211481929794,
+      "grad_norm": 0.5128351449966431,
+      "learning_rate": 0.0006504005826656956,
+      "loss": 1.3545,
+      "step": 554
+    },
+    {
+      "epoch": 0.3838837973370223,
+      "grad_norm": 0.6512929201126099,
+      "learning_rate": 0.0006496722505462491,
+      "loss": 1.4282,
+      "step": 555
+    },
+    {
+      "epoch": 0.38457547985474666,
+      "grad_norm": 0.7529391646385193,
+      "learning_rate": 0.0006489439184268026,
+      "loss": 1.6247,
+      "step": 556
+    },
+    {
+      "epoch": 0.38526716237247105,
+      "grad_norm": 0.5604774951934814,
+      "learning_rate": 0.0006482155863073562,
+      "loss": 2.0437,
+      "step": 557
+    },
+    {
+      "epoch": 0.3859588448901954,
+      "grad_norm": 0.9159753322601318,
+      "learning_rate": 0.0006474872541879096,
+      "loss": 1.2743,
+      "step": 558
+    },
+    {
+      "epoch": 0.3866505274079198,
+      "grad_norm": 0.6029691100120544,
+      "learning_rate": 0.0006467589220684632,
+      "loss": 1.709,
+      "step": 559
+    },
+    {
+      "epoch": 0.3873422099256441,
+      "grad_norm": 0.545449435710907,
+      "learning_rate": 0.0006460305899490167,
+      "loss": 1.7701,
+      "step": 560
+    },
+    {
+      "epoch": 0.3880338924433685,
+      "grad_norm": 0.8501741886138916,
+      "learning_rate": 0.0006453022578295704,
+      "loss": 0.98,
+      "step": 561
+    },
+    {
+      "epoch": 0.38872557496109283,
+      "grad_norm": 0.8401899337768555,
+      "learning_rate": 0.0006445739257101239,
+      "loss": 0.8133,
+      "step": 562
+    },
+    {
+      "epoch": 0.3894172574788172,
+      "grad_norm": 0.5995264053344727,
+      "learning_rate": 0.0006438455935906774,
+      "loss": 2.0898,
+      "step": 563
+    },
+    {
+      "epoch": 0.3901089399965416,
+      "grad_norm": 0.5908014178276062,
+      "learning_rate": 0.0006431172614712309,
+      "loss": 1.2815,
+      "step": 564
+    },
+    {
+      "epoch": 0.39080062251426595,
+      "grad_norm": 0.6674137711524963,
+      "learning_rate": 0.0006423889293517844,
+      "loss": 0.9943,
+      "step": 565
+    },
+    {
+      "epoch": 0.39149230503199034,
+      "grad_norm": 0.5658942461013794,
+      "learning_rate": 0.000641660597232338,
+      "loss": 1.8295,
+      "step": 566
+    },
+    {
+      "epoch": 0.3921839875497147,
+      "grad_norm": 0.994026780128479,
+      "learning_rate": 0.0006409322651128915,
+      "loss": 1.3895,
+      "step": 567
+    },
+    {
+      "epoch": 0.39287567006743906,
+      "grad_norm": 0.8961228132247925,
+      "learning_rate": 0.0006402039329934451,
+      "loss": 1.3684,
+      "step": 568
+    },
+    {
+      "epoch": 0.3935673525851634,
+      "grad_norm": 0.7889755368232727,
+      "learning_rate": 0.0006394756008739986,
+      "loss": 0.6388,
+      "step": 569
+    },
+    {
+      "epoch": 0.3942590351028878,
+      "grad_norm": 0.5751606822013855,
+      "learning_rate": 0.0006387472687545521,
+      "loss": 1.4578,
+      "step": 570
+    },
+    {
+      "epoch": 0.3949507176206121,
+      "grad_norm": 0.6397355794906616,
+      "learning_rate": 0.0006380189366351056,
+      "loss": 1.3388,
+      "step": 571
+    },
+    {
+      "epoch": 0.3956424001383365,
+      "grad_norm": 0.5430677533149719,
+      "learning_rate": 0.0006372906045156591,
+      "loss": 1.6792,
+      "step": 572
+    },
+    {
+      "epoch": 0.39633408265606085,
+      "grad_norm": 0.5750475525856018,
+      "learning_rate": 0.0006365622723962127,
+      "loss": 1.7402,
+      "step": 573
+    },
+    {
+      "epoch": 0.39702576517378524,
+      "grad_norm": 0.8769389986991882,
+      "learning_rate": 0.0006358339402767662,
+      "loss": 1.1827,
+      "step": 574
+    },
+    {
+      "epoch": 0.3977174476915096,
+      "grad_norm": 0.6424825191497803,
+      "learning_rate": 0.0006351056081573198,
+      "loss": 1.9263,
+      "step": 575
+    },
+    {
+      "epoch": 0.39840913020923396,
+      "grad_norm": 0.7621489763259888,
+      "learning_rate": 0.0006343772760378733,
+      "loss": 1.0903,
+      "step": 576
+    },
+    {
+      "epoch": 0.39910081272695835,
+      "grad_norm": 1.0157557725906372,
+      "learning_rate": 0.0006336489439184268,
+      "loss": 1.0674,
+      "step": 577
+    },
+    {
+      "epoch": 0.3997924952446827,
+      "grad_norm": 0.9108319878578186,
+      "learning_rate": 0.0006329206117989803,
+      "loss": 1.371,
+      "step": 578
+    },
+    {
+      "epoch": 0.4004841777624071,
+      "grad_norm": 0.9133428931236267,
+      "learning_rate": 0.0006321922796795338,
+      "loss": 1.6273,
+      "step": 579
+    },
+    {
+      "epoch": 0.4011758602801314,
+      "grad_norm": 1.2508081197738647,
+      "learning_rate": 0.0006314639475600874,
+      "loss": 1.3252,
+      "step": 580
+    },
+    {
+      "epoch": 0.4018675427978558,
+      "grad_norm": 0.5828914642333984,
+      "learning_rate": 0.0006307356154406409,
+      "loss": 1.8814,
+      "step": 581
+    },
+    {
+      "epoch": 0.40255922531558014,
+      "grad_norm": 0.6611084938049316,
+      "learning_rate": 0.0006300072833211945,
+      "loss": 1.7483,
+      "step": 582
+    },
+    {
+      "epoch": 0.40325090783330453,
+      "grad_norm": 0.5295059680938721,
+      "learning_rate": 0.000629278951201748,
+      "loss": 1.0284,
+      "step": 583
+    },
+    {
+      "epoch": 0.40394259035102886,
+      "grad_norm": 0.6011462211608887,
+      "learning_rate": 0.0006285506190823014,
+      "loss": 1.4974,
+      "step": 584
+    },
+    {
+      "epoch": 0.40463427286875325,
+      "grad_norm": 0.9036271572113037,
+      "learning_rate": 0.0006278222869628551,
+      "loss": 0.9939,
+      "step": 585
+    },
+    {
+      "epoch": 0.4053259553864776,
+      "grad_norm": 0.7162883877754211,
+      "learning_rate": 0.0006270939548434086,
+      "loss": 1.8186,
+      "step": 586
+    },
+    {
+      "epoch": 0.406017637904202,
+      "grad_norm": 0.8202586770057678,
+      "learning_rate": 0.0006263656227239622,
+      "loss": 1.3202,
+      "step": 587
+    },
+    {
+      "epoch": 0.4067093204219263,
+      "grad_norm": 0.6272046566009521,
+      "learning_rate": 0.0006256372906045157,
+      "loss": 1.3552,
+      "step": 588
+    },
+    {
+      "epoch": 0.4074010029396507,
+      "grad_norm": 0.5360514521598816,
+      "learning_rate": 0.0006249089584850693,
+      "loss": 1.3703,
+      "step": 589
+    },
+    {
+      "epoch": 0.40809268545737504,
+      "grad_norm": 0.5342544913291931,
+      "learning_rate": 0.0006241806263656228,
+      "loss": 1.4543,
+      "step": 590
+    },
+    {
+      "epoch": 0.4087843679750994,
+      "grad_norm": 0.678282618522644,
+      "learning_rate": 0.0006234522942461762,
+      "loss": 1.9462,
+      "step": 591
+    },
+    {
+      "epoch": 0.4094760504928238,
+      "grad_norm": 0.6571401357650757,
+      "learning_rate": 0.0006227239621267298,
+      "loss": 2.0453,
+      "step": 592
+    },
+    {
+      "epoch": 0.41016773301054815,
+      "grad_norm": 0.6967712044715881,
+      "learning_rate": 0.0006219956300072833,
+      "loss": 1.3844,
+      "step": 593
+    },
+    {
+      "epoch": 0.41085941552827254,
+      "grad_norm": 1.1035219430923462,
+      "learning_rate": 0.0006212672978878369,
+      "loss": 1.6901,
+      "step": 594
+    },
+    {
+      "epoch": 0.4115510980459969,
+      "grad_norm": 1.2840924263000488,
+      "learning_rate": 0.0006205389657683904,
+      "loss": 1.8151,
+      "step": 595
+    },
+    {
+      "epoch": 0.41224278056372127,
+      "grad_norm": 0.594129204750061,
+      "learning_rate": 0.000619810633648944,
+      "loss": 1.6399,
+      "step": 596
+    },
+    {
+      "epoch": 0.4129344630814456,
+      "grad_norm": 0.5824661254882812,
+      "learning_rate": 0.0006190823015294975,
+      "loss": 1.7004,
+      "step": 597
+    },
+    {
+      "epoch": 0.41362614559917,
+      "grad_norm": 0.7485136389732361,
+      "learning_rate": 0.0006183539694100509,
+      "loss": 1.3451,
+      "step": 598
+    },
+    {
+      "epoch": 0.4143178281168943,
+      "grad_norm": 0.5770803689956665,
+      "learning_rate": 0.0006176256372906045,
+      "loss": 1.571,
+      "step": 599
+    },
+    {
+      "epoch": 0.4150095106346187,
+      "grad_norm": 0.8474501371383667,
+      "learning_rate": 0.000616897305171158,
+      "loss": 1.6703,
+      "step": 600
+    },
+    {
+      "epoch": 0.41570119315234305,
+      "grad_norm": 0.5007557272911072,
+      "learning_rate": 0.0006161689730517116,
+      "loss": 1.3968,
+      "step": 601
+    },
+    {
+      "epoch": 0.41639287567006744,
+      "grad_norm": 0.598360538482666,
+      "learning_rate": 0.0006154406409322651,
+      "loss": 1.0098,
+      "step": 602
+    },
+    {
+      "epoch": 0.4170845581877918,
+      "grad_norm": 0.49432629346847534,
+      "learning_rate": 0.0006147123088128187,
+      "loss": 1.0843,
+      "step": 603
+    },
+    {
+      "epoch": 0.41777624070551617,
+      "grad_norm": 0.6179078817367554,
+      "learning_rate": 0.0006139839766933721,
+      "loss": 1.781,
+      "step": 604
+    },
+    {
+      "epoch": 0.41846792322324056,
+      "grad_norm": 0.719780445098877,
+      "learning_rate": 0.0006132556445739256,
+      "loss": 1.8742,
+      "step": 605
+    },
+    {
+      "epoch": 0.4191596057409649,
+      "grad_norm": 0.768247127532959,
+      "learning_rate": 0.0006125273124544793,
+      "loss": 1.8538,
+      "step": 606
+    },
+    {
+      "epoch": 0.4198512882586893,
+      "grad_norm": 0.5836595892906189,
+      "learning_rate": 0.0006117989803350328,
+      "loss": 1.2134,
+      "step": 607
+    },
+    {
+      "epoch": 0.4205429707764136,
+      "grad_norm": 0.8769506216049194,
+      "learning_rate": 0.0006110706482155864,
+      "loss": 1.1559,
+      "step": 608
+    },
+    {
+      "epoch": 0.421234653294138,
+      "grad_norm": 0.5141684412956238,
+      "learning_rate": 0.0006103423160961399,
+      "loss": 1.6293,
+      "step": 609
+    },
+    {
+      "epoch": 0.42192633581186234,
+      "grad_norm": 0.6994926333427429,
+      "learning_rate": 0.0006096139839766935,
+      "loss": 1.4453,
+      "step": 610
+    },
+    {
+      "epoch": 0.42261801832958673,
+      "grad_norm": 0.6264182329177856,
+      "learning_rate": 0.0006088856518572469,
+      "loss": 1.9851,
+      "step": 611
+    },
+    {
+      "epoch": 0.42330970084731107,
+      "grad_norm": 0.6712834239006042,
+      "learning_rate": 0.0006081573197378004,
+      "loss": 1.5792,
+      "step": 612
+    },
+    {
+      "epoch": 0.42400138336503546,
+      "grad_norm": 0.6829782724380493,
+      "learning_rate": 0.000607428987618354,
+      "loss": 1.2137,
+      "step": 613
+    },
+    {
+      "epoch": 0.4246930658827598,
+      "grad_norm": 0.6076679825782776,
+      "learning_rate": 0.0006067006554989075,
+      "loss": 1.7787,
+      "step": 614
+    },
+    {
+      "epoch": 0.4253847484004842,
+      "grad_norm": 0.6156198382377625,
+      "learning_rate": 0.0006059723233794611,
+      "loss": 1.7189,
+      "step": 615
+    },
+    {
+      "epoch": 0.4260764309182085,
+      "grad_norm": 0.7797242403030396,
+      "learning_rate": 0.0006052439912600146,
+      "loss": 1.58,
+      "step": 616
+    },
+    {
+      "epoch": 0.4267681134359329,
+      "grad_norm": 0.6205226182937622,
+      "learning_rate": 0.0006045156591405682,
+      "loss": 1.0808,
+      "step": 617
+    },
+    {
+      "epoch": 0.4274597959536573,
+      "grad_norm": 0.7405165433883667,
+      "learning_rate": 0.0006037873270211216,
+      "loss": 1.7764,
+      "step": 618
+    },
+    {
+      "epoch": 0.42815147847138163,
+      "grad_norm": 0.5714502334594727,
+      "learning_rate": 0.0006030589949016751,
+      "loss": 0.8059,
+      "step": 619
+    },
+    {
+      "epoch": 0.428843160989106,
+      "grad_norm": 0.6483557224273682,
+      "learning_rate": 0.0006023306627822287,
+      "loss": 1.7951,
+      "step": 620
+    },
+    {
+      "epoch": 0.42953484350683035,
+      "grad_norm": 0.8051972389221191,
+      "learning_rate": 0.0006016023306627822,
+      "loss": 1.4164,
+      "step": 621
+    },
+    {
+      "epoch": 0.43022652602455475,
+      "grad_norm": 0.6258521676063538,
+      "learning_rate": 0.0006008739985433358,
+      "loss": 1.9752,
+      "step": 622
+    },
+    {
+      "epoch": 0.4309182085422791,
+      "grad_norm": 0.7436981201171875,
+      "learning_rate": 0.0006001456664238893,
+      "loss": 1.6743,
+      "step": 623
+    },
+    {
+      "epoch": 0.43160989106000347,
+      "grad_norm": 0.43516382575035095,
+      "learning_rate": 0.0005994173343044428,
+      "loss": 1.6526,
+      "step": 624
+    },
+    {
+      "epoch": 0.4323015735777278,
+      "grad_norm": 0.8342098593711853,
+      "learning_rate": 0.0005986890021849963,
+      "loss": 1.612,
+      "step": 625
+    },
+    {
+      "epoch": 0.4329932560954522,
+      "grad_norm": 1.1247819662094116,
+      "learning_rate": 0.0005979606700655498,
+      "loss": 1.736,
+      "step": 626
+    },
+    {
+      "epoch": 0.43368493861317653,
+      "grad_norm": 0.6731348633766174,
+      "learning_rate": 0.0005972323379461035,
+      "loss": 1.4284,
+      "step": 627
+    },
+    {
+      "epoch": 0.4343766211309009,
+      "grad_norm": 0.8929141163825989,
+      "learning_rate": 0.000596504005826657,
+      "loss": 1.5504,
+      "step": 628
+    },
+    {
+      "epoch": 0.43506830364862525,
+      "grad_norm": 0.5387009382247925,
+      "learning_rate": 0.0005957756737072106,
+      "loss": 1.4214,
+      "step": 629
+    },
+    {
+      "epoch": 0.43575998616634964,
+      "grad_norm": 0.6756249666213989,
+      "learning_rate": 0.0005950473415877641,
+      "loss": 1.863,
+      "step": 630
+    },
+    {
+      "epoch": 0.43645166868407403,
+      "grad_norm": 0.7305259704589844,
+      "learning_rate": 0.0005943190094683176,
+      "loss": 2.0542,
+      "step": 631
+    },
+    {
+      "epoch": 0.43714335120179837,
+      "grad_norm": 0.6207787394523621,
+      "learning_rate": 0.0005935906773488711,
+      "loss": 1.4846,
+      "step": 632
+    },
+    {
+      "epoch": 0.43783503371952276,
+      "grad_norm": 0.5971491932868958,
+      "learning_rate": 0.0005928623452294246,
+      "loss": 1.5098,
+      "step": 633
+    },
+    {
+      "epoch": 0.4385267162372471,
+      "grad_norm": 0.6698988080024719,
+      "learning_rate": 0.0005921340131099782,
+      "loss": 1.9203,
+      "step": 634
+    },
+    {
+      "epoch": 0.4392183987549715,
+      "grad_norm": 0.6068461537361145,
+      "learning_rate": 0.0005914056809905317,
+      "loss": 1.0954,
+      "step": 635
+    },
+    {
+      "epoch": 0.4399100812726958,
+      "grad_norm": 0.5868760347366333,
+      "learning_rate": 0.0005906773488710853,
+      "loss": 1.3239,
+      "step": 636
+    },
+    {
+      "epoch": 0.4406017637904202,
+      "grad_norm": 0.6060192584991455,
+      "learning_rate": 0.0005899490167516388,
+      "loss": 1.4716,
+      "step": 637
+    },
+    {
+      "epoch": 0.44129344630814454,
+      "grad_norm": 0.7964019775390625,
+      "learning_rate": 0.0005892206846321923,
+      "loss": 1.6277,
+      "step": 638
+    },
+    {
+      "epoch": 0.44198512882586893,
+      "grad_norm": 0.7350580096244812,
+      "learning_rate": 0.0005884923525127458,
+      "loss": 1.7172,
+      "step": 639
+    },
+    {
+      "epoch": 0.44267681134359327,
+      "grad_norm": 0.6525869369506836,
+      "learning_rate": 0.0005877640203932993,
+      "loss": 1.5932,
+      "step": 640
+    },
+    {
+      "epoch": 0.44336849386131766,
+      "grad_norm": 0.5807157754898071,
+      "learning_rate": 0.0005870356882738529,
+      "loss": 1.8092,
+      "step": 641
+    },
+    {
+      "epoch": 0.444060176379042,
+      "grad_norm": 0.5111778378486633,
+      "learning_rate": 0.0005863073561544064,
+      "loss": 1.3868,
+      "step": 642
+    },
+    {
+      "epoch": 0.4447518588967664,
+      "grad_norm": 0.5414280295372009,
+      "learning_rate": 0.00058557902403496,
+      "loss": 1.2737,
+      "step": 643
+    },
+    {
+      "epoch": 0.4454435414144908,
+      "grad_norm": 0.5459370613098145,
+      "learning_rate": 0.0005848506919155134,
+      "loss": 1.1543,
+      "step": 644
+    },
+    {
+      "epoch": 0.4461352239322151,
+      "grad_norm": 0.8171971440315247,
+      "learning_rate": 0.000584122359796067,
+      "loss": 1.0578,
+      "step": 645
+    },
+    {
+      "epoch": 0.4468269064499395,
+      "grad_norm": 0.7383177280426025,
+      "learning_rate": 0.0005833940276766205,
+      "loss": 1.592,
+      "step": 646
+    },
+    {
+      "epoch": 0.44751858896766383,
+      "grad_norm": 1.1485675573349,
+      "learning_rate": 0.000582665695557174,
+      "loss": 0.9909,
+      "step": 647
+    },
+    {
+      "epoch": 0.4482102714853882,
+      "grad_norm": 0.9529873132705688,
+      "learning_rate": 0.0005819373634377276,
+      "loss": 1.3978,
+      "step": 648
+    },
+    {
+      "epoch": 0.44890195400311256,
+      "grad_norm": 0.6915671229362488,
+      "learning_rate": 0.0005812090313182812,
+      "loss": 1.4762,
+      "step": 649
+    },
+    {
+      "epoch": 0.44959363652083695,
+      "grad_norm": 0.5856941938400269,
+      "learning_rate": 0.0005804806991988348,
+      "loss": 1.5474,
+      "step": 650
+    },
+    {
+      "epoch": 0.4502853190385613,
+      "grad_norm": 1.4381829500198364,
+      "learning_rate": 0.0005797523670793882,
+      "loss": 1.3106,
+      "step": 651
+    },
+    {
+      "epoch": 0.4509770015562857,
+      "grad_norm": 0.9599109292030334,
+      "learning_rate": 0.0005790240349599418,
+      "loss": 1.6511,
+      "step": 652
+    },
+    {
+      "epoch": 0.45166868407401,
+      "grad_norm": 0.779615581035614,
+      "learning_rate": 0.0005782957028404953,
+      "loss": 1.3878,
+      "step": 653
+    },
+    {
+      "epoch": 0.4523603665917344,
+      "grad_norm": 0.9594855308532715,
+      "learning_rate": 0.0005775673707210488,
+      "loss": 1.4339,
+      "step": 654
+    },
+    {
+      "epoch": 0.45305204910945873,
+      "grad_norm": 0.7078779935836792,
+      "learning_rate": 0.0005768390386016024,
+      "loss": 1.1043,
+      "step": 655
+    },
+    {
+      "epoch": 0.4537437316271831,
+      "grad_norm": 1.176413893699646,
+      "learning_rate": 0.0005761107064821559,
+      "loss": 1.5211,
+      "step": 656
+    },
+    {
+      "epoch": 0.4544354141449075,
+      "grad_norm": 0.6396954655647278,
+      "learning_rate": 0.0005753823743627095,
+      "loss": 1.8841,
+      "step": 657
+    },
+    {
+      "epoch": 0.45512709666263185,
+      "grad_norm": 0.8579792380332947,
+      "learning_rate": 0.0005746540422432629,
+      "loss": 1.7458,
+      "step": 658
+    },
+    {
+      "epoch": 0.45581877918035624,
+      "grad_norm": 0.6195595264434814,
+      "learning_rate": 0.0005739257101238165,
+      "loss": 1.6583,
+      "step": 659
+    },
+    {
+      "epoch": 0.45651046169808057,
+      "grad_norm": 0.6468372941017151,
+      "learning_rate": 0.00057319737800437,
+      "loss": 1.9984,
+      "step": 660
+    },
+    {
+      "epoch": 0.45720214421580496,
+      "grad_norm": 1.8570871353149414,
+      "learning_rate": 0.0005724690458849235,
+      "loss": 1.3586,
+      "step": 661
+    },
+    {
+      "epoch": 0.4578938267335293,
+      "grad_norm": 7.004371166229248,
+      "learning_rate": 0.0005717407137654771,
+      "loss": 1.9842,
+      "step": 662
+    },
+    {
+      "epoch": 0.4585855092512537,
+      "grad_norm": 0.7576525807380676,
+      "learning_rate": 0.0005710123816460306,
+      "loss": 1.8268,
+      "step": 663
+    },
+    {
+      "epoch": 0.459277191768978,
+      "grad_norm": 1.155989646911621,
+      "learning_rate": 0.0005702840495265841,
+      "loss": 0.7441,
+      "step": 664
+    },
+    {
+      "epoch": 0.4599688742867024,
+      "grad_norm": 0.5623228549957275,
+      "learning_rate": 0.0005695557174071376,
+      "loss": 1.5069,
+      "step": 665
+    },
+    {
+      "epoch": 0.46066055680442675,
+      "grad_norm": 0.8218792676925659,
+      "learning_rate": 0.0005688273852876912,
+      "loss": 1.4099,
+      "step": 666
+    },
+    {
+      "epoch": 0.46135223932215114,
+      "grad_norm": 0.5278311371803284,
+      "learning_rate": 0.0005680990531682447,
+      "loss": 1.6798,
+      "step": 667
+    },
+    {
+      "epoch": 0.46204392183987547,
+      "grad_norm": 0.5412008762359619,
+      "learning_rate": 0.0005673707210487982,
+      "loss": 1.1857,
+      "step": 668
+    },
+    {
+      "epoch": 0.46273560435759986,
+      "grad_norm": 0.5376167893409729,
+      "learning_rate": 0.0005666423889293518,
+      "loss": 1.7932,
+      "step": 669
+    },
+    {
+      "epoch": 0.46342728687532425,
+      "grad_norm": 0.7957973480224609,
+      "learning_rate": 0.0005659140568099053,
+      "loss": 1.0163,
+      "step": 670
+    },
+    {
+      "epoch": 0.4641189693930486,
+      "grad_norm": 0.933850109577179,
+      "learning_rate": 0.0005651857246904589,
+      "loss": 1.2543,
+      "step": 671
+    },
+    {
+      "epoch": 0.464810651910773,
+      "grad_norm": 0.7921749949455261,
+      "learning_rate": 0.0005644573925710124,
+      "loss": 1.3725,
+      "step": 672
+    },
+    {
+      "epoch": 0.4655023344284973,
+      "grad_norm": 0.5913270115852356,
+      "learning_rate": 0.000563729060451566,
+      "loss": 1.7408,
+      "step": 673
+    },
+    {
+      "epoch": 0.4661940169462217,
+      "grad_norm": 0.5727415680885315,
+      "learning_rate": 0.0005630007283321195,
+      "loss": 1.0868,
+      "step": 674
+    },
+    {
+      "epoch": 0.46688569946394604,
+      "grad_norm": 0.6014571785926819,
+      "learning_rate": 0.000562272396212673,
+      "loss": 1.2138,
+      "step": 675
+    },
+    {
+      "epoch": 0.4675773819816704,
+      "grad_norm": 0.6142331957817078,
+      "learning_rate": 0.0005615440640932266,
+      "loss": 1.348,
+      "step": 676
+    },
+    {
+      "epoch": 0.46826906449939476,
+      "grad_norm": 0.6304256319999695,
+      "learning_rate": 0.0005608157319737801,
+      "loss": 2.1715,
+      "step": 677
+    },
+    {
+      "epoch": 0.46896074701711915,
+      "grad_norm": 0.6730361580848694,
+      "learning_rate": 0.0005600873998543336,
+      "loss": 1.7929,
+      "step": 678
+    },
+    {
+      "epoch": 0.4696524295348435,
+      "grad_norm": 1.0285260677337646,
+      "learning_rate": 0.0005593590677348871,
+      "loss": 1.0785,
+      "step": 679
+    },
+    {
+      "epoch": 0.4703441120525679,
+      "grad_norm": 0.6518314480781555,
+      "learning_rate": 0.0005586307356154407,
+      "loss": 1.9217,
+      "step": 680
+    },
+    {
+      "epoch": 0.4710357945702922,
+      "grad_norm": 0.813822329044342,
+      "learning_rate": 0.0005579024034959942,
+      "loss": 1.4779,
+      "step": 681
+    },
+    {
+      "epoch": 0.4717274770880166,
+      "grad_norm": 0.87235426902771,
+      "learning_rate": 0.0005571740713765477,
+      "loss": 1.4881,
+      "step": 682
+    },
+    {
+      "epoch": 0.472419159605741,
+      "grad_norm": 0.44555163383483887,
+      "learning_rate": 0.0005564457392571013,
+      "loss": 1.345,
+      "step": 683
+    },
+    {
+      "epoch": 0.4731108421234653,
+      "grad_norm": 0.6434228420257568,
+      "learning_rate": 0.0005557174071376547,
+      "loss": 1.7488,
+      "step": 684
+    },
+    {
+      "epoch": 0.4738025246411897,
+      "grad_norm": 1.8087595701217651,
+      "learning_rate": 0.0005549890750182083,
+      "loss": 1.4157,
+      "step": 685
+    },
+    {
+      "epoch": 0.47449420715891405,
+      "grad_norm": 0.6187067031860352,
+      "learning_rate": 0.0005542607428987618,
+      "loss": 1.9694,
+      "step": 686
+    },
+    {
+      "epoch": 0.47518588967663844,
+      "grad_norm": 0.5754404664039612,
+      "learning_rate": 0.0005535324107793154,
+      "loss": 1.6113,
+      "step": 687
+    },
+    {
+      "epoch": 0.4758775721943628,
+      "grad_norm": 1.1567997932434082,
+      "learning_rate": 0.0005528040786598689,
+      "loss": 1.492,
+      "step": 688
+    },
+    {
+      "epoch": 0.47656925471208716,
+      "grad_norm": 7.947727680206299,
+      "learning_rate": 0.0005520757465404224,
+      "loss": 1.1358,
+      "step": 689
+    },
+    {
+      "epoch": 0.4772609372298115,
+      "grad_norm": 0.7208907008171082,
+      "learning_rate": 0.000551347414420976,
+      "loss": 1.4342,
+      "step": 690
+    },
+    {
+      "epoch": 0.4779526197475359,
+      "grad_norm": 0.594211995601654,
+      "learning_rate": 0.0005506190823015294,
+      "loss": 2.0316,
+      "step": 691
+    },
+    {
+      "epoch": 0.4786443022652602,
+      "grad_norm": 0.65560382604599,
+      "learning_rate": 0.000549890750182083,
+      "loss": 1.1252,
+      "step": 692
+    },
+    {
+      "epoch": 0.4793359847829846,
+      "grad_norm": 0.6759006381034851,
+      "learning_rate": 0.0005491624180626365,
+      "loss": 1.8818,
+      "step": 693
+    },
+    {
+      "epoch": 0.48002766730070895,
+      "grad_norm": 0.5605379343032837,
+      "learning_rate": 0.0005484340859431902,
+      "loss": 1.8104,
+      "step": 694
+    },
+    {
+      "epoch": 0.48071934981843334,
+      "grad_norm": 1.8264721632003784,
+      "learning_rate": 0.0005477057538237437,
+      "loss": 1.032,
+      "step": 695
+    },
+    {
+      "epoch": 0.48141103233615773,
+      "grad_norm": 0.6420240998268127,
+      "learning_rate": 0.0005469774217042972,
+      "loss": 0.9941,
+      "step": 696
+    },
+    {
+      "epoch": 0.48210271485388206,
+      "grad_norm": 0.8541857600212097,
+      "learning_rate": 0.0005462490895848508,
+      "loss": 1.9238,
+      "step": 697
+    },
+    {
+      "epoch": 0.48279439737160645,
+      "grad_norm": 0.5706299543380737,
+      "learning_rate": 0.0005455207574654042,
+      "loss": 0.894,
+      "step": 698
+    },
+    {
+      "epoch": 0.4834860798893308,
+      "grad_norm": 0.7758136987686157,
+      "learning_rate": 0.0005447924253459578,
+      "loss": 1.5292,
+      "step": 699
+    },
+    {
+      "epoch": 0.4841777624070552,
+      "grad_norm": 0.5781087875366211,
+      "learning_rate": 0.0005440640932265113,
+      "loss": 0.8735,
+      "step": 700
+    },
+    {
+      "epoch": 0.4848694449247795,
+      "grad_norm": 0.5916205048561096,
+      "learning_rate": 0.0005433357611070649,
+      "loss": 1.329,
+      "step": 701
+    },
+    {
+      "epoch": 0.4855611274425039,
+      "grad_norm": 1.0263584852218628,
+      "learning_rate": 0.0005426074289876184,
+      "loss": 1.7103,
+      "step": 702
+    },
+    {
+      "epoch": 0.48625280996022824,
+      "grad_norm": 0.6922283172607422,
+      "learning_rate": 0.0005418790968681719,
+      "loss": 0.9118,
+      "step": 703
+    },
+    {
+      "epoch": 0.48694449247795263,
+      "grad_norm": 1.4904029369354248,
+      "learning_rate": 0.0005411507647487254,
+      "loss": 1.3195,
+      "step": 704
+    },
+    {
+      "epoch": 0.48763617499567696,
+      "grad_norm": 0.7213814854621887,
+      "learning_rate": 0.0005404224326292789,
+      "loss": 2.0028,
+      "step": 705
+    },
+    {
+      "epoch": 0.48832785751340135,
+      "grad_norm": 0.6512391567230225,
+      "learning_rate": 0.0005396941005098325,
+      "loss": 1.8618,
+      "step": 706
+    },
+    {
+      "epoch": 0.4890195400311257,
+      "grad_norm": 0.8064336180686951,
+      "learning_rate": 0.000538965768390386,
+      "loss": 1.3046,
+      "step": 707
+    },
+    {
+      "epoch": 0.4897112225488501,
+      "grad_norm": 0.7077018022537231,
+      "learning_rate": 0.0005382374362709396,
+      "loss": 1.9682,
+      "step": 708
+    },
+    {
+      "epoch": 0.49040290506657447,
+      "grad_norm": 0.8143154382705688,
+      "learning_rate": 0.0005375091041514931,
+      "loss": 1.5645,
+      "step": 709
+    },
+    {
+      "epoch": 0.4910945875842988,
+      "grad_norm": 0.7311553359031677,
+      "learning_rate": 0.0005367807720320466,
+      "loss": 1.6889,
+      "step": 710
+    },
+    {
+      "epoch": 0.4917862701020232,
+      "grad_norm": 0.5971880555152893,
+      "learning_rate": 0.0005360524399126001,
+      "loss": 1.4776,
+      "step": 711
+    },
+    {
+      "epoch": 0.4924779526197475,
+      "grad_norm": 0.6928828954696655,
+      "learning_rate": 0.0005353241077931536,
+      "loss": 2.232,
+      "step": 712
+    },
+    {
+      "epoch": 0.4931696351374719,
+      "grad_norm": 0.6291587352752686,
+      "learning_rate": 0.0005345957756737072,
+      "loss": 1.3976,
+      "step": 713
+    },
+    {
+      "epoch": 0.49386131765519625,
+      "grad_norm": 0.48338043689727783,
+      "learning_rate": 0.0005338674435542607,
+      "loss": 0.9061,
+      "step": 714
+    },
+    {
+      "epoch": 0.49455300017292064,
+      "grad_norm": 0.6431358456611633,
+      "learning_rate": 0.0005331391114348144,
+      "loss": 1.6624,
+      "step": 715
+    },
+    {
+      "epoch": 0.495244682690645,
+      "grad_norm": 0.6543510556221008,
+      "learning_rate": 0.0005324107793153679,
+      "loss": 1.905,
+      "step": 716
+    },
+    {
+      "epoch": 0.49593636520836937,
+      "grad_norm": 0.5716462731361389,
+      "learning_rate": 0.0005316824471959214,
+      "loss": 1.7145,
+      "step": 717
+    },
+    {
+      "epoch": 0.4966280477260937,
+      "grad_norm": 0.5821312665939331,
+      "learning_rate": 0.0005309541150764749,
+      "loss": 1.8807,
+      "step": 718
+    },
+    {
+      "epoch": 0.4973197302438181,
+      "grad_norm": 0.7142646908760071,
+      "learning_rate": 0.0005302257829570284,
+      "loss": 1.9264,
+      "step": 719
+    },
+    {
+      "epoch": 0.4980114127615424,
+      "grad_norm": 0.5939432382583618,
+      "learning_rate": 0.000529497450837582,
+      "loss": 1.934,
+      "step": 720
+    },
+    {
+      "epoch": 0.4987030952792668,
+      "grad_norm": 1.000845193862915,
+      "learning_rate": 0.0005287691187181355,
+      "loss": 1.4068,
+      "step": 721
+    },
+    {
+      "epoch": 0.4993947777969912,
+      "grad_norm": 0.7502846717834473,
+      "learning_rate": 0.0005280407865986891,
+      "loss": 1.6726,
+      "step": 722
+    },
+    {
+      "epoch": 0.5000864603147156,
+      "grad_norm": 0.7099502682685852,
+      "learning_rate": 0.0005273124544792426,
+      "loss": 0.9208,
+      "step": 723
+    },
+    {
+      "epoch": 0.5007781428324399,
+      "grad_norm": 0.5614446997642517,
+      "learning_rate": 0.000526584122359796,
+      "loss": 1.8832,
+      "step": 724
+    },
+    {
+      "epoch": 0.5014698253501643,
+      "grad_norm": 0.6386409401893616,
+      "learning_rate": 0.0005258557902403496,
+      "loss": 0.9929,
+      "step": 725
+    },
+    {
+      "epoch": 0.5021615078678886,
+      "grad_norm": 0.7122677564620972,
+      "learning_rate": 0.0005251274581209031,
+      "loss": 1.6074,
+      "step": 726
+    },
+    {
+      "epoch": 0.502853190385613,
+      "grad_norm": 0.7774210572242737,
+      "learning_rate": 0.0005243991260014567,
+      "loss": 1.7793,
+      "step": 727
+    },
+    {
+      "epoch": 0.5035448729033374,
+      "grad_norm": 0.6662734150886536,
+      "learning_rate": 0.0005236707938820102,
+      "loss": 1.8851,
+      "step": 728
+    },
+    {
+      "epoch": 0.5042365554210617,
+      "grad_norm": 0.6568670868873596,
+      "learning_rate": 0.0005229424617625638,
+      "loss": 1.369,
+      "step": 729
+    },
+    {
+      "epoch": 0.504928237938786,
+      "grad_norm": 3.8021135330200195,
+      "learning_rate": 0.0005222141296431173,
+      "loss": 1.4424,
+      "step": 730
+    },
+    {
+      "epoch": 0.5056199204565105,
+      "grad_norm": 1.1102453470230103,
+      "learning_rate": 0.0005214857975236707,
+      "loss": 1.5926,
+      "step": 731
+    },
+    {
+      "epoch": 0.5063116029742348,
+      "grad_norm": 0.6229285001754761,
+      "learning_rate": 0.0005207574654042243,
+      "loss": 1.7467,
+      "step": 732
+    },
+    {
+      "epoch": 0.5070032854919592,
+      "grad_norm": 0.6418752074241638,
+      "learning_rate": 0.0005200291332847778,
+      "loss": 1.7413,
+      "step": 733
+    },
+    {
+      "epoch": 0.5076949680096835,
+      "grad_norm": 1.1830925941467285,
+      "learning_rate": 0.0005193008011653314,
+      "loss": 1.6488,
+      "step": 734
+    },
+    {
+      "epoch": 0.508386650527408,
+      "grad_norm": 0.7698209285736084,
+      "learning_rate": 0.0005185724690458849,
+      "loss": 1.5476,
+      "step": 735
+    },
+    {
+      "epoch": 0.5090783330451323,
+      "grad_norm": 0.781249463558197,
+      "learning_rate": 0.0005178441369264386,
+      "loss": 1.8956,
+      "step": 736
+    },
+    {
+      "epoch": 0.5097700155628566,
+      "grad_norm": 0.5504831075668335,
+      "learning_rate": 0.0005171158048069921,
+      "loss": 1.7983,
+      "step": 737
+    },
+    {
+      "epoch": 0.5104616980805811,
+      "grad_norm": 1.5704818964004517,
+      "learning_rate": 0.0005163874726875455,
+      "loss": 2.4774,
+      "step": 738
+    },
+    {
+      "epoch": 0.5111533805983054,
+      "grad_norm": 0.581377387046814,
+      "learning_rate": 0.0005156591405680991,
+      "loss": 1.5272,
+      "step": 739
+    },
+    {
+      "epoch": 0.5118450631160297,
+      "grad_norm": 0.7596077919006348,
+      "learning_rate": 0.0005149308084486526,
+      "loss": 0.9089,
+      "step": 740
+    },
+    {
+      "epoch": 0.5125367456337541,
+      "grad_norm": 0.6792967915534973,
+      "learning_rate": 0.0005142024763292062,
+      "loss": 2.0292,
+      "step": 741
+    },
+    {
+      "epoch": 0.5132284281514785,
+      "grad_norm": 0.6570396423339844,
+      "learning_rate": 0.0005134741442097597,
+      "loss": 1.9329,
+      "step": 742
+    },
+    {
+      "epoch": 0.5139201106692028,
+      "grad_norm": 0.7024231553077698,
+      "learning_rate": 0.0005127458120903133,
+      "loss": 1.6629,
+      "step": 743
+    },
+    {
+      "epoch": 0.5146117931869272,
+      "grad_norm": 0.8019945621490479,
+      "learning_rate": 0.0005120174799708667,
+      "loss": 1.0594,
+      "step": 744
+    },
+    {
+      "epoch": 0.5153034757046515,
+      "grad_norm": 7.380868911743164,
+      "learning_rate": 0.0005112891478514202,
+      "loss": 1.2738,
+      "step": 745
+    },
+    {
+      "epoch": 0.515995158222376,
+      "grad_norm": 0.6265088319778442,
+      "learning_rate": 0.0005105608157319738,
+      "loss": 2.0701,
+      "step": 746
+    },
+    {
+      "epoch": 0.5166868407401003,
+      "grad_norm": 0.5740177631378174,
+      "learning_rate": 0.0005098324836125273,
+      "loss": 1.8416,
+      "step": 747
+    },
+    {
+      "epoch": 0.5173785232578246,
+      "grad_norm": 0.6777141690254211,
+      "learning_rate": 0.0005091041514930809,
+      "loss": 1.333,
+      "step": 748
+    },
+    {
+      "epoch": 0.5180702057755491,
+      "grad_norm": 0.4951770007610321,
+      "learning_rate": 0.0005083758193736344,
+      "loss": 1.7605,
+      "step": 749
+    },
+    {
+      "epoch": 0.5187618882932734,
+      "grad_norm": 0.6544963717460632,
+      "learning_rate": 0.000507647487254188,
+      "loss": 1.5054,
+      "step": 750
+    },
+    {
+      "epoch": 0.5194535708109977,
+      "grad_norm": 68.59622955322266,
+      "learning_rate": 0.0005069191551347414,
+      "loss": 1.911,
+      "step": 751
+    },
+    {
+      "epoch": 0.5201452533287221,
+      "grad_norm": 0.5751796960830688,
+      "learning_rate": 0.0005061908230152949,
+      "loss": 1.3003,
+      "step": 752
+    },
+    {
+      "epoch": 0.5208369358464465,
+      "grad_norm": 0.7600306868553162,
+      "learning_rate": 0.0005054624908958485,
+      "loss": 1.2258,
+      "step": 753
+    },
+    {
+      "epoch": 0.5215286183641709,
+      "grad_norm": 0.9036048650741577,
+      "learning_rate": 0.000504734158776402,
+      "loss": 1.0586,
+      "step": 754
+    },
+    {
+      "epoch": 0.5222203008818952,
+      "grad_norm": 0.6078492403030396,
+      "learning_rate": 0.0005040058266569556,
+      "loss": 1.3874,
+      "step": 755
+    },
+    {
+      "epoch": 0.5229119833996195,
+      "grad_norm": 0.714954137802124,
+      "learning_rate": 0.0005032774945375091,
+      "loss": 1.6183,
+      "step": 756
+    },
+    {
+      "epoch": 0.523603665917344,
+      "grad_norm": 0.5517666935920715,
+      "learning_rate": 0.0005025491624180628,
+      "loss": 1.5844,
+      "step": 757
+    },
+    {
+      "epoch": 0.5242953484350683,
+      "grad_norm": 0.7139641046524048,
+      "learning_rate": 0.0005018208302986161,
+      "loss": 1.724,
+      "step": 758
+    },
+    {
+      "epoch": 0.5249870309527926,
+      "grad_norm": 0.7252593040466309,
+      "learning_rate": 0.0005010924981791696,
+      "loss": 1.3367,
+      "step": 759
+    },
+    {
+      "epoch": 0.525678713470517,
+      "grad_norm": 0.7002785205841064,
+      "learning_rate": 0.0005003641660597233,
+      "loss": 2.0537,
+      "step": 760
+    },
+    {
+      "epoch": 0.5263703959882414,
+      "grad_norm": 0.6444349884986877,
+      "learning_rate": 0.0004996358339402768,
+      "loss": 1.9185,
+      "step": 761
+    },
+    {
+      "epoch": 0.5270620785059658,
+      "grad_norm": 0.6805012822151184,
+      "learning_rate": 0.0004989075018208304,
+      "loss": 1.2298,
+      "step": 762
+    },
+    {
+      "epoch": 0.5277537610236901,
+      "grad_norm": 0.5559502243995667,
+      "learning_rate": 0.0004981791697013839,
+      "loss": 1.1846,
+      "step": 763
+    },
+    {
+      "epoch": 0.5284454435414145,
+      "grad_norm": 0.6215760111808777,
+      "learning_rate": 0.0004974508375819374,
+      "loss": 1.9352,
+      "step": 764
+    },
+    {
+      "epoch": 0.5291371260591389,
+      "grad_norm": 0.8805095553398132,
+      "learning_rate": 0.0004967225054624909,
+      "loss": 1.4648,
+      "step": 765
+    },
+    {
+      "epoch": 0.5298288085768632,
+      "grad_norm": 0.8312969207763672,
+      "learning_rate": 0.0004959941733430444,
+      "loss": 1.5746,
+      "step": 766
+    },
+    {
+      "epoch": 0.5305204910945875,
+      "grad_norm": 0.5943465828895569,
+      "learning_rate": 0.000495265841223598,
+      "loss": 1.6898,
+      "step": 767
+    },
+    {
+      "epoch": 0.531212173612312,
+      "grad_norm": 0.6693414449691772,
+      "learning_rate": 0.0004945375091041515,
+      "loss": 1.3878,
+      "step": 768
+    },
+    {
+      "epoch": 0.5319038561300363,
+      "grad_norm": 1.313461422920227,
+      "learning_rate": 0.000493809176984705,
+      "loss": 1.551,
+      "step": 769
+    },
+    {
+      "epoch": 0.5325955386477607,
+      "grad_norm": 0.6303570866584778,
+      "learning_rate": 0.0004930808448652586,
+      "loss": 1.2923,
+      "step": 770
+    },
+    {
+      "epoch": 0.533287221165485,
+      "grad_norm": 1.8511700630187988,
+      "learning_rate": 0.0004923525127458121,
+      "loss": 1.2441,
+      "step": 771
+    },
+    {
+      "epoch": 0.5339789036832094,
+      "grad_norm": 0.4831252992153168,
+      "learning_rate": 0.0004916241806263656,
+      "loss": 1.5108,
+      "step": 772
+    },
+    {
+      "epoch": 0.5346705862009338,
+      "grad_norm": 0.6066388487815857,
+      "learning_rate": 0.0004908958485069191,
+      "loss": 1.41,
+      "step": 773
+    },
+    {
+      "epoch": 0.5353622687186581,
+      "grad_norm": 0.6585482358932495,
+      "learning_rate": 0.0004901675163874727,
+      "loss": 1.7785,
+      "step": 774
+    },
+    {
+      "epoch": 0.5360539512363826,
+      "grad_norm": 1.0999970436096191,
+      "learning_rate": 0.0004894391842680262,
+      "loss": 1.686,
+      "step": 775
+    },
+    {
+      "epoch": 0.5367456337541069,
+      "grad_norm": 0.5038065314292908,
+      "learning_rate": 0.0004887108521485797,
+      "loss": 1.2632,
+      "step": 776
+    },
+    {
+      "epoch": 0.5374373162718312,
+      "grad_norm": 0.9723607897758484,
+      "learning_rate": 0.0004879825200291333,
+      "loss": 1.5182,
+      "step": 777
+    },
+    {
+      "epoch": 0.5381289987895556,
+      "grad_norm": 1.2071069478988647,
+      "learning_rate": 0.00048725418790968684,
+      "loss": 1.352,
+      "step": 778
+    },
+    {
+      "epoch": 0.53882068130728,
+      "grad_norm": 0.5600361227989197,
+      "learning_rate": 0.00048652585579024034,
+      "loss": 1.0904,
+      "step": 779
+    },
+    {
+      "epoch": 0.5395123638250043,
+      "grad_norm": 0.5586333870887756,
+      "learning_rate": 0.0004857975236707939,
+      "loss": 1.7258,
+      "step": 780
+    },
+    {
+      "epoch": 0.5402040463427287,
+      "grad_norm": 0.47964948415756226,
+      "learning_rate": 0.00048506919155134746,
+      "loss": 1.2326,
+      "step": 781
+    },
+    {
+      "epoch": 0.540895728860453,
+      "grad_norm": 0.6363099813461304,
+      "learning_rate": 0.00048434085943190096,
+      "loss": 1.6935,
+      "step": 782
+    },
+    {
+      "epoch": 0.5415874113781775,
+      "grad_norm": 1.4082186222076416,
+      "learning_rate": 0.00048361252731245446,
+      "loss": 1.8553,
+      "step": 783
+    },
+    {
+      "epoch": 0.5422790938959018,
+      "grad_norm": 0.5664854049682617,
+      "learning_rate": 0.000482884195193008,
+      "loss": 1.7903,
+      "step": 784
+    },
+    {
+      "epoch": 0.5429707764136261,
+      "grad_norm": 0.7249478101730347,
+      "learning_rate": 0.0004821558630735616,
+      "loss": 1.9806,
+      "step": 785
+    },
+    {
+      "epoch": 0.5436624589313505,
+      "grad_norm": 0.7467180490493774,
+      "learning_rate": 0.0004814275309541151,
+      "loss": 1.9561,
+      "step": 786
+    },
+    {
+      "epoch": 0.5443541414490749,
+      "grad_norm": 0.6132490634918213,
+      "learning_rate": 0.00048069919883466863,
+      "loss": 1.4799,
+      "step": 787
+    },
+    {
+      "epoch": 0.5450458239667992,
+      "grad_norm": 0.8408911824226379,
+      "learning_rate": 0.0004799708667152222,
+      "loss": 1.4826,
+      "step": 788
+    },
+    {
+      "epoch": 0.5457375064845236,
+      "grad_norm": 0.867709219455719,
+      "learning_rate": 0.00047924253459577564,
+      "loss": 1.1398,
+      "step": 789
+    },
+    {
+      "epoch": 0.546429189002248,
+      "grad_norm": 1.0073575973510742,
+      "learning_rate": 0.0004785142024763292,
+      "loss": 1.5904,
+      "step": 790
+    },
+    {
+      "epoch": 0.5471208715199724,
+      "grad_norm": 0.7767248153686523,
+      "learning_rate": 0.00047778587035688275,
+      "loss": 1.8212,
+      "step": 791
+    },
+    {
+      "epoch": 0.5478125540376967,
+      "grad_norm": 0.45061439275741577,
+      "learning_rate": 0.0004770575382374363,
+      "loss": 1.3728,
+      "step": 792
+    },
+    {
+      "epoch": 0.548504236555421,
+      "grad_norm": 0.6767532229423523,
+      "learning_rate": 0.0004763292061179898,
+      "loss": 1.2439,
+      "step": 793
+    },
+    {
+      "epoch": 0.5491959190731455,
+      "grad_norm": 0.987127423286438,
+      "learning_rate": 0.00047560087399854337,
+      "loss": 1.1955,
+      "step": 794
+    },
+    {
+      "epoch": 0.5498876015908698,
+      "grad_norm": 0.5513572096824646,
+      "learning_rate": 0.0004748725418790969,
+      "loss": 0.935,
+      "step": 795
+    },
+    {
+      "epoch": 0.5505792841085941,
+      "grad_norm": 0.5820390582084656,
+      "learning_rate": 0.00047414420975965037,
+      "loss": 1.5176,
+      "step": 796
+    },
+    {
+      "epoch": 0.5512709666263185,
+      "grad_norm": 0.5624158382415771,
+      "learning_rate": 0.0004734158776402039,
+      "loss": 1.8107,
+      "step": 797
+    },
+    {
+      "epoch": 0.5519626491440429,
+      "grad_norm": 0.9582436680793762,
+      "learning_rate": 0.0004726875455207575,
+      "loss": 1.4249,
+      "step": 798
+    },
+    {
+      "epoch": 0.5526543316617673,
+      "grad_norm": 0.8588325381278992,
+      "learning_rate": 0.000471959213401311,
+      "loss": 1.6912,
+      "step": 799
+    },
+    {
+      "epoch": 0.5533460141794916,
+      "grad_norm": 0.5387138724327087,
+      "learning_rate": 0.00047123088128186454,
+      "loss": 1.9325,
+      "step": 800
+    },
+    {
+      "epoch": 0.554037696697216,
+      "grad_norm": 0.5876449346542358,
+      "learning_rate": 0.0004705025491624181,
+      "loss": 1.8334,
+      "step": 801
+    },
+    {
+      "epoch": 0.5547293792149404,
+      "grad_norm": 0.6836307048797607,
+      "learning_rate": 0.00046977421704297165,
+      "loss": 1.6645,
+      "step": 802
+    },
+    {
+      "epoch": 0.5554210617326647,
+      "grad_norm": 0.4884951412677765,
+      "learning_rate": 0.0004690458849235251,
+      "loss": 1.8236,
+      "step": 803
+    },
+    {
+      "epoch": 0.556112744250389,
+      "grad_norm": 0.6170971989631653,
+      "learning_rate": 0.00046831755280407866,
+      "loss": 1.5683,
+      "step": 804
+    },
+    {
+      "epoch": 0.5568044267681135,
+      "grad_norm": 0.5362435579299927,
+      "learning_rate": 0.0004675892206846322,
+      "loss": 1.3946,
+      "step": 805
+    },
+    {
+      "epoch": 0.5574961092858378,
+      "grad_norm": 0.590861976146698,
+      "learning_rate": 0.0004668608885651857,
+      "loss": 1.8858,
+      "step": 806
+    },
+    {
+      "epoch": 0.5581877918035621,
+      "grad_norm": 0.8209269046783447,
+      "learning_rate": 0.0004661325564457393,
+      "loss": 1.1731,
+      "step": 807
+    },
+    {
+      "epoch": 0.5588794743212865,
+      "grad_norm": 0.5863669514656067,
+      "learning_rate": 0.00046540422432629283,
+      "loss": 1.3344,
+      "step": 808
+    },
+    {
+      "epoch": 0.5595711568390109,
+      "grad_norm": 0.6586650013923645,
+      "learning_rate": 0.00046467589220684633,
+      "loss": 1.5557,
+      "step": 809
+    },
+    {
+      "epoch": 0.5602628393567353,
+      "grad_norm": 0.5491942763328552,
+      "learning_rate": 0.00046394756008739983,
+      "loss": 1.8428,
+      "step": 810
+    },
+    {
+      "epoch": 0.5609545218744596,
+      "grad_norm": 0.5121621489524841,
+      "learning_rate": 0.0004632192279679534,
+      "loss": 1.2454,
+      "step": 811
+    },
+    {
+      "epoch": 0.5616462043921839,
+      "grad_norm": 0.5642603039741516,
+      "learning_rate": 0.00046249089584850695,
+      "loss": 1.8521,
+      "step": 812
+    },
+    {
+      "epoch": 0.5623378869099084,
+      "grad_norm": 0.8125218749046326,
+      "learning_rate": 0.00046176256372906045,
+      "loss": 1.983,
+      "step": 813
+    },
+    {
+      "epoch": 0.5630295694276327,
+      "grad_norm": 0.5608864426612854,
+      "learning_rate": 0.000461034231609614,
+      "loss": 1.4064,
+      "step": 814
+    },
+    {
+      "epoch": 0.563721251945357,
+      "grad_norm": 0.555400550365448,
+      "learning_rate": 0.00046030589949016756,
+      "loss": 1.3027,
+      "step": 815
+    },
+    {
+      "epoch": 0.5644129344630815,
+      "grad_norm": 0.8211348056793213,
+      "learning_rate": 0.00045957756737072106,
+      "loss": 1.4144,
+      "step": 816
+    },
+    {
+      "epoch": 0.5651046169808058,
+      "grad_norm": 0.6786699891090393,
+      "learning_rate": 0.00045884923525127457,
+      "loss": 1.7804,
+      "step": 817
+    },
+    {
+      "epoch": 0.5657962994985302,
+      "grad_norm": 18.696903228759766,
+      "learning_rate": 0.0004581209031318281,
+      "loss": 1.9006,
+      "step": 818
+    },
+    {
+      "epoch": 0.5664879820162545,
+      "grad_norm": 0.5174362063407898,
+      "learning_rate": 0.0004573925710123816,
+      "loss": 1.4575,
+      "step": 819
+    },
+    {
+      "epoch": 0.5671796645339789,
+      "grad_norm": 0.946522057056427,
+      "learning_rate": 0.0004566642388929352,
+      "loss": 1.6379,
+      "step": 820
+    },
+    {
+      "epoch": 0.5678713470517033,
+      "grad_norm": 0.7363066077232361,
+      "learning_rate": 0.00045593590677348874,
+      "loss": 1.7782,
+      "step": 821
+    },
+    {
+      "epoch": 0.5685630295694276,
+      "grad_norm": 0.6305325031280518,
+      "learning_rate": 0.0004552075746540423,
+      "loss": 1.1396,
+      "step": 822
+    },
+    {
+      "epoch": 0.569254712087152,
+      "grad_norm": 0.7217493057250977,
+      "learning_rate": 0.0004544792425345958,
+      "loss": 1.8529,
+      "step": 823
+    },
+    {
+      "epoch": 0.5699463946048764,
+      "grad_norm": 0.7322853207588196,
+      "learning_rate": 0.0004537509104151493,
+      "loss": 1.9815,
+      "step": 824
+    },
+    {
+      "epoch": 0.5706380771226007,
+      "grad_norm": 0.48964637517929077,
+      "learning_rate": 0.00045302257829570286,
+      "loss": 1.0413,
+      "step": 825
+    },
+    {
+      "epoch": 0.5713297596403251,
+      "grad_norm": 0.7855163216590881,
+      "learning_rate": 0.00045229424617625636,
+      "loss": 1.7931,
+      "step": 826
+    },
+    {
+      "epoch": 0.5720214421580495,
+      "grad_norm": 0.8431006073951721,
+      "learning_rate": 0.0004515659140568099,
+      "loss": 1.5186,
+      "step": 827
+    },
+    {
+      "epoch": 0.5727131246757738,
+      "grad_norm": 0.7614803314208984,
+      "learning_rate": 0.00045083758193736347,
+      "loss": 1.6783,
+      "step": 828
+    },
+    {
+      "epoch": 0.5734048071934982,
+      "grad_norm": 0.670314371585846,
+      "learning_rate": 0.000450109249817917,
+      "loss": 1.5495,
+      "step": 829
+    },
+    {
+      "epoch": 0.5740964897112225,
+      "grad_norm": 0.7063092589378357,
+      "learning_rate": 0.00044938091769847053,
+      "loss": 1.5258,
+      "step": 830
+    },
+    {
+      "epoch": 0.574788172228947,
+      "grad_norm": 0.5529667139053345,
+      "learning_rate": 0.00044865258557902403,
+      "loss": 1.3823,
+      "step": 831
+    },
+    {
+      "epoch": 0.5754798547466713,
+      "grad_norm": 0.7476693391799927,
+      "learning_rate": 0.0004479242534595776,
+      "loss": 1.3917,
+      "step": 832
+    },
+    {
+      "epoch": 0.5761715372643956,
+      "grad_norm": 3.093163013458252,
+      "learning_rate": 0.0004471959213401311,
+      "loss": 1.6397,
+      "step": 833
+    },
+    {
+      "epoch": 0.57686321978212,
+      "grad_norm": 3.8266420364379883,
+      "learning_rate": 0.00044646758922068465,
+      "loss": 1.5871,
+      "step": 834
+    },
+    {
+      "epoch": 0.5775549022998444,
+      "grad_norm": 0.958208441734314,
+      "learning_rate": 0.0004457392571012382,
+      "loss": 1.5749,
+      "step": 835
+    },
+    {
+      "epoch": 0.5782465848175687,
+      "grad_norm": 0.5491811633110046,
+      "learning_rate": 0.0004450109249817917,
+      "loss": 1.6318,
+      "step": 836
+    },
+    {
+      "epoch": 0.5789382673352931,
+      "grad_norm": 1.3938939571380615,
+      "learning_rate": 0.00044428259286234526,
+      "loss": 1.6772,
+      "step": 837
+    },
+    {
+      "epoch": 0.5796299498530174,
+      "grad_norm": 0.9199579954147339,
+      "learning_rate": 0.00044355426074289876,
+      "loss": 1.5163,
+      "step": 838
+    },
+    {
+      "epoch": 0.5803216323707419,
+      "grad_norm": 0.5929269194602966,
+      "learning_rate": 0.00044282592862345227,
+      "loss": 1.6575,
+      "step": 839
+    },
+    {
+      "epoch": 0.5810133148884662,
+      "grad_norm": 0.6414217948913574,
+      "learning_rate": 0.0004420975965040058,
+      "loss": 1.5223,
+      "step": 840
+    },
+    {
+      "epoch": 0.5817049974061905,
+      "grad_norm": 0.48738619685173035,
+      "learning_rate": 0.0004413692643845594,
+      "loss": 1.5968,
+      "step": 841
+    },
+    {
+      "epoch": 0.582396679923915,
+      "grad_norm": 0.56129390001297,
+      "learning_rate": 0.00044064093226511294,
+      "loss": 1.707,
+      "step": 842
+    },
+    {
+      "epoch": 0.5830883624416393,
+      "grad_norm": 0.596315860748291,
+      "learning_rate": 0.00043991260014566644,
+      "loss": 1.8077,
+      "step": 843
+    },
+    {
+      "epoch": 0.5837800449593636,
+      "grad_norm": 0.7291851043701172,
+      "learning_rate": 0.00043918426802622,
+      "loss": 1.3719,
+      "step": 844
+    },
+    {
+      "epoch": 0.584471727477088,
+      "grad_norm": 1.4549719095230103,
+      "learning_rate": 0.0004384559359067735,
+      "loss": 1.402,
+      "step": 845
+    },
+    {
+      "epoch": 0.5851634099948124,
+      "grad_norm": 0.5116413831710815,
+      "learning_rate": 0.000437727603787327,
+      "loss": 1.1901,
+      "step": 846
+    },
+    {
+      "epoch": 0.5858550925125368,
+      "grad_norm": 1.1522141695022583,
+      "learning_rate": 0.00043699927166788056,
+      "loss": 1.6226,
+      "step": 847
+    },
+    {
+      "epoch": 0.5865467750302611,
+      "grad_norm": 0.5651256442070007,
+      "learning_rate": 0.0004362709395484341,
+      "loss": 0.9863,
+      "step": 848
+    },
+    {
+      "epoch": 0.5872384575479854,
+      "grad_norm": 0.9690898656845093,
+      "learning_rate": 0.0004355426074289876,
+      "loss": 1.0906,
+      "step": 849
+    },
+    {
+      "epoch": 0.5879301400657099,
+      "grad_norm": 0.603584349155426,
+      "learning_rate": 0.00043481427530954117,
+      "loss": 1.4331,
+      "step": 850
+    },
+    {
+      "epoch": 0.5886218225834342,
+      "grad_norm": 10.612972259521484,
+      "learning_rate": 0.0004340859431900947,
+      "loss": 1.1645,
+      "step": 851
+    },
+    {
+      "epoch": 0.5893135051011585,
+      "grad_norm": 0.6220032572746277,
+      "learning_rate": 0.00043335761107064823,
+      "loss": 1.7473,
+      "step": 852
+    },
+    {
+      "epoch": 0.590005187618883,
+      "grad_norm": 0.546869158744812,
+      "learning_rate": 0.00043262927895120173,
+      "loss": 1.8727,
+      "step": 853
+    },
+    {
+      "epoch": 0.5906968701366073,
+      "grad_norm": 0.5782111883163452,
+      "learning_rate": 0.0004319009468317553,
+      "loss": 1.5827,
+      "step": 854
+    },
+    {
+      "epoch": 0.5913885526543317,
+      "grad_norm": 0.7139537930488586,
+      "learning_rate": 0.00043117261471230884,
+      "loss": 1.8199,
+      "step": 855
+    },
+    {
+      "epoch": 0.592080235172056,
+      "grad_norm": 0.8115746378898621,
+      "learning_rate": 0.00043044428259286235,
+      "loss": 2.009,
+      "step": 856
+    },
+    {
+      "epoch": 0.5927719176897804,
+      "grad_norm": 0.5989879369735718,
+      "learning_rate": 0.0004297159504734159,
+      "loss": 1.6683,
+      "step": 857
+    },
+    {
+      "epoch": 0.5934636002075048,
+      "grad_norm": 0.6566680669784546,
+      "learning_rate": 0.00042898761835396946,
+      "loss": 1.7271,
+      "step": 858
+    },
+    {
+      "epoch": 0.5941552827252291,
+      "grad_norm": 0.9507800936698914,
+      "learning_rate": 0.0004282592862345229,
+      "loss": 1.0203,
+      "step": 859
+    },
+    {
+      "epoch": 0.5948469652429534,
+      "grad_norm": 0.6159283518791199,
+      "learning_rate": 0.00042753095411507646,
+      "loss": 1.6809,
+      "step": 860
+    },
+    {
+      "epoch": 0.5955386477606779,
+      "grad_norm": 0.6028535962104797,
+      "learning_rate": 0.00042680262199563,
+      "loss": 1.4339,
+      "step": 861
+    },
+    {
+      "epoch": 0.5962303302784022,
+      "grad_norm": 0.6777454614639282,
+      "learning_rate": 0.0004260742898761836,
+      "loss": 1.5344,
+      "step": 862
+    },
+    {
+      "epoch": 0.5969220127961266,
+      "grad_norm": 1.078660488128662,
+      "learning_rate": 0.0004253459577567371,
+      "loss": 1.1091,
+      "step": 863
+    },
+    {
+      "epoch": 0.5976136953138509,
+      "grad_norm": 0.7826130390167236,
+      "learning_rate": 0.00042461762563729064,
+      "loss": 1.8942,
+      "step": 864
+    },
+    {
+      "epoch": 0.5983053778315753,
+      "grad_norm": 0.623664140701294,
+      "learning_rate": 0.0004238892935178442,
+      "loss": 1.862,
+      "step": 865
+    },
+    {
+      "epoch": 0.5989970603492997,
+      "grad_norm": 1.0740686655044556,
+      "learning_rate": 0.00042316096139839764,
+      "loss": 1.5176,
+      "step": 866
+    },
+    {
+      "epoch": 0.599688742867024,
+      "grad_norm": 0.5756235122680664,
+      "learning_rate": 0.0004224326292789512,
+      "loss": 1.8423,
+      "step": 867
+    },
+    {
+      "epoch": 0.6003804253847485,
+      "grad_norm": 0.7312625646591187,
+      "learning_rate": 0.00042170429715950475,
+      "loss": 1.404,
+      "step": 868
+    },
+    {
+      "epoch": 0.6010721079024728,
+      "grad_norm": 0.5717254281044006,
+      "learning_rate": 0.00042097596504005826,
+      "loss": 1.1473,
+      "step": 869
+    },
+    {
+      "epoch": 0.6017637904201971,
+      "grad_norm": 0.6751309633255005,
+      "learning_rate": 0.0004202476329206118,
+      "loss": 1.982,
+      "step": 870
+    },
+    {
+      "epoch": 0.6024554729379215,
+      "grad_norm": 0.6197341680526733,
+      "learning_rate": 0.00041951930080116537,
+      "loss": 1.7876,
+      "step": 871
+    },
+    {
+      "epoch": 0.6031471554556459,
+      "grad_norm": 0.6944209337234497,
+      "learning_rate": 0.0004187909686817189,
+      "loss": 1.0763,
+      "step": 872
+    },
+    {
+      "epoch": 0.6038388379733702,
+      "grad_norm": 0.7229098677635193,
+      "learning_rate": 0.00041806263656227237,
+      "loss": 1.6516,
+      "step": 873
+    },
+    {
+      "epoch": 0.6045305204910946,
+      "grad_norm": 1.5600318908691406,
+      "learning_rate": 0.00041733430444282593,
+      "loss": 1.4807,
+      "step": 874
+    },
+    {
+      "epoch": 0.6052222030088189,
+      "grad_norm": 0.9896885752677917,
+      "learning_rate": 0.0004166059723233795,
+      "loss": 1.398,
+      "step": 875
+    },
+    {
+      "epoch": 0.6059138855265433,
+      "grad_norm": 0.5078806281089783,
+      "learning_rate": 0.000415877640203933,
+      "loss": 1.3289,
+      "step": 876
+    },
+    {
+      "epoch": 0.6066055680442677,
+      "grad_norm": 0.7672819495201111,
+      "learning_rate": 0.00041514930808448654,
+      "loss": 1.2339,
+      "step": 877
+    },
+    {
+      "epoch": 0.607297250561992,
+      "grad_norm": 0.5641161799430847,
+      "learning_rate": 0.0004144209759650401,
+      "loss": 1.4188,
+      "step": 878
+    },
+    {
+      "epoch": 0.6079889330797165,
+      "grad_norm": 0.5341874361038208,
+      "learning_rate": 0.00041369264384559355,
+      "loss": 1.5544,
+      "step": 879
+    },
+    {
+      "epoch": 0.6086806155974408,
+      "grad_norm": 0.6755079627037048,
+      "learning_rate": 0.0004129643117261471,
+      "loss": 1.8125,
+      "step": 880
+    },
+    {
+      "epoch": 0.6093722981151651,
+      "grad_norm": 0.6716341972351074,
+      "learning_rate": 0.00041223597960670066,
+      "loss": 1.469,
+      "step": 881
+    },
+    {
+      "epoch": 0.6100639806328895,
+      "grad_norm": 0.8121787905693054,
+      "learning_rate": 0.0004115076474872542,
+      "loss": 1.7901,
+      "step": 882
+    },
+    {
+      "epoch": 0.6107556631506139,
+      "grad_norm": 0.608357846736908,
+      "learning_rate": 0.0004107793153678077,
+      "loss": 1.3145,
+      "step": 883
+    },
+    {
+      "epoch": 0.6114473456683382,
+      "grad_norm": 0.7797583341598511,
+      "learning_rate": 0.0004100509832483613,
+      "loss": 1.0858,
+      "step": 884
+    },
+    {
+      "epoch": 0.6121390281860626,
+      "grad_norm": 0.6277884840965271,
+      "learning_rate": 0.00040932265112891483,
+      "loss": 1.4323,
+      "step": 885
+    },
+    {
+      "epoch": 0.6128307107037869,
+      "grad_norm": 0.7140945196151733,
+      "learning_rate": 0.0004085943190094683,
+      "loss": 1.3862,
+      "step": 886
+    },
+    {
+      "epoch": 0.6135223932215114,
+      "grad_norm": 0.7441515326499939,
+      "learning_rate": 0.00040786598689002184,
+      "loss": 1.5699,
+      "step": 887
+    },
+    {
+      "epoch": 0.6142140757392357,
+      "grad_norm": 0.514007031917572,
+      "learning_rate": 0.0004071376547705754,
+      "loss": 1.4761,
+      "step": 888
+    },
+    {
+      "epoch": 0.61490575825696,
+      "grad_norm": 0.6850712895393372,
+      "learning_rate": 0.0004064093226511289,
+      "loss": 1.1819,
+      "step": 889
+    },
+    {
+      "epoch": 0.6155974407746844,
+      "grad_norm": 0.6241645216941833,
+      "learning_rate": 0.00040568099053168245,
+      "loss": 1.3215,
+      "step": 890
+    },
+    {
+      "epoch": 0.6162891232924088,
+      "grad_norm": 1.1027741432189941,
+      "learning_rate": 0.000404952658412236,
+      "loss": 1.6761,
+      "step": 891
+    },
+    {
+      "epoch": 0.6169808058101331,
+      "grad_norm": 0.6459061503410339,
+      "learning_rate": 0.00040422432629278957,
+      "loss": 1.6112,
+      "step": 892
+    },
+    {
+      "epoch": 0.6176724883278575,
+      "grad_norm": 0.5237783789634705,
+      "learning_rate": 0.000403495994173343,
+      "loss": 0.878,
+      "step": 893
+    },
+    {
+      "epoch": 0.6183641708455819,
+      "grad_norm": 0.6251216530799866,
+      "learning_rate": 0.00040276766205389657,
+      "loss": 1.7803,
+      "step": 894
+    },
+    {
+      "epoch": 0.6190558533633063,
+      "grad_norm": 1.613736629486084,
+      "learning_rate": 0.0004020393299344501,
+      "loss": 1.3126,
+      "step": 895
+    },
+    {
+      "epoch": 0.6197475358810306,
+      "grad_norm": 0.7189272046089172,
+      "learning_rate": 0.00040131099781500363,
+      "loss": 1.3213,
+      "step": 896
+    },
+    {
+      "epoch": 0.6204392183987549,
+      "grad_norm": 0.6003819704055786,
+      "learning_rate": 0.0004005826656955572,
+      "loss": 2.0014,
+      "step": 897
+    },
+    {
+      "epoch": 0.6211309009164794,
+      "grad_norm": 0.6268942356109619,
+      "learning_rate": 0.00039985433357611074,
+      "loss": 0.9653,
+      "step": 898
+    },
+    {
+      "epoch": 0.6218225834342037,
+      "grad_norm": 0.5853712558746338,
+      "learning_rate": 0.00039912600145666424,
+      "loss": 1.9615,
+      "step": 899
+    },
+    {
+      "epoch": 0.622514265951928,
+      "grad_norm": 0.6711516380310059,
+      "learning_rate": 0.00039839766933721775,
+      "loss": 1.4365,
+      "step": 900
+    },
+    {
+      "epoch": 0.6232059484696524,
+      "grad_norm": 0.7002463936805725,
+      "learning_rate": 0.0003976693372177713,
+      "loss": 1.3427,
+      "step": 901
+    },
+    {
+      "epoch": 0.6238976309873768,
+      "grad_norm": 0.6239266991615295,
+      "learning_rate": 0.00039694100509832486,
+      "loss": 1.874,
+      "step": 902
+    },
+    {
+      "epoch": 0.6245893135051012,
+      "grad_norm": 0.7198671698570251,
+      "learning_rate": 0.00039621267297887836,
+      "loss": 1.6782,
+      "step": 903
+    },
+    {
+      "epoch": 0.6252809960228255,
+      "grad_norm": 1.8313370943069458,
+      "learning_rate": 0.0003954843408594319,
+      "loss": 1.7617,
+      "step": 904
+    },
+    {
+      "epoch": 0.6259726785405499,
+      "grad_norm": 0.6570086479187012,
+      "learning_rate": 0.0003947560087399855,
+      "loss": 1.6372,
+      "step": 905
+    },
+    {
+      "epoch": 0.6266643610582743,
+      "grad_norm": 0.5317332148551941,
+      "learning_rate": 0.000394027676620539,
+      "loss": 1.1686,
+      "step": 906
+    },
+    {
+      "epoch": 0.6273560435759986,
+      "grad_norm": 1.1987481117248535,
+      "learning_rate": 0.0003932993445010925,
+      "loss": 1.3278,
+      "step": 907
+    },
+    {
+      "epoch": 0.628047726093723,
+      "grad_norm": 0.5247228741645813,
+      "learning_rate": 0.00039257101238164603,
+      "loss": 1.3715,
+      "step": 908
+    },
+    {
+      "epoch": 0.6287394086114474,
+      "grad_norm": 0.6109928488731384,
+      "learning_rate": 0.00039184268026219954,
+      "loss": 1.7888,
+      "step": 909
+    },
+    {
+      "epoch": 0.6294310911291717,
+      "grad_norm": 0.6539821028709412,
+      "learning_rate": 0.0003911143481427531,
+      "loss": 1.3144,
+      "step": 910
+    },
+    {
+      "epoch": 0.6301227736468961,
+      "grad_norm": 0.870820164680481,
+      "learning_rate": 0.00039038601602330665,
+      "loss": 1.3851,
+      "step": 911
+    },
+    {
+      "epoch": 0.6308144561646204,
+      "grad_norm": 0.8201245069503784,
+      "learning_rate": 0.0003896576839038602,
+      "loss": 0.8613,
+      "step": 912
+    },
+    {
+      "epoch": 0.6315061386823448,
+      "grad_norm": 0.707269549369812,
+      "learning_rate": 0.0003889293517844137,
+      "loss": 1.0491,
+      "step": 913
+    },
+    {
+      "epoch": 0.6321978212000692,
+      "grad_norm": 0.7596359848976135,
+      "learning_rate": 0.0003882010196649672,
+      "loss": 1.5361,
+      "step": 914
+    },
+    {
+      "epoch": 0.6328895037177935,
+      "grad_norm": 0.5276856422424316,
+      "learning_rate": 0.00038747268754552077,
+      "loss": 1.9265,
+      "step": 915
+    },
+    {
+      "epoch": 0.6335811862355178,
+      "grad_norm": 0.6771373152732849,
+      "learning_rate": 0.00038674435542607427,
+      "loss": 1.2081,
+      "step": 916
+    },
+    {
+      "epoch": 0.6342728687532423,
+      "grad_norm": 0.788817822933197,
+      "learning_rate": 0.0003860160233066278,
+      "loss": 0.9206,
+      "step": 917
+    },
+    {
+      "epoch": 0.6349645512709666,
+      "grad_norm": 0.5923412442207336,
+      "learning_rate": 0.0003852876911871814,
+      "loss": 1.4172,
+      "step": 918
+    },
+    {
+      "epoch": 0.635656233788691,
+      "grad_norm": 0.6840768456459045,
+      "learning_rate": 0.0003845593590677349,
+      "loss": 1.7511,
+      "step": 919
+    },
+    {
+      "epoch": 0.6363479163064154,
+      "grad_norm": 0.6866530776023865,
+      "learning_rate": 0.00038383102694828844,
+      "loss": 1.5415,
+      "step": 920
+    },
+    {
+      "epoch": 0.6370395988241397,
+      "grad_norm": 0.852659285068512,
+      "learning_rate": 0.00038310269482884194,
+      "loss": 1.329,
+      "step": 921
+    },
+    {
+      "epoch": 0.6377312813418641,
+      "grad_norm": 0.8291088938713074,
+      "learning_rate": 0.0003823743627093955,
+      "loss": 1.4088,
+      "step": 922
+    },
+    {
+      "epoch": 0.6384229638595884,
+      "grad_norm": 1.7244031429290771,
+      "learning_rate": 0.000381646030589949,
+      "loss": 1.6084,
+      "step": 923
+    },
+    {
+      "epoch": 0.6391146463773129,
+      "grad_norm": 0.54539555311203,
+      "learning_rate": 0.00038091769847050256,
+      "loss": 0.745,
+      "step": 924
+    },
+    {
+      "epoch": 0.6398063288950372,
+      "grad_norm": 0.7444538474082947,
+      "learning_rate": 0.0003801893663510561,
+      "loss": 1.7405,
+      "step": 925
+    },
+    {
+      "epoch": 0.6404980114127615,
+      "grad_norm": 0.8226865530014038,
+      "learning_rate": 0.0003794610342316096,
+      "loss": 1.5868,
+      "step": 926
+    },
+    {
+      "epoch": 0.6411896939304859,
+      "grad_norm": 0.7856529355049133,
+      "learning_rate": 0.0003787327021121632,
+      "loss": 0.9057,
+      "step": 927
+    },
+    {
+      "epoch": 0.6418813764482103,
+      "grad_norm": 0.6824025511741638,
+      "learning_rate": 0.0003780043699927167,
+      "loss": 1.9523,
+      "step": 928
+    },
+    {
+      "epoch": 0.6425730589659346,
+      "grad_norm": 3.9347681999206543,
+      "learning_rate": 0.0003772760378732702,
+      "loss": 1.7031,
+      "step": 929
+    },
+    {
+      "epoch": 0.643264741483659,
+      "grad_norm": 0.7192727327346802,
+      "learning_rate": 0.00037654770575382373,
+      "loss": 1.7645,
+      "step": 930
+    },
+    {
+      "epoch": 0.6439564240013833,
+      "grad_norm": 0.8705196380615234,
+      "learning_rate": 0.0003758193736343773,
+      "loss": 1.0606,
+      "step": 931
+    },
+    {
+      "epoch": 0.6446481065191078,
+      "grad_norm": 0.5852888226509094,
+      "learning_rate": 0.00037509104151493085,
+      "loss": 1.6836,
+      "step": 932
+    },
+    {
+      "epoch": 0.6453397890368321,
+      "grad_norm": 0.5580787658691406,
+      "learning_rate": 0.00037436270939548435,
+      "loss": 1.6459,
+      "step": 933
+    },
+    {
+      "epoch": 0.6460314715545564,
+      "grad_norm": 0.8001941442489624,
+      "learning_rate": 0.0003736343772760379,
+      "loss": 1.0433,
+      "step": 934
+    },
+    {
+      "epoch": 0.6467231540722809,
+      "grad_norm": 0.5320255160331726,
+      "learning_rate": 0.0003729060451565914,
+      "loss": 1.2846,
+      "step": 935
+    },
+    {
+      "epoch": 0.6474148365900052,
+      "grad_norm": 0.6623178124427795,
+      "learning_rate": 0.0003721777130371449,
+      "loss": 1.689,
+      "step": 936
+    },
+    {
+      "epoch": 0.6481065191077295,
+      "grad_norm": 0.5291332602500916,
+      "learning_rate": 0.00037144938091769847,
+      "loss": 1.8375,
+      "step": 937
+    },
+    {
+      "epoch": 0.6487982016254539,
+      "grad_norm": 0.6939443945884705,
+      "learning_rate": 0.000370721048798252,
+      "loss": 1.9226,
+      "step": 938
+    },
+    {
+      "epoch": 0.6494898841431783,
+      "grad_norm": 0.5086541771888733,
+      "learning_rate": 0.0003699927166788055,
+      "loss": 1.4844,
+      "step": 939
+    },
+    {
+      "epoch": 0.6501815666609027,
+      "grad_norm": 0.6537139415740967,
+      "learning_rate": 0.0003692643845593591,
+      "loss": 1.512,
+      "step": 940
+    },
+    {
+      "epoch": 0.650873249178627,
+      "grad_norm": 1.0320565700531006,
+      "learning_rate": 0.00036853605243991264,
+      "loss": 1.4956,
+      "step": 941
+    },
+    {
+      "epoch": 0.6515649316963513,
+      "grad_norm": 0.7342500686645508,
+      "learning_rate": 0.00036780772032046614,
+      "loss": 1.5453,
+      "step": 942
+    },
+    {
+      "epoch": 0.6522566142140758,
+      "grad_norm": 0.8326630592346191,
+      "learning_rate": 0.00036707938820101964,
+      "loss": 1.392,
+      "step": 943
+    },
+    {
+      "epoch": 0.6529482967318001,
+      "grad_norm": 0.6338616013526917,
+      "learning_rate": 0.0003663510560815732,
+      "loss": 1.6057,
+      "step": 944
+    },
+    {
+      "epoch": 0.6536399792495244,
+      "grad_norm": 0.6175053119659424,
+      "learning_rate": 0.00036562272396212676,
+      "loss": 1.7356,
+      "step": 945
+    },
+    {
+      "epoch": 0.6543316617672489,
+      "grad_norm": 0.5108622908592224,
+      "learning_rate": 0.00036489439184268026,
+      "loss": 1.182,
+      "step": 946
+    },
+    {
+      "epoch": 0.6550233442849732,
+      "grad_norm": 0.7233152389526367,
+      "learning_rate": 0.0003641660597232338,
+      "loss": 1.3504,
+      "step": 947
+    },
+    {
+      "epoch": 0.6557150268026976,
+      "grad_norm": 0.6574891209602356,
+      "learning_rate": 0.00036343772760378737,
+      "loss": 1.9128,
+      "step": 948
+    },
+    {
+      "epoch": 0.6564067093204219,
+      "grad_norm": 0.676141083240509,
+      "learning_rate": 0.0003627093954843408,
+      "loss": 1.724,
+      "step": 949
+    },
+    {
+      "epoch": 0.6570983918381463,
+      "grad_norm": 0.8102545738220215,
+      "learning_rate": 0.0003619810633648944,
+      "loss": 1.1221,
+      "step": 950
+    },
+    {
+      "epoch": 0.6577900743558707,
+      "grad_norm": 0.7310335636138916,
+      "learning_rate": 0.00036125273124544793,
+      "loss": 1.8765,
+      "step": 951
+    },
+    {
+      "epoch": 0.658481756873595,
+      "grad_norm": 0.622969388961792,
+      "learning_rate": 0.0003605243991260015,
+      "loss": 0.7725,
+      "step": 952
+    },
+    {
+      "epoch": 0.6591734393913193,
+      "grad_norm": 0.7207367420196533,
+      "learning_rate": 0.000359796067006555,
+      "loss": 1.8594,
+      "step": 953
+    },
+    {
+      "epoch": 0.6598651219090438,
+      "grad_norm": 0.612112820148468,
+      "learning_rate": 0.00035906773488710855,
+      "loss": 1.3199,
+      "step": 954
+    },
+    {
+      "epoch": 0.6605568044267681,
+      "grad_norm": 0.6712756752967834,
+      "learning_rate": 0.0003583394027676621,
+      "loss": 1.8219,
+      "step": 955
+    },
+    {
+      "epoch": 0.6612484869444925,
+      "grad_norm": 0.5637266039848328,
+      "learning_rate": 0.00035761107064821555,
+      "loss": 1.5599,
+      "step": 956
+    },
+    {
+      "epoch": 0.6619401694622168,
+      "grad_norm": 0.714928150177002,
+      "learning_rate": 0.0003568827385287691,
+      "loss": 1.9542,
+      "step": 957
+    },
+    {
+      "epoch": 0.6626318519799412,
+      "grad_norm": 1.0304123163223267,
+      "learning_rate": 0.00035615440640932266,
+      "loss": 1.5718,
+      "step": 958
+    },
+    {
+      "epoch": 0.6633235344976656,
+      "grad_norm": 0.5427642464637756,
+      "learning_rate": 0.00035542607428987617,
+      "loss": 1.361,
+      "step": 959
+    },
+    {
+      "epoch": 0.6640152170153899,
+      "grad_norm": 0.640608012676239,
+      "learning_rate": 0.0003546977421704297,
+      "loss": 1.3447,
+      "step": 960
+    },
+    {
+      "epoch": 0.6647068995331143,
+      "grad_norm": 2.1725761890411377,
+      "learning_rate": 0.0003539694100509833,
+      "loss": 1.9901,
+      "step": 961
+    },
+    {
+      "epoch": 0.6653985820508387,
+      "grad_norm": 1.3823773860931396,
+      "learning_rate": 0.00035324107793153684,
+      "loss": 1.794,
+      "step": 962
+    },
+    {
+      "epoch": 0.666090264568563,
+      "grad_norm": 0.6191059947013855,
+      "learning_rate": 0.0003525127458120903,
+      "loss": 1.1826,
+      "step": 963
+    },
+    {
+      "epoch": 0.6667819470862874,
+      "grad_norm": 0.9410331845283508,
+      "learning_rate": 0.00035178441369264384,
+      "loss": 1.0219,
+      "step": 964
+    },
+    {
+      "epoch": 0.6674736296040118,
+      "grad_norm": 0.7751091718673706,
+      "learning_rate": 0.0003510560815731974,
+      "loss": 1.3428,
+      "step": 965
+    },
+    {
+      "epoch": 0.6681653121217361,
+      "grad_norm": 0.6246415376663208,
+      "learning_rate": 0.0003503277494537509,
+      "loss": 1.7892,
+      "step": 966
+    },
+    {
+      "epoch": 0.6688569946394605,
+      "grad_norm": 0.47676700353622437,
+      "learning_rate": 0.00034959941733430446,
+      "loss": 1.0902,
+      "step": 967
+    },
+    {
+      "epoch": 0.6695486771571848,
+      "grad_norm": 0.6154366731643677,
+      "learning_rate": 0.000348871085214858,
+      "loss": 1.1601,
+      "step": 968
+    },
+    {
+      "epoch": 0.6702403596749092,
+      "grad_norm": 0.6108272671699524,
+      "learning_rate": 0.0003481427530954115,
+      "loss": 1.9166,
+      "step": 969
+    },
+    {
+      "epoch": 0.6709320421926336,
+      "grad_norm": 0.6346696019172668,
+      "learning_rate": 0.000347414420975965,
+      "loss": 1.8435,
+      "step": 970
+    },
+    {
+      "epoch": 0.6716237247103579,
+      "grad_norm": 0.675031304359436,
+      "learning_rate": 0.00034668608885651857,
+      "loss": 1.5768,
+      "step": 971
+    },
+    {
+      "epoch": 0.6723154072280824,
+      "grad_norm": 0.7120993137359619,
+      "learning_rate": 0.00034595775673707213,
+      "loss": 1.5565,
+      "step": 972
+    },
+    {
+      "epoch": 0.6730070897458067,
+      "grad_norm": 0.643236517906189,
+      "learning_rate": 0.00034522942461762563,
+      "loss": 1.7646,
+      "step": 973
+    },
+    {
+      "epoch": 0.673698772263531,
+      "grad_norm": 0.6599898934364319,
+      "learning_rate": 0.0003445010924981792,
+      "loss": 1.704,
+      "step": 974
+    },
+    {
+      "epoch": 0.6743904547812554,
+      "grad_norm": 0.7331796884536743,
+      "learning_rate": 0.00034377276037873274,
+      "loss": 1.3173,
+      "step": 975
+    },
+    {
+      "epoch": 0.6750821372989798,
+      "grad_norm": 0.8388747572898865,
+      "learning_rate": 0.00034304442825928625,
+      "loss": 1.4021,
+      "step": 976
+    },
+    {
+      "epoch": 0.6757738198167041,
+      "grad_norm": 0.820971667766571,
+      "learning_rate": 0.00034231609613983975,
+      "loss": 2.0084,
+      "step": 977
+    },
+    {
+      "epoch": 0.6764655023344285,
+      "grad_norm": 0.64729243516922,
+      "learning_rate": 0.0003415877640203933,
+      "loss": 1.848,
+      "step": 978
+    },
+    {
+      "epoch": 0.6771571848521528,
+      "grad_norm": 18.543529510498047,
+      "learning_rate": 0.0003408594319009468,
+      "loss": 1.104,
+      "step": 979
+    },
+    {
+      "epoch": 0.6778488673698773,
+      "grad_norm": 0.7021201848983765,
+      "learning_rate": 0.00034013109978150036,
+      "loss": 1.2265,
+      "step": 980
+    },
+    {
+      "epoch": 0.6785405498876016,
+      "grad_norm": 0.9745551943778992,
+      "learning_rate": 0.0003394027676620539,
+      "loss": 1.4934,
+      "step": 981
+    },
+    {
+      "epoch": 0.6792322324053259,
+      "grad_norm": 0.6969733834266663,
+      "learning_rate": 0.0003386744355426075,
+      "loss": 1.1012,
+      "step": 982
+    },
+    {
+      "epoch": 0.6799239149230503,
+      "grad_norm": 0.9625135064125061,
+      "learning_rate": 0.000337946103423161,
+      "loss": 1.3152,
+      "step": 983
+    },
+    {
+      "epoch": 0.6806155974407747,
+      "grad_norm": 0.5473238825798035,
+      "learning_rate": 0.0003372177713037145,
+      "loss": 0.8124,
+      "step": 984
+    },
+    {
+      "epoch": 0.681307279958499,
+      "grad_norm": 0.5325528383255005,
+      "learning_rate": 0.00033648943918426804,
+      "loss": 0.9812,
+      "step": 985
+    },
+    {
+      "epoch": 0.6819989624762234,
+      "grad_norm": 0.745832622051239,
+      "learning_rate": 0.00033576110706482154,
+      "loss": 1.9812,
+      "step": 986
+    },
+    {
+      "epoch": 0.6826906449939478,
+      "grad_norm": 0.5990468263626099,
+      "learning_rate": 0.0003350327749453751,
+      "loss": 1.753,
+      "step": 987
+    },
+    {
+      "epoch": 0.6833823275116722,
+      "grad_norm": 1.1015442609786987,
+      "learning_rate": 0.00033430444282592865,
+      "loss": 1.3853,
+      "step": 988
+    },
+    {
+      "epoch": 0.6840740100293965,
+      "grad_norm": 0.5846887230873108,
+      "learning_rate": 0.00033357611070648215,
+      "loss": 1.1549,
+      "step": 989
+    },
+    {
+      "epoch": 0.6847656925471208,
+      "grad_norm": 0.6844741702079773,
+      "learning_rate": 0.0003328477785870357,
+      "loss": 1.6326,
+      "step": 990
+    },
+    {
+      "epoch": 0.6854573750648453,
+      "grad_norm": 0.6528374552726746,
+      "learning_rate": 0.0003321194464675892,
+      "loss": 1.5923,
+      "step": 991
+    },
+    {
+      "epoch": 0.6861490575825696,
+      "grad_norm": 7.470677375793457,
+      "learning_rate": 0.00033139111434814277,
+      "loss": 1.3873,
+      "step": 992
+    },
+    {
+      "epoch": 0.686840740100294,
+      "grad_norm": 0.6172521710395813,
+      "learning_rate": 0.00033066278222869627,
+      "loss": 1.6455,
+      "step": 993
+    },
+    {
+      "epoch": 0.6875324226180183,
+      "grad_norm": 0.6366342306137085,
+      "learning_rate": 0.00032993445010924983,
+      "loss": 1.2664,
+      "step": 994
+    },
+    {
+      "epoch": 0.6882241051357427,
+      "grad_norm": 0.6257708668708801,
+      "learning_rate": 0.0003292061179898034,
+      "loss": 1.8094,
+      "step": 995
+    },
+    {
+      "epoch": 0.6889157876534671,
+      "grad_norm": 0.5113406181335449,
+      "learning_rate": 0.0003284777858703569,
+      "loss": 0.8943,
+      "step": 996
+    },
+    {
+      "epoch": 0.6896074701711914,
+      "grad_norm": 0.8148225545883179,
+      "learning_rate": 0.00032774945375091044,
+      "loss": 1.3974,
+      "step": 997
+    },
+    {
+      "epoch": 0.6902991526889158,
+      "grad_norm": 0.7986158728599548,
+      "learning_rate": 0.00032702112163146395,
+      "loss": 0.6925,
+      "step": 998
+    },
+    {
+      "epoch": 0.6909908352066402,
+      "grad_norm": 0.983278751373291,
+      "learning_rate": 0.00032629278951201745,
+      "loss": 1.0282,
+      "step": 999
+    },
+    {
+      "epoch": 0.6916825177243645,
+      "grad_norm": 0.5543628931045532,
+      "learning_rate": 0.000325564457392571,
+      "loss": 1.8791,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6916825177243645,
+      "eval_loss": 1.3676680326461792,
+      "eval_runtime": 586.3584,
+      "eval_samples_per_second": 2.191,
+      "eval_steps_per_second": 1.097,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1446,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.178081149272064e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}