diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,15783 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.8853552192869967,
+  "eval_steps": 500,
+  "global_step": 22500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.003968253968253968,
+      "grad_norm": 2.719743251800537,
+      "learning_rate": 0.0019973544973544977,
+      "loss": 2.0582,
+      "step": 10
+    },
+    {
+      "epoch": 0.007936507936507936,
+      "grad_norm": 3.9530892372131348,
+      "learning_rate": 0.001994708994708995,
+      "loss": 1.5106,
+      "step": 20
+    },
+    {
+      "epoch": 0.011904761904761904,
+      "grad_norm": 2.873342514038086,
+      "learning_rate": 0.001992063492063492,
+      "loss": 1.3441,
+      "step": 30
+    },
+    {
+      "epoch": 0.015873015873015872,
+      "grad_norm": 2.5402843952178955,
+      "learning_rate": 0.001989417989417989,
+      "loss": 1.3346,
+      "step": 40
+    },
+    {
+      "epoch": 0.01984126984126984,
+      "grad_norm": 8.011046409606934,
+      "learning_rate": 0.001986772486772487,
+      "loss": 1.4858,
+      "step": 50
+    },
+    {
+      "epoch": 0.023809523809523808,
+      "grad_norm": 1.8670936822891235,
+      "learning_rate": 0.001984126984126984,
+      "loss": 1.3795,
+      "step": 60
+    },
+    {
+      "epoch": 0.027777777777777776,
+      "grad_norm": 4.0268025398254395,
+      "learning_rate": 0.0019814814814814816,
+      "loss": 1.2093,
+      "step": 70
+    },
+    {
+      "epoch": 0.031746031746031744,
+      "grad_norm": 2.250659227371216,
+      "learning_rate": 0.001978835978835979,
+      "loss": 1.4175,
+      "step": 80
+    },
+    {
+      "epoch": 0.03571428571428571,
+      "grad_norm": 2.6111180782318115,
+      "learning_rate": 0.0019761904761904764,
+      "loss": 1.2632,
+      "step": 90
+    },
+    {
+      "epoch": 0.03968253968253968,
+      "grad_norm": 2.8487629890441895,
+      "learning_rate": 0.0019735449735449736,
+      "loss": 1.3824,
+      "step": 100
+    },
+    {
+      "epoch": 0.04365079365079365,
+      "grad_norm": 2.2919723987579346,
+      "learning_rate": 0.001970899470899471,
+      "loss": 1.1783,
+      "step": 110
+    },
+    {
+      "epoch": 0.047619047619047616,
+      "grad_norm": 2.1400911808013916,
+      "learning_rate": 0.001968253968253968,
+      "loss": 1.2046,
+      "step": 120
+    },
+    {
+      "epoch": 0.051587301587301584,
+      "grad_norm": 2.4251630306243896,
+      "learning_rate": 0.0019656084656084656,
+      "loss": 0.9373,
+      "step": 130
+    },
+    {
+      "epoch": 0.05555555555555555,
+      "grad_norm": 4.872255325317383,
+      "learning_rate": 0.0019629629629629632,
+      "loss": 1.1487,
+      "step": 140
+    },
+    {
+      "epoch": 0.05952380952380952,
+      "grad_norm": 1.1926871538162231,
+      "learning_rate": 0.0019603174603174604,
+      "loss": 0.9628,
+      "step": 150
+    },
+    {
+      "epoch": 0.06349206349206349,
+      "grad_norm": 1.6297978162765503,
+      "learning_rate": 0.0019576719576719576,
+      "loss": 1.0797,
+      "step": 160
+    },
+    {
+      "epoch": 0.06746031746031746,
+      "grad_norm": 1.9080616235733032,
+      "learning_rate": 0.001955026455026455,
+      "loss": 1.1868,
+      "step": 170
+    },
+    {
+      "epoch": 0.07142857142857142,
+      "grad_norm": 1.5988057851791382,
+      "learning_rate": 0.0019523809523809524,
+      "loss": 1.2171,
+      "step": 180
+    },
+    {
+      "epoch": 0.07539682539682539,
+      "grad_norm": 3.464204788208008,
+      "learning_rate": 0.0019497354497354498,
+      "loss": 1.0786,
+      "step": 190
+    },
+    {
+      "epoch": 0.07936507936507936,
+      "grad_norm": 1.0056853294372559,
+      "learning_rate": 0.001947089947089947,
+      "loss": 1.0321,
+      "step": 200
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 0.7744622230529785,
+      "learning_rate": 0.0019444444444444444,
+      "loss": 1.0652,
+      "step": 210
+    },
+    {
+      "epoch": 0.0873015873015873,
+      "grad_norm": 1.8306702375411987,
+      "learning_rate": 0.0019417989417989418,
+      "loss": 1.2596,
+      "step": 220
+    },
+    {
+      "epoch": 0.09126984126984126,
+      "grad_norm": 4.866456985473633,
+      "learning_rate": 0.0019391534391534392,
+      "loss": 0.9659,
+      "step": 230
+    },
+    {
+      "epoch": 0.09523809523809523,
+      "grad_norm": 2.4852681159973145,
+      "learning_rate": 0.0019365079365079366,
+      "loss": 0.9604,
+      "step": 240
+    },
+    {
+      "epoch": 0.0992063492063492,
+      "grad_norm": 1.818182349205017,
+      "learning_rate": 0.001933862433862434,
+      "loss": 0.84,
+      "step": 250
+    },
+    {
+      "epoch": 0.10317460317460317,
+      "grad_norm": 2.9667937755584717,
+      "learning_rate": 0.0019312169312169312,
+      "loss": 1.0415,
+      "step": 260
+    },
+    {
+      "epoch": 0.10714285714285714,
+      "grad_norm": 2.0339770317077637,
+      "learning_rate": 0.0019285714285714286,
+      "loss": 1.1436,
+      "step": 270
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 1.2213151454925537,
+      "learning_rate": 0.0019259259259259258,
+      "loss": 0.9627,
+      "step": 280
+    },
+    {
+      "epoch": 0.11507936507936507,
+      "grad_norm": 0.9745686054229736,
+      "learning_rate": 0.0019232804232804234,
+      "loss": 1.1314,
+      "step": 290
+    },
+    {
+      "epoch": 0.11904761904761904,
+      "grad_norm": 0.9821905493736267,
+      "learning_rate": 0.0019206349206349208,
+      "loss": 0.8616,
+      "step": 300
+    },
+    {
+      "epoch": 0.12301587301587301,
+      "grad_norm": 1.1417244672775269,
+      "learning_rate": 0.001917989417989418,
+      "loss": 0.8589,
+      "step": 310
+    },
+    {
+      "epoch": 0.12698412698412698,
+      "grad_norm": 1.10502028465271,
+      "learning_rate": 0.0019153439153439154,
+      "loss": 0.8038,
+      "step": 320
+    },
+    {
+      "epoch": 0.13095238095238096,
+      "grad_norm": 3.0337259769439697,
+      "learning_rate": 0.0019126984126984128,
+      "loss": 1.1473,
+      "step": 330
+    },
+    {
+      "epoch": 0.1349206349206349,
+      "grad_norm": 1.5644335746765137,
+      "learning_rate": 0.00191005291005291,
+      "loss": 1.377,
+      "step": 340
+    },
+    {
+      "epoch": 0.1388888888888889,
+      "grad_norm": 1.4690322875976562,
+      "learning_rate": 0.0019074074074074076,
+      "loss": 0.9918,
+      "step": 350
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 1.3624849319458008,
+      "learning_rate": 0.0019047619047619048,
+      "loss": 1.0086,
+      "step": 360
+    },
+    {
+      "epoch": 0.14682539682539683,
+      "grad_norm": 1.9925272464752197,
+      "learning_rate": 0.0019021164021164022,
+      "loss": 0.8111,
+      "step": 370
+    },
+    {
+      "epoch": 0.15079365079365079,
+      "grad_norm": 1.8325337171554565,
+      "learning_rate": 0.0018994708994708996,
+      "loss": 0.9993,
+      "step": 380
+    },
+    {
+      "epoch": 0.15476190476190477,
+      "grad_norm": 1.2556744813919067,
+      "learning_rate": 0.0018968253968253967,
+      "loss": 0.9214,
+      "step": 390
+    },
+    {
+      "epoch": 0.15873015873015872,
+      "grad_norm": 2.262421131134033,
+      "learning_rate": 0.0018941798941798941,
+      "loss": 0.9461,
+      "step": 400
+    },
+    {
+      "epoch": 0.1626984126984127,
+      "grad_norm": 1.9751100540161133,
+      "learning_rate": 0.0018915343915343918,
+      "loss": 1.0798,
+      "step": 410
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 6.7523040771484375,
+      "learning_rate": 0.001888888888888889,
+      "loss": 0.9272,
+      "step": 420
+    },
+    {
+      "epoch": 0.17063492063492064,
+      "grad_norm": 3.15874981880188,
+      "learning_rate": 0.0018862433862433864,
+      "loss": 0.9037,
+      "step": 430
+    },
+    {
+      "epoch": 0.1746031746031746,
+      "grad_norm": 1.0322136878967285,
+      "learning_rate": 0.0018835978835978835,
+      "loss": 0.9704,
+      "step": 440
+    },
+    {
+      "epoch": 0.17857142857142858,
+      "grad_norm": 1.655899167060852,
+      "learning_rate": 0.001880952380952381,
+      "loss": 0.9093,
+      "step": 450
+    },
+    {
+      "epoch": 0.18253968253968253,
+      "grad_norm": 0.8752370476722717,
+      "learning_rate": 0.0018783068783068783,
+      "loss": 0.9002,
+      "step": 460
+    },
+    {
+      "epoch": 0.1865079365079365,
+      "grad_norm": 1.024077296257019,
+      "learning_rate": 0.0018756613756613755,
+      "loss": 0.9406,
+      "step": 470
+    },
+    {
+      "epoch": 0.19047619047619047,
+      "grad_norm": 1.2974797487258911,
+      "learning_rate": 0.0018730158730158731,
+      "loss": 1.0027,
+      "step": 480
+    },
+    {
+      "epoch": 0.19444444444444445,
+      "grad_norm": 1.0525192022323608,
+      "learning_rate": 0.0018703703703703705,
+      "loss": 0.8173,
+      "step": 490
+    },
+    {
+      "epoch": 0.1984126984126984,
+      "grad_norm": 0.8886928558349609,
+      "learning_rate": 0.0018677248677248677,
+      "loss": 0.8162,
+      "step": 500
+    },
+    {
+      "epoch": 0.20238095238095238,
+      "grad_norm": 2.219409704208374,
+      "learning_rate": 0.0018650793650793651,
+      "loss": 0.9328,
+      "step": 510
+    },
+    {
+      "epoch": 0.20634920634920634,
+      "grad_norm": 1.2269400358200073,
+      "learning_rate": 0.0018624338624338623,
+      "loss": 0.7617,
+      "step": 520
+    },
+    {
+      "epoch": 0.21031746031746032,
+      "grad_norm": 1.3941247463226318,
+      "learning_rate": 0.0018597883597883597,
+      "loss": 0.733,
+      "step": 530
+    },
+    {
+      "epoch": 0.21428571428571427,
+      "grad_norm": 1.3957165479660034,
+      "learning_rate": 0.0018571428571428573,
+      "loss": 0.8621,
+      "step": 540
+    },
+    {
+      "epoch": 0.21825396825396826,
+      "grad_norm": 1.3213554620742798,
+      "learning_rate": 0.0018544973544973545,
+      "loss": 0.9746,
+      "step": 550
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 1.3588542938232422,
+      "learning_rate": 0.001851851851851852,
+      "loss": 0.7973,
+      "step": 560
+    },
+    {
+      "epoch": 0.2261904761904762,
+      "grad_norm": 1.7744730710983276,
+      "learning_rate": 0.0018492063492063493,
+      "loss": 0.887,
+      "step": 570
+    },
+    {
+      "epoch": 0.23015873015873015,
+      "grad_norm": 0.7673001289367676,
+      "learning_rate": 0.0018465608465608465,
+      "loss": 0.7976,
+      "step": 580
+    },
+    {
+      "epoch": 0.23412698412698413,
+      "grad_norm": 1.4514744281768799,
+      "learning_rate": 0.001843915343915344,
+      "loss": 0.7594,
+      "step": 590
+    },
+    {
+      "epoch": 0.23809523809523808,
+      "grad_norm": 1.408557653427124,
+      "learning_rate": 0.0018412698412698413,
+      "loss": 0.8519,
+      "step": 600
+    },
+    {
+      "epoch": 0.24206349206349206,
+      "grad_norm": 1.758348822593689,
+      "learning_rate": 0.0018386243386243387,
+      "loss": 1.0071,
+      "step": 610
+    },
+    {
+      "epoch": 0.24603174603174602,
+      "grad_norm": 1.6447445154190063,
+      "learning_rate": 0.0018359788359788361,
+      "loss": 0.7517,
+      "step": 620
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.568068027496338,
+      "learning_rate": 0.0018333333333333333,
+      "loss": 0.9271,
+      "step": 630
+    },
+    {
+      "epoch": 0.25396825396825395,
+      "grad_norm": 1.2021923065185547,
+      "learning_rate": 0.0018306878306878307,
+      "loss": 1.1121,
+      "step": 640
+    },
+    {
+      "epoch": 0.25793650793650796,
+      "grad_norm": 2.1598119735717773,
+      "learning_rate": 0.001828042328042328,
+      "loss": 0.8373,
+      "step": 650
+    },
+    {
+      "epoch": 0.2619047619047619,
+      "grad_norm": 1.0078835487365723,
+      "learning_rate": 0.0018253968253968253,
+      "loss": 0.8333,
+      "step": 660
+    },
+    {
+      "epoch": 0.26587301587301587,
+      "grad_norm": 0.9753168225288391,
+      "learning_rate": 0.001822751322751323,
+      "loss": 1.0549,
+      "step": 670
+    },
+    {
+      "epoch": 0.2698412698412698,
+      "grad_norm": 1.491974949836731,
+      "learning_rate": 0.00182010582010582,
+      "loss": 1.0431,
+      "step": 680
+    },
+    {
+      "epoch": 0.27380952380952384,
+      "grad_norm": 1.1669495105743408,
+      "learning_rate": 0.0018174603174603175,
+      "loss": 0.9501,
+      "step": 690
+    },
+    {
+      "epoch": 0.2777777777777778,
+      "grad_norm": 0.8744311332702637,
+      "learning_rate": 0.001814814814814815,
+      "loss": 0.7574,
+      "step": 700
+    },
+    {
+      "epoch": 0.28174603174603174,
+      "grad_norm": 0.619263768196106,
+      "learning_rate": 0.001812169312169312,
+      "loss": 0.8899,
+      "step": 710
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 1.276594638824463,
+      "learning_rate": 0.0018095238095238095,
+      "loss": 0.8611,
+      "step": 720
+    },
+    {
+      "epoch": 0.2896825396825397,
+      "grad_norm": 1.1073200702667236,
+      "learning_rate": 0.001806878306878307,
+      "loss": 0.7205,
+      "step": 730
+    },
+    {
+      "epoch": 0.29365079365079366,
+      "grad_norm": 1.8631259202957153,
+      "learning_rate": 0.0018042328042328043,
+      "loss": 0.9344,
+      "step": 740
+    },
+    {
+      "epoch": 0.2976190476190476,
+      "grad_norm": 1.406410813331604,
+      "learning_rate": 0.0018015873015873017,
+      "loss": 0.8569,
+      "step": 750
+    },
+    {
+      "epoch": 0.30158730158730157,
+      "grad_norm": 1.26906156539917,
+      "learning_rate": 0.0017989417989417989,
+      "loss": 0.7488,
+      "step": 760
+    },
+    {
+      "epoch": 0.3055555555555556,
+      "grad_norm": 1.0014851093292236,
+      "learning_rate": 0.0017962962962962963,
+      "loss": 0.6643,
+      "step": 770
+    },
+    {
+      "epoch": 0.30952380952380953,
+      "grad_norm": 1.0010994672775269,
+      "learning_rate": 0.0017936507936507937,
+      "loss": 0.8164,
+      "step": 780
+    },
+    {
+      "epoch": 0.3134920634920635,
+      "grad_norm": 1.0928398370742798,
+      "learning_rate": 0.001791005291005291,
+      "loss": 0.8814,
+      "step": 790
+    },
+    {
+      "epoch": 0.31746031746031744,
+      "grad_norm": 1.6183459758758545,
+      "learning_rate": 0.0017883597883597885,
+      "loss": 1.0908,
+      "step": 800
+    },
+    {
+      "epoch": 0.32142857142857145,
+      "grad_norm": 0.7748919129371643,
+      "learning_rate": 0.0017857142857142859,
+      "loss": 0.7623,
+      "step": 810
+    },
+    {
+      "epoch": 0.3253968253968254,
+      "grad_norm": 1.22903311252594,
+      "learning_rate": 0.001783068783068783,
+      "loss": 0.8888,
+      "step": 820
+    },
+    {
+      "epoch": 0.32936507936507936,
+      "grad_norm": 1.9972559213638306,
+      "learning_rate": 0.0017804232804232805,
+      "loss": 0.8963,
+      "step": 830
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.2421702146530151,
+      "learning_rate": 0.0017777777777777776,
+      "loss": 0.7896,
+      "step": 840
+    },
+    {
+      "epoch": 0.3373015873015873,
+      "grad_norm": 0.676760196685791,
+      "learning_rate": 0.001775132275132275,
+      "loss": 0.7917,
+      "step": 850
+    },
+    {
+      "epoch": 0.3412698412698413,
+      "grad_norm": 2.124894857406616,
+      "learning_rate": 0.0017724867724867727,
+      "loss": 0.8721,
+      "step": 860
+    },
+    {
+      "epoch": 0.34523809523809523,
+      "grad_norm": 1.416979432106018,
+      "learning_rate": 0.0017698412698412699,
+      "loss": 0.7461,
+      "step": 870
+    },
+    {
+      "epoch": 0.3492063492063492,
+      "grad_norm": 0.9547367691993713,
+      "learning_rate": 0.0017671957671957673,
+      "loss": 0.6878,
+      "step": 880
+    },
+    {
+      "epoch": 0.3531746031746032,
+      "grad_norm": 0.814999520778656,
+      "learning_rate": 0.0017645502645502647,
+      "loss": 0.7173,
+      "step": 890
+    },
+    {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 1.4335911273956299,
+      "learning_rate": 0.0017619047619047618,
+      "loss": 0.9165,
+      "step": 900
+    },
+    {
+      "epoch": 0.3611111111111111,
+      "grad_norm": 2.418215274810791,
+      "learning_rate": 0.0017592592592592592,
+      "loss": 0.8312,
+      "step": 910
+    },
+    {
+      "epoch": 0.36507936507936506,
+      "grad_norm": 0.6499120593070984,
+      "learning_rate": 0.0017566137566137566,
+      "loss": 0.6524,
+      "step": 920
+    },
+    {
+      "epoch": 0.36904761904761907,
+      "grad_norm": 1.4244420528411865,
+      "learning_rate": 0.001753968253968254,
+      "loss": 0.7283,
+      "step": 930
+    },
+    {
+      "epoch": 0.373015873015873,
+      "grad_norm": 0.9467722177505493,
+      "learning_rate": 0.0017513227513227514,
+      "loss": 0.7102,
+      "step": 940
+    },
+    {
+      "epoch": 0.376984126984127,
+      "grad_norm": 0.9126266241073608,
+      "learning_rate": 0.0017486772486772486,
+      "loss": 0.8736,
+      "step": 950
+    },
+    {
+      "epoch": 0.38095238095238093,
+      "grad_norm": 0.739183783531189,
+      "learning_rate": 0.001746031746031746,
+      "loss": 0.7108,
+      "step": 960
+    },
+    {
+      "epoch": 0.38492063492063494,
+      "grad_norm": 0.7012743949890137,
+      "learning_rate": 0.0017433862433862434,
+      "loss": 0.708,
+      "step": 970
+    },
+    {
+      "epoch": 0.3888888888888889,
+      "grad_norm": 1.4281548261642456,
+      "learning_rate": 0.0017407407407407408,
+      "loss": 0.7982,
+      "step": 980
+    },
+    {
+      "epoch": 0.39285714285714285,
+      "grad_norm": 1.850917935371399,
+      "learning_rate": 0.0017380952380952382,
+      "loss": 0.7927,
+      "step": 990
+    },
+    {
+      "epoch": 0.3968253968253968,
+      "grad_norm": 1.2646055221557617,
+      "learning_rate": 0.0017354497354497354,
+      "loss": 0.6407,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4007936507936508,
+      "grad_norm": 2.1877217292785645,
+      "learning_rate": 0.0017328042328042328,
+      "loss": 0.636,
+      "step": 1010
+    },
+    {
+      "epoch": 0.40476190476190477,
+      "grad_norm": 1.4416710138320923,
+      "learning_rate": 0.0017301587301587302,
+      "loss": 0.6643,
+      "step": 1020
+    },
+    {
+      "epoch": 0.4087301587301587,
+      "grad_norm": 0.9752436876296997,
+      "learning_rate": 0.0017275132275132274,
+      "loss": 0.7655,
+      "step": 1030
+    },
+    {
+      "epoch": 0.4126984126984127,
+      "grad_norm": 0.6438788175582886,
+      "learning_rate": 0.001724867724867725,
+      "loss": 0.8659,
+      "step": 1040
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.9634172320365906,
+      "learning_rate": 0.0017222222222222224,
+      "loss": 0.8015,
+      "step": 1050
+    },
+    {
+      "epoch": 0.42063492063492064,
+      "grad_norm": 1.4185785055160522,
+      "learning_rate": 0.0017195767195767196,
+      "loss": 0.991,
+      "step": 1060
+    },
+    {
+      "epoch": 0.4246031746031746,
+      "grad_norm": 1.0508280992507935,
+      "learning_rate": 0.001716931216931217,
+      "loss": 0.9047,
+      "step": 1070
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 1.1847171783447266,
+      "learning_rate": 0.0017142857142857142,
+      "loss": 0.7529,
+      "step": 1080
+    },
+    {
+      "epoch": 0.43253968253968256,
+      "grad_norm": 0.8445650935173035,
+      "learning_rate": 0.0017116402116402116,
+      "loss": 0.6472,
+      "step": 1090
+    },
+    {
+      "epoch": 0.4365079365079365,
+      "grad_norm": 0.6549813151359558,
+      "learning_rate": 0.001708994708994709,
+      "loss": 0.753,
+      "step": 1100
+    },
+    {
+      "epoch": 0.44047619047619047,
+      "grad_norm": 1.5086162090301514,
+      "learning_rate": 0.0017063492063492064,
+      "loss": 0.6774,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.5609638690948486,
+      "learning_rate": 0.0017037037037037038,
+      "loss": 0.6732,
+      "step": 1120
+    },
+    {
+      "epoch": 0.44841269841269843,
+      "grad_norm": 1.2099113464355469,
+      "learning_rate": 0.0017010582010582012,
+      "loss": 0.7132,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4523809523809524,
+      "grad_norm": 1.5899118185043335,
+      "learning_rate": 0.0016984126984126984,
+      "loss": 1.1286,
+      "step": 1140
+    },
+    {
+      "epoch": 0.45634920634920634,
+      "grad_norm": 1.4785903692245483,
+      "learning_rate": 0.0016957671957671958,
+      "loss": 0.8924,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4603174603174603,
+      "grad_norm": 1.7442249059677124,
+      "learning_rate": 0.001693121693121693,
+      "loss": 0.7868,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4642857142857143,
+      "grad_norm": 1.7168885469436646,
+      "learning_rate": 0.0016904761904761906,
+      "loss": 0.7758,
+      "step": 1170
+    },
+    {
+      "epoch": 0.46825396825396826,
+      "grad_norm": 0.735222339630127,
+      "learning_rate": 0.001687830687830688,
+      "loss": 0.9121,
+      "step": 1180
+    },
+    {
+      "epoch": 0.4722222222222222,
+      "grad_norm": 1.0101484060287476,
+      "learning_rate": 0.0016851851851851852,
+      "loss": 0.6875,
+      "step": 1190
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.8721634149551392,
+      "learning_rate": 0.0016825396825396826,
+      "loss": 0.7557,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4801587301587302,
+      "grad_norm": 1.2771320343017578,
+      "learning_rate": 0.00167989417989418,
+      "loss": 0.6477,
+      "step": 1210
+    },
+    {
+      "epoch": 0.48412698412698413,
+      "grad_norm": 0.573085606098175,
+      "learning_rate": 0.0016772486772486772,
+      "loss": 0.7522,
+      "step": 1220
+    },
+    {
+      "epoch": 0.4880952380952381,
+      "grad_norm": 0.6810621023178101,
+      "learning_rate": 0.0016746031746031748,
+      "loss": 0.9133,
+      "step": 1230
+    },
+    {
+      "epoch": 0.49206349206349204,
+      "grad_norm": 0.5593830347061157,
+      "learning_rate": 0.001671957671957672,
+      "loss": 0.6225,
+      "step": 1240
+    },
+    {
+      "epoch": 0.49603174603174605,
+      "grad_norm": 1.1917506456375122,
+      "learning_rate": 0.0016693121693121694,
+      "loss": 0.713,
+      "step": 1250
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.748424530029297,
+      "learning_rate": 0.0016666666666666668,
+      "loss": 0.6865,
+      "step": 1260
+    },
+    {
+      "epoch": 0.503968253968254,
+      "grad_norm": 1.4518764019012451,
+      "learning_rate": 0.001664021164021164,
+      "loss": 0.6681,
+      "step": 1270
+    },
+    {
+      "epoch": 0.5079365079365079,
+      "grad_norm": 0.7594536542892456,
+      "learning_rate": 0.0016613756613756614,
+      "loss": 0.542,
+      "step": 1280
+    },
+    {
+      "epoch": 0.5119047619047619,
+      "grad_norm": 0.6531535387039185,
+      "learning_rate": 0.0016587301587301588,
+      "loss": 0.6516,
+      "step": 1290
+    },
+    {
+      "epoch": 0.5158730158730159,
+      "grad_norm": 1.2486604452133179,
+      "learning_rate": 0.0016560846560846562,
+      "loss": 0.5894,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5198412698412699,
+      "grad_norm": 1.1929885149002075,
+      "learning_rate": 0.0016534391534391536,
+      "loss": 0.8147,
+      "step": 1310
+    },
+    {
+      "epoch": 0.5238095238095238,
+      "grad_norm": 1.1954102516174316,
+      "learning_rate": 0.0016507936507936507,
+      "loss": 0.7889,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5277777777777778,
+      "grad_norm": 1.271843671798706,
+      "learning_rate": 0.0016481481481481482,
+      "loss": 0.5804,
+      "step": 1330
+    },
+    {
+      "epoch": 0.5317460317460317,
+      "grad_norm": 1.0248411893844604,
+      "learning_rate": 0.0016455026455026456,
+      "loss": 0.6674,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5357142857142857,
+      "grad_norm": 0.9981194734573364,
+      "learning_rate": 0.0016428571428571427,
+      "loss": 0.7461,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5396825396825397,
+      "grad_norm": 1.431178331375122,
+      "learning_rate": 0.0016402116402116404,
+      "loss": 0.8174,
+      "step": 1360
+    },
+    {
+      "epoch": 0.5436507936507936,
+      "grad_norm": 1.7068381309509277,
+      "learning_rate": 0.0016375661375661378,
+      "loss": 0.7248,
+      "step": 1370
+    },
+    {
+      "epoch": 0.5476190476190477,
+      "grad_norm": 1.1310241222381592,
+      "learning_rate": 0.001634920634920635,
+      "loss": 0.5469,
+      "step": 1380
+    },
+    {
+      "epoch": 0.5515873015873016,
+      "grad_norm": 0.8217313289642334,
+      "learning_rate": 0.0016322751322751323,
+      "loss": 0.689,
+      "step": 1390
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 1.0212846994400024,
+      "learning_rate": 0.0016296296296296295,
+      "loss": 0.8884,
+      "step": 1400
+    },
+    {
+      "epoch": 0.5595238095238095,
+      "grad_norm": 0.6781401634216309,
+      "learning_rate": 0.001626984126984127,
+      "loss": 0.6765,
+      "step": 1410
+    },
+    {
+      "epoch": 0.5634920634920635,
+      "grad_norm": 1.3569077253341675,
+      "learning_rate": 0.0016243386243386245,
+      "loss": 0.7301,
+      "step": 1420
+    },
+    {
+      "epoch": 0.5674603174603174,
+      "grad_norm": 1.1712183952331543,
+      "learning_rate": 0.0016216931216931217,
+      "loss": 0.7642,
+      "step": 1430
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.8609166145324707,
+      "learning_rate": 0.0016190476190476191,
+      "loss": 0.6985,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5753968253968254,
+      "grad_norm": 1.7427066564559937,
+      "learning_rate": 0.0016164021164021165,
+      "loss": 0.6473,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5793650793650794,
+      "grad_norm": 0.6781764030456543,
+      "learning_rate": 0.0016137566137566137,
+      "loss": 0.6664,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.3013015985488892,
+      "learning_rate": 0.0016111111111111111,
+      "loss": 0.68,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5873015873015873,
+      "grad_norm": 0.5826123356819153,
+      "learning_rate": 0.0016084656084656083,
+      "loss": 0.6387,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5912698412698413,
+      "grad_norm": 0.5697736144065857,
+      "learning_rate": 0.001605820105820106,
+      "loss": 0.6077,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5952380952380952,
+      "grad_norm": 1.1636980772018433,
+      "learning_rate": 0.0016031746031746033,
+      "loss": 0.7455,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5992063492063492,
+      "grad_norm": 1.3436973094940186,
+      "learning_rate": 0.0016005291005291005,
+      "loss": 0.7223,
+      "step": 1510
+    },
+    {
+      "epoch": 0.6031746031746031,
+      "grad_norm": 0.5332604050636292,
+      "learning_rate": 0.001597883597883598,
+      "loss": 0.7019,
+      "step": 1520
+    },
+    {
+      "epoch": 0.6071428571428571,
+      "grad_norm": 1.6034159660339355,
+      "learning_rate": 0.0015952380952380953,
+      "loss": 0.7367,
+      "step": 1530
+    },
+    {
+      "epoch": 0.6111111111111112,
+      "grad_norm": 1.0178996324539185,
+      "learning_rate": 0.0015925925925925925,
+      "loss": 0.8806,
+      "step": 1540
+    },
+    {
+      "epoch": 0.6150793650793651,
+      "grad_norm": 2.342480182647705,
+      "learning_rate": 0.0015899470899470901,
+      "loss": 0.5731,
+      "step": 1550
+    },
+    {
+      "epoch": 0.6190476190476191,
+      "grad_norm": 3.211264133453369,
+      "learning_rate": 0.0015873015873015873,
+      "loss": 0.754,
+      "step": 1560
+    },
+    {
+      "epoch": 0.623015873015873,
+      "grad_norm": 1.243814468383789,
+      "learning_rate": 0.0015846560846560847,
+      "loss": 0.8197,
+      "step": 1570
+    },
+    {
+      "epoch": 0.626984126984127,
+      "grad_norm": 0.645529568195343,
+      "learning_rate": 0.001582010582010582,
+      "loss": 0.8089,
+      "step": 1580
+    },
+    {
+      "epoch": 0.6309523809523809,
+      "grad_norm": 0.8136078715324402,
+      "learning_rate": 0.0015793650793650793,
+      "loss": 0.7806,
+      "step": 1590
+    },
+    {
+      "epoch": 0.6349206349206349,
+      "grad_norm": 0.820977509021759,
+      "learning_rate": 0.0015767195767195767,
+      "loss": 0.5663,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6388888888888888,
+      "grad_norm": 0.8995200991630554,
+      "learning_rate": 0.0015740740740740743,
+      "loss": 0.5782,
+      "step": 1610
+    },
+    {
+      "epoch": 0.6428571428571429,
+      "grad_norm": 0.7738690376281738,
+      "learning_rate": 0.0015714285714285715,
+      "loss": 0.7698,
+      "step": 1620
+    },
+    {
+      "epoch": 0.6468253968253969,
+      "grad_norm": 0.6192114949226379,
+      "learning_rate": 0.001568783068783069,
+      "loss": 0.8364,
+      "step": 1630
+    },
+    {
+      "epoch": 0.6507936507936508,
+      "grad_norm": 1.5578278303146362,
+      "learning_rate": 0.001566137566137566,
+      "loss": 0.8536,
+      "step": 1640
+    },
+    {
+      "epoch": 0.6547619047619048,
+      "grad_norm": 1.2771915197372437,
+      "learning_rate": 0.0015634920634920635,
+      "loss": 0.5684,
+      "step": 1650
+    },
+    {
+      "epoch": 0.6587301587301587,
+      "grad_norm": 0.8459761738777161,
+      "learning_rate": 0.0015608465608465609,
+      "loss": 0.6247,
+      "step": 1660
+    },
+    {
+      "epoch": 0.6626984126984127,
+      "grad_norm": 1.2737908363342285,
+      "learning_rate": 0.0015582010582010583,
+      "loss": 0.6484,
+      "step": 1670
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 1.2066800594329834,
+      "learning_rate": 0.0015555555555555557,
+      "loss": 0.669,
+      "step": 1680
+    },
+    {
+      "epoch": 0.6706349206349206,
+      "grad_norm": 1.2899553775787354,
+      "learning_rate": 0.001552910052910053,
+      "loss": 0.6382,
+      "step": 1690
+    },
+    {
+      "epoch": 0.6746031746031746,
+      "grad_norm": 1.2886145114898682,
+      "learning_rate": 0.0015502645502645503,
+      "loss": 0.6506,
+      "step": 1700
+    },
+    {
+      "epoch": 0.6785714285714286,
+      "grad_norm": 1.1416516304016113,
+      "learning_rate": 0.0015476190476190477,
+      "loss": 0.6058,
+      "step": 1710
+    },
+    {
+      "epoch": 0.6825396825396826,
+      "grad_norm": 1.1607820987701416,
+      "learning_rate": 0.0015449735449735449,
+      "loss": 0.7332,
+      "step": 1720
+    },
+    {
+      "epoch": 0.6865079365079365,
+      "grad_norm": 1.350909948348999,
+      "learning_rate": 0.0015423280423280423,
+      "loss": 0.9355,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6904761904761905,
+      "grad_norm": 0.5112187266349792,
+      "learning_rate": 0.0015396825396825399,
+      "loss": 0.7205,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 0.9334474205970764,
+      "learning_rate": 0.001537037037037037,
+      "loss": 0.6192,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6984126984126984,
+      "grad_norm": 0.635986864566803,
+      "learning_rate": 0.0015343915343915345,
+      "loss": 0.6414,
+      "step": 1760
+    },
+    {
+      "epoch": 0.7023809523809523,
+      "grad_norm": 1.4325546026229858,
+      "learning_rate": 0.0015317460317460319,
+      "loss": 0.7799,
+      "step": 1770
+    },
+    {
+      "epoch": 0.7063492063492064,
+      "grad_norm": 0.814120352268219,
+      "learning_rate": 0.001529100529100529,
+      "loss": 0.6072,
+      "step": 1780
+    },
+    {
+      "epoch": 0.7103174603174603,
+      "grad_norm": 0.9185436367988586,
+      "learning_rate": 0.0015264550264550265,
+      "loss": 0.728,
+      "step": 1790
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.8025707602500916,
+      "learning_rate": 0.0015238095238095239,
+      "loss": 0.7527,
+      "step": 1800
+    },
+    {
+      "epoch": 0.7182539682539683,
+      "grad_norm": 0.816798985004425,
+      "learning_rate": 0.0015211640211640213,
+      "loss": 0.5058,
+      "step": 1810
+    },
+    {
+      "epoch": 0.7222222222222222,
+      "grad_norm": 0.8499689698219299,
+      "learning_rate": 0.0015185185185185187,
+      "loss": 0.5843,
+      "step": 1820
+    },
+    {
+      "epoch": 0.7261904761904762,
+      "grad_norm": 1.3355066776275635,
+      "learning_rate": 0.0015158730158730158,
+      "loss": 0.7095,
+      "step": 1830
+    },
+    {
+      "epoch": 0.7301587301587301,
+      "grad_norm": 1.4383025169372559,
+      "learning_rate": 0.0015132275132275132,
+      "loss": 0.7277,
+      "step": 1840
+    },
+    {
+      "epoch": 0.7341269841269841,
+      "grad_norm": 1.1233898401260376,
+      "learning_rate": 0.0015105820105820106,
+      "loss": 0.5746,
+      "step": 1850
+    },
+    {
+      "epoch": 0.7380952380952381,
+      "grad_norm": 0.6341880559921265,
+      "learning_rate": 0.001507936507936508,
+      "loss": 0.7063,
+      "step": 1860
+    },
+    {
+      "epoch": 0.7420634920634921,
+      "grad_norm": 0.8784427642822266,
+      "learning_rate": 0.0015052910052910054,
+      "loss": 0.5603,
+      "step": 1870
+    },
+    {
+      "epoch": 0.746031746031746,
+      "grad_norm": 1.2914808988571167,
+      "learning_rate": 0.0015026455026455026,
+      "loss": 0.6714,
+      "step": 1880
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.7286548018455505,
+      "learning_rate": 0.0015,
+      "loss": 0.6926,
+      "step": 1890
+    },
+    {
+      "epoch": 0.753968253968254,
+      "grad_norm": 0.6523261070251465,
+      "learning_rate": 0.0014973544973544974,
+      "loss": 0.6169,
+      "step": 1900
+    },
+    {
+      "epoch": 0.7579365079365079,
+      "grad_norm": 0.971722424030304,
+      "learning_rate": 0.0014947089947089946,
+      "loss": 0.8645,
+      "step": 1910
+    },
+    {
+      "epoch": 0.7619047619047619,
+      "grad_norm": 0.7515843510627747,
+      "learning_rate": 0.001492063492063492,
+      "loss": 0.5931,
+      "step": 1920
+    },
+    {
+      "epoch": 0.7658730158730159,
+      "grad_norm": 0.8675608038902283,
+      "learning_rate": 0.0014894179894179894,
+      "loss": 0.5703,
+      "step": 1930
+    },
+    {
+      "epoch": 0.7698412698412699,
+      "grad_norm": 1.131606101989746,
+      "learning_rate": 0.0014867724867724868,
+      "loss": 0.6542,
+      "step": 1940
+    },
+    {
+      "epoch": 0.7738095238095238,
+      "grad_norm": 1.4298430681228638,
+      "learning_rate": 0.0014841269841269842,
+      "loss": 1.127,
+      "step": 1950
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 0.9001014828681946,
+      "learning_rate": 0.0014814814814814814,
+      "loss": 0.6976,
+      "step": 1960
+    },
+    {
+      "epoch": 0.7817460317460317,
+      "grad_norm": 0.9711846113204956,
+      "learning_rate": 0.0014788359788359788,
+      "loss": 0.6721,
+      "step": 1970
+    },
+    {
+      "epoch": 0.7857142857142857,
+      "grad_norm": 0.6609967350959778,
+      "learning_rate": 0.0014761904761904762,
+      "loss": 0.6133,
+      "step": 1980
+    },
+    {
+      "epoch": 0.7896825396825397,
+      "grad_norm": 1.0555015802383423,
+      "learning_rate": 0.0014735449735449736,
+      "loss": 0.7108,
+      "step": 1990
+    },
+    {
+      "epoch": 0.7936507936507936,
+      "grad_norm": 1.7377722263336182,
+      "learning_rate": 0.001470899470899471,
+      "loss": 0.5734,
+      "step": 2000
+    },
+    {
+      "epoch": 0.7976190476190477,
+      "grad_norm": 0.48703524470329285,
+      "learning_rate": 0.0014682539682539682,
+      "loss": 0.509,
+      "step": 2010
+    },
+    {
+      "epoch": 0.8015873015873016,
+      "grad_norm": 0.7599615454673767,
+      "learning_rate": 0.0014656084656084656,
+      "loss": 0.6224,
+      "step": 2020
+    },
+    {
+      "epoch": 0.8055555555555556,
+      "grad_norm": 1.351830005645752,
+      "learning_rate": 0.001462962962962963,
+      "loss": 0.6759,
+      "step": 2030
+    },
+    {
+      "epoch": 0.8095238095238095,
+      "grad_norm": 0.7260966897010803,
+      "learning_rate": 0.0014603174603174602,
+      "loss": 0.7076,
+      "step": 2040
+    },
+    {
+      "epoch": 0.8134920634920635,
+      "grad_norm": 1.2171436548233032,
+      "learning_rate": 0.0014576719576719578,
+      "loss": 0.6794,
+      "step": 2050
+    },
+    {
+      "epoch": 0.8174603174603174,
+      "grad_norm": 0.6401930451393127,
+      "learning_rate": 0.0014550264550264552,
+      "loss": 0.5448,
+      "step": 2060
+    },
+    {
+      "epoch": 0.8214285714285714,
+      "grad_norm": 1.0115227699279785,
+      "learning_rate": 0.0014523809523809524,
+      "loss": 0.7069,
+      "step": 2070
+    },
+    {
+      "epoch": 0.8253968253968254,
+      "grad_norm": 1.0564064979553223,
+      "learning_rate": 0.0014497354497354498,
+      "loss": 0.5207,
+      "step": 2080
+    },
+    {
+      "epoch": 0.8293650793650794,
+      "grad_norm": 1.908964991569519,
+      "learning_rate": 0.001447089947089947,
+      "loss": 0.7147,
+      "step": 2090
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 1.2274842262268066,
+      "learning_rate": 0.0014444444444444444,
+      "loss": 0.7366,
+      "step": 2100
+    },
+    {
+      "epoch": 0.8373015873015873,
+      "grad_norm": 0.8221492767333984,
+      "learning_rate": 0.0014417989417989418,
+      "loss": 0.5421,
+      "step": 2110
+    },
+    {
+      "epoch": 0.8412698412698413,
+      "grad_norm": 1.3362950086593628,
+      "learning_rate": 0.0014391534391534392,
+      "loss": 0.7575,
+      "step": 2120
+    },
+    {
+      "epoch": 0.8452380952380952,
+      "grad_norm": 0.8134861588478088,
+      "learning_rate": 0.0014365079365079366,
+      "loss": 0.6713,
+      "step": 2130
+    },
+    {
+      "epoch": 0.8492063492063492,
+      "grad_norm": 0.650597095489502,
+      "learning_rate": 0.001433862433862434,
+      "loss": 0.8714,
+      "step": 2140
+    },
+    {
+      "epoch": 0.8531746031746031,
+      "grad_norm": 1.5303138494491577,
+      "learning_rate": 0.0014312169312169312,
+      "loss": 0.6425,
+      "step": 2150
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 1.0913094282150269,
+      "learning_rate": 0.0014285714285714286,
+      "loss": 0.8642,
+      "step": 2160
+    },
+    {
+      "epoch": 0.8611111111111112,
+      "grad_norm": 0.6576964259147644,
+      "learning_rate": 0.0014259259259259258,
+      "loss": 0.5981,
+      "step": 2170
+    },
+    {
+      "epoch": 0.8650793650793651,
+      "grad_norm": 1.4192836284637451,
+      "learning_rate": 0.0014232804232804234,
+      "loss": 0.8404,
+      "step": 2180
+    },
+    {
+      "epoch": 0.8690476190476191,
+      "grad_norm": 1.345991611480713,
+      "learning_rate": 0.0014206349206349208,
+      "loss": 0.6921,
+      "step": 2190
+    },
+    {
+      "epoch": 0.873015873015873,
+      "grad_norm": 1.310991644859314,
+      "learning_rate": 0.001417989417989418,
+      "loss": 0.7689,
+      "step": 2200
+    },
+    {
+      "epoch": 0.876984126984127,
+      "grad_norm": 1.0328586101531982,
+      "learning_rate": 0.0014153439153439154,
+      "loss": 0.5498,
+      "step": 2210
+    },
+    {
+      "epoch": 0.8809523809523809,
+      "grad_norm": 1.0331602096557617,
+      "learning_rate": 0.0014126984126984128,
+      "loss": 0.7736,
+      "step": 2220
+    },
+    {
+      "epoch": 0.8849206349206349,
+      "grad_norm": 0.9896045327186584,
+      "learning_rate": 0.00141005291005291,
+      "loss": 0.6907,
+      "step": 2230
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.7972356677055359,
+      "learning_rate": 0.0014074074074074076,
+      "loss": 0.6476,
+      "step": 2240
+    },
+    {
+      "epoch": 0.8928571428571429,
+      "grad_norm": 1.479012131690979,
+      "learning_rate": 0.0014047619047619047,
+      "loss": 0.6944,
+      "step": 2250
+    },
+    {
+      "epoch": 0.8968253968253969,
+      "grad_norm": 0.8372600674629211,
+      "learning_rate": 0.0014021164021164022,
+      "loss": 0.7234,
+      "step": 2260
+    },
+    {
+      "epoch": 0.9007936507936508,
+      "grad_norm": 0.9432483911514282,
+      "learning_rate": 0.0013994708994708996,
+      "loss": 0.5945,
+      "step": 2270
+    },
+    {
+      "epoch": 0.9047619047619048,
+      "grad_norm": 1.3562203645706177,
+      "learning_rate": 0.0013968253968253967,
+      "loss": 0.6736,
+      "step": 2280
+    },
+    {
+      "epoch": 0.9087301587301587,
+      "grad_norm": 1.511753797531128,
+      "learning_rate": 0.0013941798941798941,
+      "loss": 0.7797,
+      "step": 2290
+    },
+    {
+      "epoch": 0.9126984126984127,
+      "grad_norm": 1.4588041305541992,
+      "learning_rate": 0.0013915343915343918,
+      "loss": 0.6369,
+      "step": 2300
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 1.3627748489379883,
+      "learning_rate": 0.001388888888888889,
+      "loss": 0.5948,
+      "step": 2310
+    },
+    {
+      "epoch": 0.9206349206349206,
+      "grad_norm": 0.9026773571968079,
+      "learning_rate": 0.0013862433862433863,
+      "loss": 0.8748,
+      "step": 2320
+    },
+    {
+      "epoch": 0.9246031746031746,
+      "grad_norm": 2.1526966094970703,
+      "learning_rate": 0.0013835978835978835,
+      "loss": 0.7363,
+      "step": 2330
+    },
+    {
+      "epoch": 0.9285714285714286,
+      "grad_norm": 0.6556802988052368,
+      "learning_rate": 0.001380952380952381,
+      "loss": 0.6449,
+      "step": 2340
+    },
+    {
+      "epoch": 0.9325396825396826,
+      "grad_norm": 1.622631549835205,
+      "learning_rate": 0.0013783068783068783,
+      "loss": 0.6811,
+      "step": 2350
+    },
+    {
+      "epoch": 0.9365079365079365,
+      "grad_norm": 1.133255124092102,
+      "learning_rate": 0.0013756613756613755,
+      "loss": 0.7778,
+      "step": 2360
+    },
+    {
+      "epoch": 0.9404761904761905,
+      "grad_norm": 1.2756290435791016,
+      "learning_rate": 0.0013730158730158731,
+      "loss": 0.6244,
+      "step": 2370
+    },
+    {
+      "epoch": 0.9444444444444444,
+      "grad_norm": 0.6911134719848633,
+      "learning_rate": 0.0013703703703703705,
+      "loss": 0.6354,
+      "step": 2380
+    },
+    {
+      "epoch": 0.9484126984126984,
+      "grad_norm": 1.2925828695297241,
+      "learning_rate": 0.0013677248677248677,
+      "loss": 0.8165,
+      "step": 2390
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.8971231579780579,
+      "learning_rate": 0.0013650793650793651,
+      "loss": 0.6201,
+      "step": 2400
+    },
+    {
+      "epoch": 0.9563492063492064,
+      "grad_norm": 0.7667912244796753,
+      "learning_rate": 0.0013624338624338623,
+      "loss": 0.5925,
+      "step": 2410
+    },
+    {
+      "epoch": 0.9603174603174603,
+      "grad_norm": 2.7550241947174072,
+      "learning_rate": 0.0013597883597883597,
+      "loss": 0.6813,
+      "step": 2420
+    },
+    {
+      "epoch": 0.9642857142857143,
+      "grad_norm": 0.8356139659881592,
+      "learning_rate": 0.0013571428571428573,
+      "loss": 0.526,
+      "step": 2430
+    },
+    {
+      "epoch": 0.9682539682539683,
+      "grad_norm": 1.1391632556915283,
+      "learning_rate": 0.0013544973544973545,
+      "loss": 0.63,
+      "step": 2440
+    },
+    {
+      "epoch": 0.9722222222222222,
+      "grad_norm": 0.9076061248779297,
+      "learning_rate": 0.001351851851851852,
+      "loss": 0.6514,
+      "step": 2450
+    },
+    {
+      "epoch": 0.9761904761904762,
+      "grad_norm": 0.8281214237213135,
+      "learning_rate": 0.0013492063492063493,
+      "loss": 0.6106,
+      "step": 2460
+    },
+    {
+      "epoch": 0.9801587301587301,
+      "grad_norm": 0.7302814722061157,
+      "learning_rate": 0.0013465608465608465,
+      "loss": 0.5481,
+      "step": 2470
+    },
+    {
+      "epoch": 0.9841269841269841,
+      "grad_norm": 0.7019608020782471,
+      "learning_rate": 0.001343915343915344,
+      "loss": 0.6074,
+      "step": 2480
+    },
+    {
+      "epoch": 0.9880952380952381,
+      "grad_norm": 0.6560447812080383,
+      "learning_rate": 0.0013412698412698413,
+      "loss": 0.7778,
+      "step": 2490
+    },
+    {
+      "epoch": 0.9920634920634921,
+      "grad_norm": 0.9499639868736267,
+      "learning_rate": 0.0013386243386243387,
+      "loss": 0.6651,
+      "step": 2500
+    },
+    {
+      "epoch": 0.996031746031746,
+      "grad_norm": 0.9400144815444946,
+      "learning_rate": 0.001335978835978836,
+      "loss": 0.5866,
+      "step": 2510
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.7043092250823975,
+      "learning_rate": 0.0013333333333333333,
+      "loss": 0.7614,
+      "step": 2520
+    },
+    {
+      "epoch": 1.003968253968254,
+      "grad_norm": 1.06705641746521,
+      "learning_rate": 0.0013306878306878307,
+      "loss": 0.4675,
+      "step": 2530
+    },
+    {
+      "epoch": 1.007936507936508,
+      "grad_norm": 1.0158171653747559,
+      "learning_rate": 0.001328042328042328,
+      "loss": 0.6015,
+      "step": 2540
+    },
+    {
+      "epoch": 1.0119047619047619,
+      "grad_norm": 0.9164927005767822,
+      "learning_rate": 0.0013253968253968253,
+      "loss": 0.5214,
+      "step": 2550
+    },
+    {
+      "epoch": 1.0158730158730158,
+      "grad_norm": 0.9178239703178406,
+      "learning_rate": 0.001322751322751323,
+      "loss": 0.5135,
+      "step": 2560
+    },
+    {
+      "epoch": 1.0198412698412698,
+      "grad_norm": 1.3159326314926147,
+      "learning_rate": 0.00132010582010582,
+      "loss": 0.6184,
+      "step": 2570
+    },
+    {
+      "epoch": 1.0238095238095237,
+      "grad_norm": 1.290663719177246,
+      "learning_rate": 0.0013174603174603175,
+      "loss": 0.5791,
+      "step": 2580
+    },
+    {
+      "epoch": 1.0277777777777777,
+      "grad_norm": 0.8518033027648926,
+      "learning_rate": 0.0013148148148148149,
+      "loss": 0.3943,
+      "step": 2590
+    },
+    {
+      "epoch": 1.0317460317460316,
+      "grad_norm": 0.523811399936676,
+      "learning_rate": 0.001312169312169312,
+      "loss": 0.4826,
+      "step": 2600
+    },
+    {
+      "epoch": 1.0357142857142858,
+      "grad_norm": 2.362725257873535,
+      "learning_rate": 0.0013095238095238095,
+      "loss": 0.6,
+      "step": 2610
+    },
+    {
+      "epoch": 1.0396825396825398,
+      "grad_norm": 0.7334272861480713,
+      "learning_rate": 0.001306878306878307,
+      "loss": 0.5216,
+      "step": 2620
+    },
+    {
+      "epoch": 1.0436507936507937,
+      "grad_norm": 1.4758929014205933,
+      "learning_rate": 0.0013042328042328043,
+      "loss": 0.5011,
+      "step": 2630
+    },
+    {
+      "epoch": 1.0476190476190477,
+      "grad_norm": 1.296991229057312,
+      "learning_rate": 0.0013015873015873017,
+      "loss": 0.5896,
+      "step": 2640
+    },
+    {
+      "epoch": 1.0515873015873016,
+      "grad_norm": 0.6447119116783142,
+      "learning_rate": 0.0012989417989417989,
+      "loss": 0.4612,
+      "step": 2650
+    },
+    {
+      "epoch": 1.0555555555555556,
+      "grad_norm": 1.2804654836654663,
+      "learning_rate": 0.0012962962962962963,
+      "loss": 0.5531,
+      "step": 2660
+    },
+    {
+      "epoch": 1.0595238095238095,
+      "grad_norm": 0.6714935898780823,
+      "learning_rate": 0.0012936507936507937,
+      "loss": 0.4928,
+      "step": 2670
+    },
+    {
+      "epoch": 1.0634920634920635,
+      "grad_norm": 2.083782434463501,
+      "learning_rate": 0.001291005291005291,
+      "loss": 0.3696,
+      "step": 2680
+    },
+    {
+      "epoch": 1.0674603174603174,
+      "grad_norm": 1.4924397468566895,
+      "learning_rate": 0.0012883597883597885,
+      "loss": 0.4776,
+      "step": 2690
+    },
+    {
+      "epoch": 1.0714285714285714,
+      "grad_norm": 0.8140655159950256,
+      "learning_rate": 0.0012857142857142859,
+      "loss": 0.4731,
+      "step": 2700
+    },
+    {
+      "epoch": 1.0753968253968254,
+      "grad_norm": 0.47565603256225586,
+      "learning_rate": 0.001283068783068783,
+      "loss": 0.6289,
+      "step": 2710
+    },
+    {
+      "epoch": 1.0793650793650793,
+      "grad_norm": 1.3005656003952026,
+      "learning_rate": 0.0012804232804232805,
+      "loss": 0.5688,
+      "step": 2720
+    },
+    {
+      "epoch": 1.0833333333333333,
+      "grad_norm": 1.2472827434539795,
+      "learning_rate": 0.0012777777777777776,
+      "loss": 0.6273,
+      "step": 2730
+    },
+    {
+      "epoch": 1.0873015873015872,
+      "grad_norm": 1.0685155391693115,
+      "learning_rate": 0.001275132275132275,
+      "loss": 0.5222,
+      "step": 2740
+    },
+    {
+      "epoch": 1.0912698412698412,
+      "grad_norm": 1.2605559825897217,
+      "learning_rate": 0.0012724867724867727,
+      "loss": 0.4724,
+      "step": 2750
+    },
+    {
+      "epoch": 1.0952380952380953,
+      "grad_norm": 0.9913002848625183,
+      "learning_rate": 0.0012698412698412698,
+      "loss": 0.5158,
+      "step": 2760
+    },
+    {
+      "epoch": 1.0992063492063493,
+      "grad_norm": 0.5711252093315125,
+      "learning_rate": 0.0012671957671957672,
+      "loss": 0.4382,
+      "step": 2770
+    },
+    {
+      "epoch": 1.1031746031746033,
+      "grad_norm": 1.4559530019760132,
+      "learning_rate": 0.0012645502645502646,
+      "loss": 0.7059,
+      "step": 2780
+    },
+    {
+      "epoch": 1.1071428571428572,
+      "grad_norm": 0.9595462083816528,
+      "learning_rate": 0.0012619047619047618,
+      "loss": 0.5346,
+      "step": 2790
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 0.7950549721717834,
+      "learning_rate": 0.0012592592592592592,
+      "loss": 0.4881,
+      "step": 2800
+    },
+    {
+      "epoch": 1.1150793650793651,
+      "grad_norm": 1.297609567642212,
+      "learning_rate": 0.0012566137566137566,
+      "loss": 0.4305,
+      "step": 2810
+    },
+    {
+      "epoch": 1.119047619047619,
+      "grad_norm": 0.741604745388031,
+      "learning_rate": 0.001253968253968254,
+      "loss": 0.6249,
+      "step": 2820
+    },
+    {
+      "epoch": 1.123015873015873,
+      "grad_norm": 1.4942420721054077,
+      "learning_rate": 0.0012513227513227514,
+      "loss": 0.4769,
+      "step": 2830
+    },
+    {
+      "epoch": 1.126984126984127,
+      "grad_norm": 1.299843192100525,
+      "learning_rate": 0.0012486772486772486,
+      "loss": 0.5311,
+      "step": 2840
+    },
+    {
+      "epoch": 1.130952380952381,
+      "grad_norm": 0.5215968489646912,
+      "learning_rate": 0.001246031746031746,
+      "loss": 0.4032,
+      "step": 2850
+    },
+    {
+      "epoch": 1.1349206349206349,
+      "grad_norm": 0.9502798914909363,
+      "learning_rate": 0.0012433862433862434,
+      "loss": 0.4089,
+      "step": 2860
+    },
+    {
+      "epoch": 1.1388888888888888,
+      "grad_norm": 0.5403910279273987,
+      "learning_rate": 0.0012407407407407408,
+      "loss": 0.6482,
+      "step": 2870
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 1.0824073553085327,
+      "learning_rate": 0.0012380952380952382,
+      "loss": 0.6369,
+      "step": 2880
+    },
+    {
+      "epoch": 1.1468253968253967,
+      "grad_norm": 0.7724151015281677,
+      "learning_rate": 0.0012354497354497354,
+      "loss": 0.7722,
+      "step": 2890
+    },
+    {
+      "epoch": 1.1507936507936507,
+      "grad_norm": 1.6870607137680054,
+      "learning_rate": 0.0012328042328042328,
+      "loss": 0.5139,
+      "step": 2900
+    },
+    {
+      "epoch": 1.1547619047619047,
+      "grad_norm": 1.8609074354171753,
+      "learning_rate": 0.0012301587301587302,
+      "loss": 0.5745,
+      "step": 2910
+    },
+    {
+      "epoch": 1.1587301587301586,
+      "grad_norm": 0.664623498916626,
+      "learning_rate": 0.0012275132275132274,
+      "loss": 0.6334,
+      "step": 2920
+    },
+    {
+      "epoch": 1.1626984126984128,
+      "grad_norm": 0.836618959903717,
+      "learning_rate": 0.001224867724867725,
+      "loss": 0.5948,
+      "step": 2930
+    },
+    {
+      "epoch": 1.1666666666666667,
+      "grad_norm": 0.8063789010047913,
+      "learning_rate": 0.0012222222222222224,
+      "loss": 0.5868,
+      "step": 2940
+    },
+    {
+      "epoch": 1.1706349206349207,
+      "grad_norm": 1.02044677734375,
+      "learning_rate": 0.0012195767195767196,
+      "loss": 0.5168,
+      "step": 2950
+    },
+    {
+      "epoch": 1.1746031746031746,
+      "grad_norm": 0.7230445742607117,
+      "learning_rate": 0.001216931216931217,
+      "loss": 0.4973,
+      "step": 2960
+    },
+    {
+      "epoch": 1.1785714285714286,
+      "grad_norm": 1.4907546043395996,
+      "learning_rate": 0.0012142857142857142,
+      "loss": 0.5717,
+      "step": 2970
+    },
+    {
+      "epoch": 1.1825396825396826,
+      "grad_norm": 0.5981312394142151,
+      "learning_rate": 0.0012116402116402116,
+      "loss": 0.5287,
+      "step": 2980
+    },
+    {
+      "epoch": 1.1865079365079365,
+      "grad_norm": 1.6976572275161743,
+      "learning_rate": 0.001208994708994709,
+      "loss": 0.4958,
+      "step": 2990
+    },
+    {
+      "epoch": 1.1904761904761905,
+      "grad_norm": 1.2186094522476196,
+      "learning_rate": 0.0012063492063492064,
+      "loss": 0.5103,
+      "step": 3000
+    },
+    {
+      "epoch": 1.1944444444444444,
+      "grad_norm": 2.3313498497009277,
+      "learning_rate": 0.0012037037037037038,
+      "loss": 0.3989,
+      "step": 3010
+    },
+    {
+      "epoch": 1.1984126984126984,
+      "grad_norm": 0.8640299439430237,
+      "learning_rate": 0.0012010582010582012,
+      "loss": 0.5046,
+      "step": 3020
+    },
+    {
+      "epoch": 1.2023809523809523,
+      "grad_norm": 0.7302188277244568,
+      "learning_rate": 0.0011984126984126984,
+      "loss": 0.4876,
+      "step": 3030
+    },
+    {
+      "epoch": 1.2063492063492063,
+      "grad_norm": 0.6321560740470886,
+      "learning_rate": 0.0011957671957671958,
+      "loss": 0.5512,
+      "step": 3040
+    },
+    {
+      "epoch": 1.2103174603174602,
+      "grad_norm": 1.4281076192855835,
+      "learning_rate": 0.001193121693121693,
+      "loss": 0.6275,
+      "step": 3050
+    },
+    {
+      "epoch": 1.2142857142857142,
+      "grad_norm": 1.3028194904327393,
+      "learning_rate": 0.0011904761904761906,
+      "loss": 0.4403,
+      "step": 3060
+    },
+    {
+      "epoch": 1.2182539682539684,
+      "grad_norm": 1.7041105031967163,
+      "learning_rate": 0.001187830687830688,
+      "loss": 0.6193,
+      "step": 3070
+    },
+    {
+      "epoch": 1.2222222222222223,
+      "grad_norm": 0.6587647199630737,
+      "learning_rate": 0.0011851851851851852,
+      "loss": 0.4893,
+      "step": 3080
+    },
+    {
+      "epoch": 1.2261904761904763,
+      "grad_norm": 1.2939643859863281,
+      "learning_rate": 0.0011825396825396826,
+      "loss": 0.6198,
+      "step": 3090
+    },
+    {
+      "epoch": 1.2301587301587302,
+      "grad_norm": 0.5572563409805298,
+      "learning_rate": 0.00117989417989418,
+      "loss": 0.5438,
+      "step": 3100
+    },
+    {
+      "epoch": 1.2341269841269842,
+      "grad_norm": 0.7885312438011169,
+      "learning_rate": 0.0011772486772486772,
+      "loss": 0.4951,
+      "step": 3110
+    },
+    {
+      "epoch": 1.2380952380952381,
+      "grad_norm": 0.7055696249008179,
+      "learning_rate": 0.0011746031746031748,
+      "loss": 0.6549,
+      "step": 3120
+    },
+    {
+      "epoch": 1.242063492063492,
+      "grad_norm": 0.9367688894271851,
+      "learning_rate": 0.001171957671957672,
+      "loss": 0.3874,
+      "step": 3130
+    },
+    {
+      "epoch": 1.246031746031746,
+      "grad_norm": 1.2354093790054321,
+      "learning_rate": 0.0011693121693121694,
+      "loss": 0.5545,
+      "step": 3140
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.2741392850875854,
+      "learning_rate": 0.0011666666666666668,
+      "loss": 0.5277,
+      "step": 3150
+    },
+    {
+      "epoch": 1.253968253968254,
+      "grad_norm": 0.9361393451690674,
+      "learning_rate": 0.001164021164021164,
+      "loss": 0.5458,
+      "step": 3160
+    },
+    {
+      "epoch": 1.257936507936508,
+      "grad_norm": 1.4866970777511597,
+      "learning_rate": 0.0011613756613756613,
+      "loss": 0.5137,
+      "step": 3170
+    },
+    {
+      "epoch": 1.2619047619047619,
+      "grad_norm": 0.6895744800567627,
+      "learning_rate": 0.0011587301587301588,
+      "loss": 0.4681,
+      "step": 3180
+    },
+    {
+      "epoch": 1.2658730158730158,
+      "grad_norm": 1.1036232709884644,
+      "learning_rate": 0.0011560846560846562,
+      "loss": 0.5561,
+      "step": 3190
+    },
+    {
+      "epoch": 1.2698412698412698,
+      "grad_norm": 0.5537109375,
+      "learning_rate": 0.0011534391534391536,
+      "loss": 0.4245,
+      "step": 3200
+    },
+    {
+      "epoch": 1.2738095238095237,
+      "grad_norm": 1.1008318662643433,
+      "learning_rate": 0.0011507936507936507,
+      "loss": 0.5559,
+      "step": 3210
+    },
+    {
+      "epoch": 1.2777777777777777,
+      "grad_norm": 1.5348010063171387,
+      "learning_rate": 0.0011481481481481481,
+      "loss": 0.6614,
+      "step": 3220
+    },
+    {
+      "epoch": 1.2817460317460316,
+      "grad_norm": 0.7859022617340088,
+      "learning_rate": 0.0011455026455026455,
+      "loss": 0.5489,
+      "step": 3230
+    },
+    {
+      "epoch": 1.2857142857142856,
+      "grad_norm": 1.6240460872650146,
+      "learning_rate": 0.0011428571428571427,
+      "loss": 0.6612,
+      "step": 3240
+    },
+    {
+      "epoch": 1.2896825396825398,
+      "grad_norm": 1.0166038274765015,
+      "learning_rate": 0.0011402116402116403,
+      "loss": 0.4639,
+      "step": 3250
+    },
+    {
+      "epoch": 1.2936507936507937,
+      "grad_norm": 2.2691445350646973,
+      "learning_rate": 0.0011375661375661377,
+      "loss": 0.5383,
+      "step": 3260
+    },
+    {
+      "epoch": 1.2976190476190477,
+      "grad_norm": 0.6648916602134705,
+      "learning_rate": 0.001134920634920635,
+      "loss": 0.6158,
+      "step": 3270
+    },
+    {
+      "epoch": 1.3015873015873016,
+      "grad_norm": 0.6790510416030884,
+      "learning_rate": 0.0011322751322751323,
+      "loss": 0.5251,
+      "step": 3280
+    },
+    {
+      "epoch": 1.3055555555555556,
+      "grad_norm": 0.5222778916358948,
+      "learning_rate": 0.0011296296296296295,
+      "loss": 0.4507,
+      "step": 3290
+    },
+    {
+      "epoch": 1.3095238095238095,
+      "grad_norm": 1.31193208694458,
+      "learning_rate": 0.001126984126984127,
+      "loss": 0.6471,
+      "step": 3300
+    },
+    {
+      "epoch": 1.3134920634920635,
+      "grad_norm": 0.7240389585494995,
+      "learning_rate": 0.0011243386243386245,
+      "loss": 0.492,
+      "step": 3310
+    },
+    {
+      "epoch": 1.3174603174603174,
+      "grad_norm": 1.4572322368621826,
+      "learning_rate": 0.0011216931216931217,
+      "loss": 0.636,
+      "step": 3320
+    },
+    {
+      "epoch": 1.3214285714285714,
+      "grad_norm": 0.7390062212944031,
+      "learning_rate": 0.0011190476190476191,
+      "loss": 0.5134,
+      "step": 3330
+    },
+    {
+      "epoch": 1.3253968253968254,
+      "grad_norm": 0.9129742383956909,
+      "learning_rate": 0.0011164021164021165,
+      "loss": 0.5521,
+      "step": 3340
+    },
+    {
+      "epoch": 1.3293650793650793,
+      "grad_norm": 0.9507137537002563,
+      "learning_rate": 0.0011137566137566137,
+      "loss": 0.5191,
+      "step": 3350
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.7048954367637634,
+      "learning_rate": 0.0011111111111111111,
+      "loss": 0.4399,
+      "step": 3360
+    },
+    {
+      "epoch": 1.3373015873015874,
+      "grad_norm": 1.2110259532928467,
+      "learning_rate": 0.0011084656084656083,
+      "loss": 0.5302,
+      "step": 3370
+    },
+    {
+      "epoch": 1.3412698412698414,
+      "grad_norm": 1.2376341819763184,
+      "learning_rate": 0.001105820105820106,
+      "loss": 0.535,
+      "step": 3380
+    },
+    {
+      "epoch": 1.3452380952380953,
+      "grad_norm": 1.2114317417144775,
+      "learning_rate": 0.0011031746031746033,
+      "loss": 0.4426,
+      "step": 3390
+    },
+    {
+      "epoch": 1.3492063492063493,
+      "grad_norm": 1.3357186317443848,
+      "learning_rate": 0.0011005291005291005,
+      "loss": 0.5123,
+      "step": 3400
+    },
+    {
+      "epoch": 1.3531746031746033,
+      "grad_norm": 1.4146705865859985,
+      "learning_rate": 0.001097883597883598,
+      "loss": 0.6128,
+      "step": 3410
+    },
+    {
+      "epoch": 1.3571428571428572,
+      "grad_norm": 0.6163337230682373,
+      "learning_rate": 0.0010952380952380953,
+      "loss": 0.537,
+      "step": 3420
+    },
+    {
+      "epoch": 1.3611111111111112,
+      "grad_norm": 1.9845856428146362,
+      "learning_rate": 0.0010925925925925925,
+      "loss": 0.5789,
+      "step": 3430
+    },
+    {
+      "epoch": 1.3650793650793651,
+      "grad_norm": 0.7714751958847046,
+      "learning_rate": 0.00108994708994709,
+      "loss": 0.4769,
+      "step": 3440
+    },
+    {
+      "epoch": 1.369047619047619,
+      "grad_norm": 1.3484938144683838,
+      "learning_rate": 0.0010873015873015873,
+      "loss": 0.5798,
+      "step": 3450
+    },
+    {
+      "epoch": 1.373015873015873,
+      "grad_norm": 0.9264288544654846,
+      "learning_rate": 0.0010846560846560847,
+      "loss": 0.4747,
+      "step": 3460
+    },
+    {
+      "epoch": 1.376984126984127,
+      "grad_norm": 0.6862549185752869,
+      "learning_rate": 0.001082010582010582,
+      "loss": 0.4168,
+      "step": 3470
+    },
+    {
+      "epoch": 1.380952380952381,
+      "grad_norm": 0.9308891296386719,
+      "learning_rate": 0.0010793650793650793,
+      "loss": 0.4471,
+      "step": 3480
+    },
+    {
+      "epoch": 1.3849206349206349,
+      "grad_norm": 0.7059733867645264,
+      "learning_rate": 0.0010767195767195767,
+      "loss": 0.4222,
+      "step": 3490
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "grad_norm": 1.4370836019515991,
+      "learning_rate": 0.0010740740740740743,
+      "loss": 0.4517,
+      "step": 3500
+    },
+    {
+      "epoch": 1.3928571428571428,
+      "grad_norm": 1.125847578048706,
+      "learning_rate": 0.0010714285714285715,
+      "loss": 0.6114,
+      "step": 3510
+    },
+    {
+      "epoch": 1.3968253968253967,
+      "grad_norm": 0.4711201786994934,
+      "learning_rate": 0.0010687830687830689,
+      "loss": 0.5844,
+      "step": 3520
+    },
+    {
+      "epoch": 1.4007936507936507,
+      "grad_norm": 1.1563987731933594,
+      "learning_rate": 0.001066137566137566,
+      "loss": 0.6552,
+      "step": 3530
+    },
+    {
+      "epoch": 1.4047619047619047,
+      "grad_norm": 0.5372576117515564,
+      "learning_rate": 0.0010634920634920635,
+      "loss": 0.4234,
+      "step": 3540
+    },
+    {
+      "epoch": 1.4087301587301586,
+      "grad_norm": 0.683944821357727,
+      "learning_rate": 0.0010608465608465609,
+      "loss": 0.5938,
+      "step": 3550
+    },
+    {
+      "epoch": 1.4126984126984126,
+      "grad_norm": 0.6815638542175293,
+      "learning_rate": 0.0010582010582010583,
+      "loss": 0.4907,
+      "step": 3560
+    },
+    {
+      "epoch": 1.4166666666666667,
+      "grad_norm": 1.6569042205810547,
+      "learning_rate": 0.0010555555555555557,
+      "loss": 0.5975,
+      "step": 3570
+    },
+    {
+      "epoch": 1.4206349206349207,
+      "grad_norm": 1.1780049800872803,
+      "learning_rate": 0.001052910052910053,
+      "loss": 0.4291,
+      "step": 3580
+    },
+    {
+      "epoch": 1.4246031746031746,
+      "grad_norm": 0.6545954346656799,
+      "learning_rate": 0.0010502645502645503,
+      "loss": 0.4288,
+      "step": 3590
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.9560195207595825,
+      "learning_rate": 0.0010476190476190477,
+      "loss": 0.4316,
+      "step": 3600
+    },
+    {
+      "epoch": 1.4325396825396826,
+      "grad_norm": 0.9896324276924133,
+      "learning_rate": 0.0010449735449735448,
+      "loss": 0.5869,
+      "step": 3610
+    },
+    {
+      "epoch": 1.4365079365079365,
+      "grad_norm": 1.3985390663146973,
+      "learning_rate": 0.0010423280423280422,
+      "loss": 0.4652,
+      "step": 3620
+    },
+    {
+      "epoch": 1.4404761904761905,
+      "grad_norm": 1.3849400281906128,
+      "learning_rate": 0.0010396825396825399,
+      "loss": 0.6423,
+      "step": 3630
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.9819910526275635,
+      "learning_rate": 0.001037037037037037,
+      "loss": 0.5072,
+      "step": 3640
+    },
+    {
+      "epoch": 1.4484126984126984,
+      "grad_norm": 0.9809389710426331,
+      "learning_rate": 0.0010343915343915345,
+      "loss": 0.4127,
+      "step": 3650
+    },
+    {
+      "epoch": 1.4523809523809523,
+      "grad_norm": 1.3953092098236084,
+      "learning_rate": 0.0010317460317460319,
+      "loss": 0.5313,
+      "step": 3660
+    },
+    {
+      "epoch": 1.4563492063492063,
+      "grad_norm": 1.1159425973892212,
+      "learning_rate": 0.001029100529100529,
+      "loss": 0.4939,
+      "step": 3670
+    },
+    {
+      "epoch": 1.4603174603174602,
+      "grad_norm": 0.5379135608673096,
+      "learning_rate": 0.0010264550264550264,
+      "loss": 0.4107,
+      "step": 3680
+    },
+    {
+      "epoch": 1.4642857142857144,
+      "grad_norm": 1.1204336881637573,
+      "learning_rate": 0.0010238095238095238,
+      "loss": 0.4742,
+      "step": 3690
+    },
+    {
+      "epoch": 1.4682539682539684,
+      "grad_norm": 0.8563843369483948,
+      "learning_rate": 0.0010211640211640212,
+      "loss": 0.5383,
+      "step": 3700
+    },
+    {
+      "epoch": 1.4722222222222223,
+      "grad_norm": 0.7000299096107483,
+      "learning_rate": 0.0010185185185185186,
+      "loss": 0.3803,
+      "step": 3710
+    },
+    {
+      "epoch": 1.4761904761904763,
+      "grad_norm": 1.4893783330917358,
+      "learning_rate": 0.0010158730158730158,
+      "loss": 0.6016,
+      "step": 3720
+    },
+    {
+      "epoch": 1.4801587301587302,
+      "grad_norm": 0.5601296424865723,
+      "learning_rate": 0.0010132275132275132,
+      "loss": 0.3314,
+      "step": 3730
+    },
+    {
+      "epoch": 1.4841269841269842,
+      "grad_norm": 0.5450819730758667,
+      "learning_rate": 0.0010105820105820106,
+      "loss": 0.5151,
+      "step": 3740
+    },
+    {
+      "epoch": 1.4880952380952381,
+      "grad_norm": 0.6305513381958008,
+      "learning_rate": 0.001007936507936508,
+      "loss": 0.5042,
+      "step": 3750
+    },
+    {
+      "epoch": 1.492063492063492,
+      "grad_norm": 1.2684389352798462,
+      "learning_rate": 0.0010052910052910054,
+      "loss": 0.5481,
+      "step": 3760
+    },
+    {
+      "epoch": 1.496031746031746,
+      "grad_norm": 1.5612815618515015,
+      "learning_rate": 0.0010026455026455026,
+      "loss": 0.4791,
+      "step": 3770
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.206734538078308,
+      "learning_rate": 0.001,
+      "loss": 0.6305,
+      "step": 3780
+    },
+    {
+      "epoch": 1.503968253968254,
+      "grad_norm": 1.069503664970398,
+      "learning_rate": 0.0009973544973544974,
+      "loss": 0.5058,
+      "step": 3790
+    },
+    {
+      "epoch": 1.507936507936508,
+      "grad_norm": 0.3658556044101715,
+      "learning_rate": 0.0009947089947089946,
+      "loss": 0.3957,
+      "step": 3800
+    },
+    {
+      "epoch": 1.5119047619047619,
+      "grad_norm": 1.0885382890701294,
+      "learning_rate": 0.000992063492063492,
+      "loss": 0.4997,
+      "step": 3810
+    },
+    {
+      "epoch": 1.5158730158730158,
+      "grad_norm": 0.7469413876533508,
+      "learning_rate": 0.0009894179894179894,
+      "loss": 0.5236,
+      "step": 3820
+    },
+    {
+      "epoch": 1.5198412698412698,
+      "grad_norm": 0.7196665406227112,
+      "learning_rate": 0.0009867724867724868,
+      "loss": 0.5083,
+      "step": 3830
+    },
+    {
+      "epoch": 1.5238095238095237,
+      "grad_norm": 0.6840754151344299,
+      "learning_rate": 0.000984126984126984,
+      "loss": 0.3656,
+      "step": 3840
+    },
+    {
+      "epoch": 1.5277777777777777,
+      "grad_norm": 1.3978683948516846,
+      "learning_rate": 0.0009814814814814816,
+      "loss": 0.4371,
+      "step": 3850
+    },
+    {
+      "epoch": 1.5317460317460316,
+      "grad_norm": 0.5583405494689941,
+      "learning_rate": 0.0009788359788359788,
+      "loss": 0.5274,
+      "step": 3860
+    },
+    {
+      "epoch": 1.5357142857142856,
+      "grad_norm": 1.8063452243804932,
+      "learning_rate": 0.0009761904761904762,
+      "loss": 0.6141,
+      "step": 3870
+    },
+    {
+      "epoch": 1.5396825396825395,
+      "grad_norm": 0.9723803400993347,
+      "learning_rate": 0.0009735449735449735,
+      "loss": 0.5883,
+      "step": 3880
+    },
+    {
+      "epoch": 1.5436507936507935,
+      "grad_norm": 0.30504110455513,
+      "learning_rate": 0.0009708994708994709,
+      "loss": 0.5268,
+      "step": 3890
+    },
+    {
+      "epoch": 1.5476190476190477,
+      "grad_norm": 0.6150854229927063,
+      "learning_rate": 0.0009682539682539683,
+      "loss": 0.5819,
+      "step": 3900
+    },
+    {
+      "epoch": 1.5515873015873016,
+      "grad_norm": 1.4445383548736572,
+      "learning_rate": 0.0009656084656084656,
+      "loss": 0.4811,
+      "step": 3910
+    },
+    {
+      "epoch": 1.5555555555555556,
+      "grad_norm": 0.662739634513855,
+      "learning_rate": 0.0009629629629629629,
+      "loss": 0.3249,
+      "step": 3920
+    },
+    {
+      "epoch": 1.5595238095238095,
+      "grad_norm": 0.6104711294174194,
+      "learning_rate": 0.0009603174603174604,
+      "loss": 0.3257,
+      "step": 3930
+    },
+    {
+      "epoch": 1.5634920634920635,
+      "grad_norm": 0.6666992902755737,
+      "learning_rate": 0.0009576719576719577,
+      "loss": 0.4348,
+      "step": 3940
+    },
+    {
+      "epoch": 1.5674603174603174,
+      "grad_norm": 1.3601847887039185,
+      "learning_rate": 0.000955026455026455,
+      "loss": 0.6486,
+      "step": 3950
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "grad_norm": 1.4528306722640991,
+      "learning_rate": 0.0009523809523809524,
+      "loss": 0.5363,
+      "step": 3960
+    },
+    {
+      "epoch": 1.5753968253968254,
+      "grad_norm": 0.8328957557678223,
+      "learning_rate": 0.0009497354497354498,
+      "loss": 0.4086,
+      "step": 3970
+    },
+    {
+      "epoch": 1.5793650793650795,
+      "grad_norm": 0.6136783361434937,
+      "learning_rate": 0.0009470899470899471,
+      "loss": 0.5066,
+      "step": 3980
+    },
+    {
+      "epoch": 1.5833333333333335,
+      "grad_norm": 1.767379641532898,
+      "learning_rate": 0.0009444444444444445,
+      "loss": 0.4943,
+      "step": 3990
+    },
+    {
+      "epoch": 1.5873015873015874,
+      "grad_norm": 0.50275719165802,
+      "learning_rate": 0.0009417989417989418,
+      "loss": 0.5513,
+      "step": 4000
+    },
+    {
+      "epoch": 1.5912698412698414,
+      "grad_norm": 0.9671531915664673,
+      "learning_rate": 0.0009391534391534392,
+      "loss": 0.4399,
+      "step": 4010
+    },
+    {
+      "epoch": 1.5952380952380953,
+      "grad_norm": 1.084027647972107,
+      "learning_rate": 0.0009365079365079366,
+      "loss": 0.4851,
+      "step": 4020
+    },
+    {
+      "epoch": 1.5992063492063493,
+      "grad_norm": 0.944523274898529,
+      "learning_rate": 0.0009338624338624339,
+      "loss": 0.4931,
+      "step": 4030
+    },
+    {
+      "epoch": 1.6031746031746033,
+      "grad_norm": 0.7656432390213013,
+      "learning_rate": 0.0009312169312169312,
+      "loss": 0.4205,
+      "step": 4040
+    },
+    {
+      "epoch": 1.6071428571428572,
+      "grad_norm": 1.1295371055603027,
+      "learning_rate": 0.0009285714285714287,
+      "loss": 0.5147,
+      "step": 4050
+    },
+    {
+      "epoch": 1.6111111111111112,
+      "grad_norm": 1.0330742597579956,
+      "learning_rate": 0.000925925925925926,
+      "loss": 0.533,
+      "step": 4060
+    },
+    {
+      "epoch": 1.6150793650793651,
+      "grad_norm": 0.4578189253807068,
+      "learning_rate": 0.0009232804232804233,
+      "loss": 0.5074,
+      "step": 4070
+    },
+    {
+      "epoch": 1.619047619047619,
+      "grad_norm": 0.9493432641029358,
+      "learning_rate": 0.0009206349206349207,
+      "loss": 0.4964,
+      "step": 4080
+    },
+    {
+      "epoch": 1.623015873015873,
+      "grad_norm": 1.229602336883545,
+      "learning_rate": 0.0009179894179894181,
+      "loss": 0.4398,
+      "step": 4090
+    },
+    {
+      "epoch": 1.626984126984127,
+      "grad_norm": 1.182271957397461,
+      "learning_rate": 0.0009153439153439154,
+      "loss": 0.6409,
+      "step": 4100
+    },
+    {
+      "epoch": 1.630952380952381,
+      "grad_norm": 1.6596875190734863,
+      "learning_rate": 0.0009126984126984126,
+      "loss": 0.3213,
+      "step": 4110
+    },
+    {
+      "epoch": 1.6349206349206349,
+      "grad_norm": 0.9412317276000977,
+      "learning_rate": 0.00091005291005291,
+      "loss": 0.3736,
+      "step": 4120
+    },
+    {
+      "epoch": 1.6388888888888888,
+      "grad_norm": 1.4627306461334229,
+      "learning_rate": 0.0009074074074074074,
+      "loss": 0.5455,
+      "step": 4130
+    },
+    {
+      "epoch": 1.6428571428571428,
+      "grad_norm": 0.4782025218009949,
+      "learning_rate": 0.0009047619047619047,
+      "loss": 0.4291,
+      "step": 4140
+    },
+    {
+      "epoch": 1.6468253968253967,
+      "grad_norm": 0.907647430896759,
+      "learning_rate": 0.0009021164021164021,
+      "loss": 0.425,
+      "step": 4150
+    },
+    {
+      "epoch": 1.6507936507936507,
+      "grad_norm": 0.8232408761978149,
+      "learning_rate": 0.0008994708994708994,
+      "loss": 0.5669,
+      "step": 4160
+    },
+    {
+      "epoch": 1.6547619047619047,
+      "grad_norm": 0.5824115872383118,
+      "learning_rate": 0.0008968253968253968,
+      "loss": 0.4001,
+      "step": 4170
+    },
+    {
+      "epoch": 1.6587301587301586,
+      "grad_norm": 0.7836323976516724,
+      "learning_rate": 0.0008941798941798942,
+      "loss": 0.5163,
+      "step": 4180
+    },
+    {
+      "epoch": 1.6626984126984126,
+      "grad_norm": 0.9716808795928955,
+      "learning_rate": 0.0008915343915343915,
+      "loss": 0.4334,
+      "step": 4190
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.46734583377838135,
+      "learning_rate": 0.0008888888888888888,
+      "loss": 0.5188,
+      "step": 4200
+    },
+    {
+      "epoch": 1.6706349206349205,
+      "grad_norm": 1.1709452867507935,
+      "learning_rate": 0.0008862433862433863,
+      "loss": 0.4686,
+      "step": 4210
+    },
+    {
+      "epoch": 1.6746031746031746,
+      "grad_norm": 0.9173301458358765,
+      "learning_rate": 0.0008835978835978836,
+      "loss": 0.5697,
+      "step": 4220
+    },
+    {
+      "epoch": 1.6785714285714286,
+      "grad_norm": 1.190338134765625,
+      "learning_rate": 0.0008809523809523809,
+      "loss": 0.4324,
+      "step": 4230
+    },
+    {
+      "epoch": 1.6825396825396826,
+      "grad_norm": 1.278975248336792,
+      "learning_rate": 0.0008783068783068783,
+      "loss": 0.4726,
+      "step": 4240
+    },
+    {
+      "epoch": 1.6865079365079365,
+      "grad_norm": 0.8761826157569885,
+      "learning_rate": 0.0008756613756613757,
+      "loss": 0.5543,
+      "step": 4250
+    },
+    {
+      "epoch": 1.6904761904761905,
+      "grad_norm": 0.8508326411247253,
+      "learning_rate": 0.000873015873015873,
+      "loss": 0.4135,
+      "step": 4260
+    },
+    {
+      "epoch": 1.6944444444444444,
+      "grad_norm": 0.8877843618392944,
+      "learning_rate": 0.0008703703703703704,
+      "loss": 0.3276,
+      "step": 4270
+    },
+    {
+      "epoch": 1.6984126984126984,
+      "grad_norm": 2.439880609512329,
+      "learning_rate": 0.0008677248677248677,
+      "loss": 0.3676,
+      "step": 4280
+    },
+    {
+      "epoch": 1.7023809523809523,
+      "grad_norm": 1.454038143157959,
+      "learning_rate": 0.0008650793650793651,
+      "loss": 0.6022,
+      "step": 4290
+    },
+    {
+      "epoch": 1.7063492063492065,
+      "grad_norm": 0.6033250093460083,
+      "learning_rate": 0.0008624338624338625,
+      "loss": 0.4068,
+      "step": 4300
+    },
+    {
+      "epoch": 1.7103174603174605,
+      "grad_norm": 0.7904770374298096,
+      "learning_rate": 0.0008597883597883598,
+      "loss": 0.4654,
+      "step": 4310
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 1.0783374309539795,
+      "learning_rate": 0.0008571428571428571,
+      "loss": 0.326,
+      "step": 4320
+    },
+    {
+      "epoch": 1.7182539682539684,
+      "grad_norm": 0.923893928527832,
+      "learning_rate": 0.0008544973544973545,
+      "loss": 0.4963,
+      "step": 4330
+    },
+    {
+      "epoch": 1.7222222222222223,
+      "grad_norm": 0.36724144220352173,
+      "learning_rate": 0.0008518518518518519,
+      "loss": 0.3676,
+      "step": 4340
+    },
+    {
+      "epoch": 1.7261904761904763,
+      "grad_norm": 1.1232455968856812,
+      "learning_rate": 0.0008492063492063492,
+      "loss": 0.4462,
+      "step": 4350
+    },
+    {
+      "epoch": 1.7301587301587302,
+      "grad_norm": 1.3588309288024902,
+      "learning_rate": 0.0008465608465608465,
+      "loss": 0.4954,
+      "step": 4360
+    },
+    {
+      "epoch": 1.7341269841269842,
+      "grad_norm": 1.016571283340454,
+      "learning_rate": 0.000843915343915344,
+      "loss": 0.5502,
+      "step": 4370
+    },
+    {
+      "epoch": 1.7380952380952381,
+      "grad_norm": 0.9546862244606018,
+      "learning_rate": 0.0008412698412698413,
+      "loss": 0.4983,
+      "step": 4380
+    },
+    {
+      "epoch": 1.742063492063492,
+      "grad_norm": 1.2846907377243042,
+      "learning_rate": 0.0008386243386243386,
+      "loss": 0.6217,
+      "step": 4390
+    },
+    {
+      "epoch": 1.746031746031746,
+      "grad_norm": 1.4156478643417358,
+      "learning_rate": 0.000835978835978836,
+      "loss": 0.5286,
+      "step": 4400
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.692659318447113,
+      "learning_rate": 0.0008333333333333334,
+      "loss": 0.4571,
+      "step": 4410
+    },
+    {
+      "epoch": 1.753968253968254,
+      "grad_norm": 0.5487807989120483,
+      "learning_rate": 0.0008306878306878307,
+      "loss": 0.4216,
+      "step": 4420
+    },
+    {
+      "epoch": 1.757936507936508,
+      "grad_norm": 0.4136459529399872,
+      "learning_rate": 0.0008280423280423281,
+      "loss": 0.3706,
+      "step": 4430
+    },
+    {
+      "epoch": 1.7619047619047619,
+      "grad_norm": 2.730607748031616,
+      "learning_rate": 0.0008253968253968254,
+      "loss": 0.4341,
+      "step": 4440
+    },
+    {
+      "epoch": 1.7658730158730158,
+      "grad_norm": 1.0752816200256348,
+      "learning_rate": 0.0008227513227513228,
+      "loss": 0.6466,
+      "step": 4450
+    },
+    {
+      "epoch": 1.7698412698412698,
+      "grad_norm": 0.9848162531852722,
+      "learning_rate": 0.0008201058201058202,
+      "loss": 0.5904,
+      "step": 4460
+    },
+    {
+      "epoch": 1.7738095238095237,
+      "grad_norm": 1.4132823944091797,
+      "learning_rate": 0.0008174603174603175,
+      "loss": 0.4528,
+      "step": 4470
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 0.8410534858703613,
+      "learning_rate": 0.0008148148148148148,
+      "loss": 0.431,
+      "step": 4480
+    },
+    {
+      "epoch": 1.7817460317460316,
+      "grad_norm": 0.7188355922698975,
+      "learning_rate": 0.0008121693121693123,
+      "loss": 0.5955,
+      "step": 4490
+    },
+    {
+      "epoch": 1.7857142857142856,
+      "grad_norm": 0.8639283776283264,
+      "learning_rate": 0.0008095238095238096,
+      "loss": 0.4491,
+      "step": 4500
+    },
+    {
+      "epoch": 1.7896825396825395,
+      "grad_norm": 1.0643069744110107,
+      "learning_rate": 0.0008068783068783069,
+      "loss": 0.5299,
+      "step": 4510
+    },
+    {
+      "epoch": 1.7936507936507935,
+      "grad_norm": 1.1698801517486572,
+      "learning_rate": 0.0008042328042328042,
+      "loss": 0.5063,
+      "step": 4520
+    },
+    {
+      "epoch": 1.7976190476190477,
+      "grad_norm": 1.222699522972107,
+      "learning_rate": 0.0008015873015873017,
+      "loss": 0.4743,
+      "step": 4530
+    },
+    {
+      "epoch": 1.8015873015873016,
+      "grad_norm": 0.8404491543769836,
+      "learning_rate": 0.000798941798941799,
+      "loss": 0.4811,
+      "step": 4540
+    },
+    {
+      "epoch": 1.8055555555555556,
+      "grad_norm": 0.7801256775856018,
+      "learning_rate": 0.0007962962962962962,
+      "loss": 0.5615,
+      "step": 4550
+    },
+    {
+      "epoch": 1.8095238095238095,
+      "grad_norm": 0.735230565071106,
+      "learning_rate": 0.0007936507936507937,
+      "loss": 0.4375,
+      "step": 4560
+    },
+    {
+      "epoch": 1.8134920634920635,
+      "grad_norm": 0.8510635495185852,
+      "learning_rate": 0.000791005291005291,
+      "loss": 0.4999,
+      "step": 4570
+    },
+    {
+      "epoch": 1.8174603174603174,
+      "grad_norm": 1.2653560638427734,
+      "learning_rate": 0.0007883597883597883,
+      "loss": 0.4563,
+      "step": 4580
+    },
+    {
+      "epoch": 1.8214285714285714,
+      "grad_norm": 0.475337952375412,
+      "learning_rate": 0.0007857142857142857,
+      "loss": 0.3737,
+      "step": 4590
+    },
+    {
+      "epoch": 1.8253968253968254,
+      "grad_norm": 0.6187211871147156,
+      "learning_rate": 0.000783068783068783,
+      "loss": 0.5287,
+      "step": 4600
+    },
+    {
+      "epoch": 1.8293650793650795,
+      "grad_norm": 1.4211279153823853,
+      "learning_rate": 0.0007804232804232804,
+      "loss": 0.522,
+      "step": 4610
+    },
+    {
+      "epoch": 1.8333333333333335,
+      "grad_norm": 1.4588719606399536,
+      "learning_rate": 0.0007777777777777778,
+      "loss": 0.6195,
+      "step": 4620
+    },
+    {
+      "epoch": 1.8373015873015874,
+      "grad_norm": 0.5156915783882141,
+      "learning_rate": 0.0007751322751322751,
+      "loss": 0.5105,
+      "step": 4630
+    },
+    {
+      "epoch": 1.8412698412698414,
+      "grad_norm": 0.9501180648803711,
+      "learning_rate": 0.0007724867724867724,
+      "loss": 0.4081,
+      "step": 4640
+    },
+    {
+      "epoch": 1.8452380952380953,
+      "grad_norm": 0.45203983783721924,
+      "learning_rate": 0.0007698412698412699,
+      "loss": 0.4493,
+      "step": 4650
+    },
+    {
+      "epoch": 1.8492063492063493,
+      "grad_norm": 0.4670614004135132,
+      "learning_rate": 0.0007671957671957672,
+      "loss": 0.3351,
+      "step": 4660
+    },
+    {
+      "epoch": 1.8531746031746033,
+      "grad_norm": 0.9876275062561035,
+      "learning_rate": 0.0007645502645502645,
+      "loss": 0.5741,
+      "step": 4670
+    },
+    {
+      "epoch": 1.8571428571428572,
+      "grad_norm": 0.8845266103744507,
+      "learning_rate": 0.0007619047619047619,
+      "loss": 0.4142,
+      "step": 4680
+    },
+    {
+      "epoch": 1.8611111111111112,
+      "grad_norm": 0.7441647052764893,
+      "learning_rate": 0.0007592592592592593,
+      "loss": 0.4072,
+      "step": 4690
+    },
+    {
+      "epoch": 1.8650793650793651,
+      "grad_norm": 0.9643361568450928,
+      "learning_rate": 0.0007566137566137566,
+      "loss": 0.5352,
+      "step": 4700
+    },
+    {
+      "epoch": 1.869047619047619,
+      "grad_norm": 0.8456591367721558,
+      "learning_rate": 0.000753968253968254,
+      "loss": 0.5337,
+      "step": 4710
+    },
+    {
+      "epoch": 1.873015873015873,
+      "grad_norm": 1.6536881923675537,
+      "learning_rate": 0.0007513227513227513,
+      "loss": 0.4462,
+      "step": 4720
+    },
+    {
+      "epoch": 1.876984126984127,
+      "grad_norm": 0.6966465711593628,
+      "learning_rate": 0.0007486772486772487,
+      "loss": 0.7229,
+      "step": 4730
+    },
+    {
+      "epoch": 1.880952380952381,
+      "grad_norm": 0.9560131430625916,
+      "learning_rate": 0.000746031746031746,
+      "loss": 0.4636,
+      "step": 4740
+    },
+    {
+      "epoch": 1.8849206349206349,
+      "grad_norm": 0.6783252358436584,
+      "learning_rate": 0.0007433862433862434,
+      "loss": 0.5508,
+      "step": 4750
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.5133827328681946,
+      "learning_rate": 0.0007407407407407407,
+      "loss": 0.3607,
+      "step": 4760
+    },
+    {
+      "epoch": 1.8928571428571428,
+      "grad_norm": 0.6404028534889221,
+      "learning_rate": 0.0007380952380952381,
+      "loss": 0.5246,
+      "step": 4770
+    },
+    {
+      "epoch": 1.8968253968253967,
+      "grad_norm": 0.7048952579498291,
+      "learning_rate": 0.0007354497354497355,
+      "loss": 0.4684,
+      "step": 4780
+    },
+    {
+      "epoch": 1.9007936507936507,
+      "grad_norm": 0.5569032430648804,
+      "learning_rate": 0.0007328042328042328,
+      "loss": 0.3872,
+      "step": 4790
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.5231502652168274,
+      "learning_rate": 0.0007301587301587301,
+      "loss": 0.3945,
+      "step": 4800
+    },
+    {
+      "epoch": 1.9087301587301586,
+      "grad_norm": 1.2239683866500854,
+      "learning_rate": 0.0007275132275132276,
+      "loss": 0.5389,
+      "step": 4810
+    },
+    {
+      "epoch": 1.9126984126984126,
+      "grad_norm": 1.5195467472076416,
+      "learning_rate": 0.0007248677248677249,
+      "loss": 0.5958,
+      "step": 4820
+    },
+    {
+      "epoch": 1.9166666666666665,
+      "grad_norm": 0.8116055727005005,
+      "learning_rate": 0.0007222222222222222,
+      "loss": 0.3983,
+      "step": 4830
+    },
+    {
+      "epoch": 1.9206349206349205,
+      "grad_norm": 1.067101240158081,
+      "learning_rate": 0.0007195767195767196,
+      "loss": 0.4035,
+      "step": 4840
+    },
+    {
+      "epoch": 1.9246031746031746,
+      "grad_norm": 0.9318575263023376,
+      "learning_rate": 0.000716931216931217,
+      "loss": 0.5596,
+      "step": 4850
+    },
+    {
+      "epoch": 1.9285714285714286,
+      "grad_norm": 1.1583409309387207,
+      "learning_rate": 0.0007142857142857143,
+      "loss": 0.3916,
+      "step": 4860
+    },
+    {
+      "epoch": 1.9325396825396826,
+      "grad_norm": 0.813510000705719,
+      "learning_rate": 0.0007116402116402117,
+      "loss": 0.5113,
+      "step": 4870
+    },
+    {
+      "epoch": 1.9365079365079365,
+      "grad_norm": 1.5386079549789429,
+      "learning_rate": 0.000708994708994709,
+      "loss": 0.5188,
+      "step": 4880
+    },
+    {
+      "epoch": 1.9404761904761905,
+      "grad_norm": 1.1007848978042603,
+      "learning_rate": 0.0007063492063492064,
+      "loss": 0.5993,
+      "step": 4890
+    },
+    {
+      "epoch": 1.9444444444444444,
+      "grad_norm": 2.2866406440734863,
+      "learning_rate": 0.0007037037037037038,
+      "loss": 0.6263,
+      "step": 4900
+    },
+    {
+      "epoch": 1.9484126984126984,
+      "grad_norm": 1.5539257526397705,
+      "learning_rate": 0.0007010582010582011,
+      "loss": 0.4029,
+      "step": 4910
+    },
+    {
+      "epoch": 1.9523809523809523,
+      "grad_norm": 0.9776302576065063,
+      "learning_rate": 0.0006984126984126984,
+      "loss": 0.45,
+      "step": 4920
+    },
+    {
+      "epoch": 1.9563492063492065,
+      "grad_norm": 0.7598035335540771,
+      "learning_rate": 0.0006957671957671959,
+      "loss": 0.4909,
+      "step": 4930
+    },
+    {
+      "epoch": 1.9603174603174605,
+      "grad_norm": 1.1056677103042603,
+      "learning_rate": 0.0006931216931216932,
+      "loss": 0.4979,
+      "step": 4940
+    },
+    {
+      "epoch": 1.9642857142857144,
+      "grad_norm": 0.796816349029541,
+      "learning_rate": 0.0006904761904761905,
+      "loss": 0.4591,
+      "step": 4950
+    },
+    {
+      "epoch": 1.9682539682539684,
+      "grad_norm": 0.5265449285507202,
+      "learning_rate": 0.0006878306878306878,
+      "loss": 0.3988,
+      "step": 4960
+    },
+    {
+      "epoch": 1.9722222222222223,
+      "grad_norm": 0.3230462074279785,
+      "learning_rate": 0.0006851851851851853,
+      "loss": 0.4432,
+      "step": 4970
+    },
+    {
+      "epoch": 1.9761904761904763,
+      "grad_norm": 1.2444729804992676,
+      "learning_rate": 0.0006825396825396826,
+      "loss": 0.3928,
+      "step": 4980
+    },
+    {
+      "epoch": 1.9801587301587302,
+      "grad_norm": 0.7676456570625305,
+      "learning_rate": 0.0006798941798941799,
+      "loss": 0.4148,
+      "step": 4990
+    },
+    {
+      "epoch": 1.9841269841269842,
+      "grad_norm": 1.05657160282135,
+      "learning_rate": 0.0006772486772486773,
+      "loss": 0.5529,
+      "step": 5000
+    },
+    {
+      "epoch": 1.9880952380952381,
+      "grad_norm": 0.4994324743747711,
+      "learning_rate": 0.0006746031746031747,
+      "loss": 0.3886,
+      "step": 5010
+    },
+    {
+      "epoch": 1.992063492063492,
+      "grad_norm": 1.1352735757827759,
+      "learning_rate": 0.000671957671957672,
+      "loss": 0.5021,
+      "step": 5020
+    },
+    {
+      "epoch": 1.996031746031746,
+      "grad_norm": 1.0702826976776123,
+      "learning_rate": 0.0006693121693121694,
+      "loss": 0.7276,
+      "step": 5030
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.9455626010894775,
+      "learning_rate": 0.0006666666666666666,
+      "loss": 0.5663,
+      "step": 5040
+    },
+    {
+      "epoch": 2.003968253968254,
+      "grad_norm": 0.6044638156890869,
+      "learning_rate": 0.000664021164021164,
+      "loss": 0.3748,
+      "step": 5050
+    },
+    {
+      "epoch": 2.007936507936508,
+      "grad_norm": 0.8124226927757263,
+      "learning_rate": 0.0006613756613756614,
+      "loss": 0.3476,
+      "step": 5060
+    },
+    {
+      "epoch": 2.011904761904762,
+      "grad_norm": 0.5022208094596863,
+      "learning_rate": 0.0006587301587301587,
+      "loss": 0.313,
+      "step": 5070
+    },
+    {
+      "epoch": 2.015873015873016,
+      "grad_norm": 0.5413989424705505,
+      "learning_rate": 0.000656084656084656,
+      "loss": 0.3359,
+      "step": 5080
+    },
+    {
+      "epoch": 2.0198412698412698,
+      "grad_norm": 0.8055435419082642,
+      "learning_rate": 0.0006534391534391535,
+      "loss": 0.3807,
+      "step": 5090
+    },
+    {
+      "epoch": 2.0238095238095237,
+      "grad_norm": 1.5344974994659424,
+      "learning_rate": 0.0006507936507936508,
+      "loss": 0.3913,
+      "step": 5100
+    },
+    {
+      "epoch": 2.0277777777777777,
+      "grad_norm": 0.6911923289299011,
+      "learning_rate": 0.0006481481481481481,
+      "loss": 0.419,
+      "step": 5110
+    },
+    {
+      "epoch": 2.0317460317460316,
+      "grad_norm": 0.9209279417991638,
+      "learning_rate": 0.0006455026455026455,
+      "loss": 0.4246,
+      "step": 5120
+    },
+    {
+      "epoch": 2.0357142857142856,
+      "grad_norm": 0.7789056897163391,
+      "learning_rate": 0.0006428571428571429,
+      "loss": 0.3487,
+      "step": 5130
+    },
+    {
+      "epoch": 2.0396825396825395,
+      "grad_norm": 1.2143142223358154,
+      "learning_rate": 0.0006402116402116402,
+      "loss": 0.3457,
+      "step": 5140
+    },
+    {
+      "epoch": 2.0436507936507935,
+      "grad_norm": 1.2130590677261353,
+      "learning_rate": 0.0006375661375661375,
+      "loss": 0.4553,
+      "step": 5150
+    },
+    {
+      "epoch": 2.0476190476190474,
+      "grad_norm": 1.1139146089553833,
+      "learning_rate": 0.0006349206349206349,
+      "loss": 0.3192,
+      "step": 5160
+    },
+    {
+      "epoch": 2.0515873015873014,
+      "grad_norm": 0.9016938805580139,
+      "learning_rate": 0.0006322751322751323,
+      "loss": 0.284,
+      "step": 5170
+    },
+    {
+      "epoch": 2.0555555555555554,
+      "grad_norm": 1.2442255020141602,
+      "learning_rate": 0.0006296296296296296,
+      "loss": 0.3583,
+      "step": 5180
+    },
+    {
+      "epoch": 2.0595238095238093,
+      "grad_norm": 1.756134271621704,
+      "learning_rate": 0.000626984126984127,
+      "loss": 0.4804,
+      "step": 5190
+    },
+    {
+      "epoch": 2.0634920634920633,
+      "grad_norm": 0.9567892551422119,
+      "learning_rate": 0.0006243386243386243,
+      "loss": 0.3952,
+      "step": 5200
+    },
+    {
+      "epoch": 2.0674603174603177,
+      "grad_norm": 0.391501784324646,
+      "learning_rate": 0.0006216931216931217,
+      "loss": 0.3147,
+      "step": 5210
+    },
+    {
+      "epoch": 2.0714285714285716,
+      "grad_norm": 0.6419145464897156,
+      "learning_rate": 0.0006190476190476191,
+      "loss": 0.2739,
+      "step": 5220
+    },
+    {
+      "epoch": 2.0753968253968256,
+      "grad_norm": 0.8622870445251465,
+      "learning_rate": 0.0006164021164021164,
+      "loss": 0.3093,
+      "step": 5230
+    },
+    {
+      "epoch": 2.0793650793650795,
+      "grad_norm": 0.5181304812431335,
+      "learning_rate": 0.0006137566137566137,
+      "loss": 0.3315,
+      "step": 5240
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": 0.9292448163032532,
+      "learning_rate": 0.0006111111111111112,
+      "loss": 0.3058,
+      "step": 5250
+    },
+    {
+      "epoch": 2.0873015873015874,
+      "grad_norm": 0.8386250734329224,
+      "learning_rate": 0.0006084656084656085,
+      "loss": 0.3641,
+      "step": 5260
+    },
+    {
+      "epoch": 2.0912698412698414,
+      "grad_norm": 0.7679039239883423,
+      "learning_rate": 0.0006058201058201058,
+      "loss": 0.4084,
+      "step": 5270
+    },
+    {
+      "epoch": 2.0952380952380953,
+      "grad_norm": 0.8268955945968628,
+      "learning_rate": 0.0006031746031746032,
+      "loss": 0.2579,
+      "step": 5280
+    },
+    {
+      "epoch": 2.0992063492063493,
+      "grad_norm": 0.9601532220840454,
+      "learning_rate": 0.0006005291005291006,
+      "loss": 0.3987,
+      "step": 5290
+    },
+    {
+      "epoch": 2.1031746031746033,
+      "grad_norm": 0.5090093612670898,
+      "learning_rate": 0.0005978835978835979,
+      "loss": 0.2707,
+      "step": 5300
+    },
+    {
+      "epoch": 2.107142857142857,
+      "grad_norm": 1.2176988124847412,
+      "learning_rate": 0.0005952380952380953,
+      "loss": 0.3985,
+      "step": 5310
+    },
+    {
+      "epoch": 2.111111111111111,
+      "grad_norm": 0.5723254680633545,
+      "learning_rate": 0.0005925925925925926,
+      "loss": 0.2727,
+      "step": 5320
+    },
+    {
+      "epoch": 2.115079365079365,
+      "grad_norm": 0.6939801573753357,
+      "learning_rate": 0.00058994708994709,
+      "loss": 0.3126,
+      "step": 5330
+    },
+    {
+      "epoch": 2.119047619047619,
+      "grad_norm": 0.9729529619216919,
+      "learning_rate": 0.0005873015873015874,
+      "loss": 0.3971,
+      "step": 5340
+    },
+    {
+      "epoch": 2.123015873015873,
+      "grad_norm": 0.7717307806015015,
+      "learning_rate": 0.0005846560846560847,
+      "loss": 0.3643,
+      "step": 5350
+    },
+    {
+      "epoch": 2.126984126984127,
+      "grad_norm": 0.8346803784370422,
+      "learning_rate": 0.000582010582010582,
+      "loss": 0.4888,
+      "step": 5360
+    },
+    {
+      "epoch": 2.130952380952381,
+      "grad_norm": 0.6180922389030457,
+      "learning_rate": 0.0005793650793650794,
+      "loss": 0.36,
+      "step": 5370
+    },
+    {
+      "epoch": 2.134920634920635,
+      "grad_norm": 0.6804755926132202,
+      "learning_rate": 0.0005767195767195768,
+      "loss": 0.3333,
+      "step": 5380
+    },
+    {
+      "epoch": 2.138888888888889,
+      "grad_norm": 1.1156859397888184,
+      "learning_rate": 0.0005740740740740741,
+      "loss": 0.3147,
+      "step": 5390
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 1.1516271829605103,
+      "learning_rate": 0.0005714285714285714,
+      "loss": 0.3723,
+      "step": 5400
+    },
+    {
+      "epoch": 2.1468253968253967,
+      "grad_norm": 0.7301619648933411,
+      "learning_rate": 0.0005687830687830689,
+      "loss": 0.2972,
+      "step": 5410
+    },
+    {
+      "epoch": 2.1507936507936507,
+      "grad_norm": 0.46621087193489075,
+      "learning_rate": 0.0005661375661375662,
+      "loss": 0.4814,
+      "step": 5420
+    },
+    {
+      "epoch": 2.1547619047619047,
+      "grad_norm": 1.4534790515899658,
+      "learning_rate": 0.0005634920634920635,
+      "loss": 0.3715,
+      "step": 5430
+    },
+    {
+      "epoch": 2.1587301587301586,
+      "grad_norm": 1.0283761024475098,
+      "learning_rate": 0.0005608465608465609,
+      "loss": 0.3096,
+      "step": 5440
+    },
+    {
+      "epoch": 2.1626984126984126,
+      "grad_norm": 1.517444372177124,
+      "learning_rate": 0.0005582010582010583,
+      "loss": 0.4282,
+      "step": 5450
+    },
+    {
+      "epoch": 2.1666666666666665,
+      "grad_norm": 1.1968739032745361,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 0.5653,
+      "step": 5460
+    },
+    {
+      "epoch": 2.1706349206349205,
+      "grad_norm": 0.8281181454658508,
+      "learning_rate": 0.000552910052910053,
+      "loss": 0.3734,
+      "step": 5470
+    },
+    {
+      "epoch": 2.1746031746031744,
+      "grad_norm": 0.825985312461853,
+      "learning_rate": 0.0005502645502645502,
+      "loss": 0.2995,
+      "step": 5480
+    },
+    {
+      "epoch": 2.1785714285714284,
+      "grad_norm": 1.011702060699463,
+      "learning_rate": 0.0005476190476190477,
+      "loss": 0.3567,
+      "step": 5490
+    },
+    {
+      "epoch": 2.1825396825396823,
+      "grad_norm": 1.0061122179031372,
+      "learning_rate": 0.000544973544973545,
+      "loss": 0.3029,
+      "step": 5500
+    },
+    {
+      "epoch": 2.1865079365079367,
+      "grad_norm": 0.7219818234443665,
+      "learning_rate": 0.0005423280423280423,
+      "loss": 0.2957,
+      "step": 5510
+    },
+    {
+      "epoch": 2.1904761904761907,
+      "grad_norm": 0.8629070520401001,
+      "learning_rate": 0.0005396825396825396,
+      "loss": 0.4025,
+      "step": 5520
+    },
+    {
+      "epoch": 2.1944444444444446,
+      "grad_norm": 0.846489429473877,
+      "learning_rate": 0.0005370370370370371,
+      "loss": 0.2563,
+      "step": 5530
+    },
+    {
+      "epoch": 2.1984126984126986,
+      "grad_norm": 1.013261079788208,
+      "learning_rate": 0.0005343915343915344,
+      "loss": 0.3241,
+      "step": 5540
+    },
+    {
+      "epoch": 2.2023809523809526,
+      "grad_norm": 1.1884324550628662,
+      "learning_rate": 0.0005317460317460317,
+      "loss": 0.3218,
+      "step": 5550
+    },
+    {
+      "epoch": 2.2063492063492065,
+      "grad_norm": 0.6852754950523376,
+      "learning_rate": 0.0005291005291005291,
+      "loss": 0.307,
+      "step": 5560
+    },
+    {
+      "epoch": 2.2103174603174605,
+      "grad_norm": 0.8409839272499084,
+      "learning_rate": 0.0005264550264550265,
+      "loss": 0.2683,
+      "step": 5570
+    },
+    {
+      "epoch": 2.2142857142857144,
+      "grad_norm": 0.6928064823150635,
+      "learning_rate": 0.0005238095238095238,
+      "loss": 0.3296,
+      "step": 5580
+    },
+    {
+      "epoch": 2.2182539682539684,
+      "grad_norm": 0.48399004340171814,
+      "learning_rate": 0.0005211640211640211,
+      "loss": 0.2453,
+      "step": 5590
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 0.6243159174919128,
+      "learning_rate": 0.0005185185185185185,
+      "loss": 0.4726,
+      "step": 5600
+    },
+    {
+      "epoch": 2.2261904761904763,
+      "grad_norm": 0.8214319944381714,
+      "learning_rate": 0.0005158730158730159,
+      "loss": 0.3647,
+      "step": 5610
+    },
+    {
+      "epoch": 2.2301587301587302,
+      "grad_norm": 1.350664496421814,
+      "learning_rate": 0.0005132275132275132,
+      "loss": 0.3515,
+      "step": 5620
+    },
+    {
+      "epoch": 2.234126984126984,
+      "grad_norm": 0.598360002040863,
+      "learning_rate": 0.0005105820105820106,
+      "loss": 0.4379,
+      "step": 5630
+    },
+    {
+      "epoch": 2.238095238095238,
+      "grad_norm": 0.744739830493927,
+      "learning_rate": 0.0005079365079365079,
+      "loss": 0.3241,
+      "step": 5640
+    },
+    {
+      "epoch": 2.242063492063492,
+      "grad_norm": 9.888148307800293,
+      "learning_rate": 0.0005052910052910053,
+      "loss": 0.2648,
+      "step": 5650
+    },
+    {
+      "epoch": 2.246031746031746,
+      "grad_norm": 0.5198895931243896,
+      "learning_rate": 0.0005026455026455027,
+      "loss": 0.3836,
+      "step": 5660
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.9944855570793152,
+      "learning_rate": 0.0005,
+      "loss": 0.3527,
+      "step": 5670
+    },
+    {
+      "epoch": 2.253968253968254,
+      "grad_norm": 0.8176829218864441,
+      "learning_rate": 0.0004973544973544973,
+      "loss": 0.3,
+      "step": 5680
+    },
+    {
+      "epoch": 2.257936507936508,
+      "grad_norm": 0.37834370136260986,
+      "learning_rate": 0.0004947089947089947,
+      "loss": 0.2629,
+      "step": 5690
+    },
+    {
+      "epoch": 2.261904761904762,
+      "grad_norm": 1.0115917921066284,
+      "learning_rate": 0.000492063492063492,
+      "loss": 0.3813,
+      "step": 5700
+    },
+    {
+      "epoch": 2.265873015873016,
+      "grad_norm": 1.2166203260421753,
+      "learning_rate": 0.0004894179894179894,
+      "loss": 0.2698,
+      "step": 5710
+    },
+    {
+      "epoch": 2.2698412698412698,
+      "grad_norm": 0.4840317964553833,
+      "learning_rate": 0.00048677248677248675,
+      "loss": 0.2739,
+      "step": 5720
+    },
+    {
+      "epoch": 2.2738095238095237,
+      "grad_norm": 0.528724193572998,
+      "learning_rate": 0.00048412698412698415,
+      "loss": 0.3434,
+      "step": 5730
+    },
+    {
+      "epoch": 2.2777777777777777,
+      "grad_norm": 0.6342616081237793,
+      "learning_rate": 0.00048148148148148144,
+      "loss": 0.2495,
+      "step": 5740
+    },
+    {
+      "epoch": 2.2817460317460316,
+      "grad_norm": 0.5333026647567749,
+      "learning_rate": 0.00047883597883597884,
+      "loss": 0.2839,
+      "step": 5750
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.7824140787124634,
+      "learning_rate": 0.0004761904761904762,
+      "loss": 0.3332,
+      "step": 5760
+    },
+    {
+      "epoch": 2.2896825396825395,
+      "grad_norm": 0.9291231632232666,
+      "learning_rate": 0.00047354497354497354,
+      "loss": 0.4443,
+      "step": 5770
+    },
+    {
+      "epoch": 2.2936507936507935,
+      "grad_norm": 0.8615803122520447,
+      "learning_rate": 0.0004708994708994709,
+      "loss": 0.338,
+      "step": 5780
+    },
+    {
+      "epoch": 2.2976190476190474,
+      "grad_norm": 0.5790500640869141,
+      "learning_rate": 0.0004682539682539683,
+      "loss": 0.3032,
+      "step": 5790
+    },
+    {
+      "epoch": 2.3015873015873014,
+      "grad_norm": 0.5711954832077026,
+      "learning_rate": 0.0004656084656084656,
+      "loss": 0.2674,
+      "step": 5800
+    },
+    {
+      "epoch": 2.3055555555555554,
+      "grad_norm": 0.6912782192230225,
+      "learning_rate": 0.000462962962962963,
+      "loss": 0.3159,
+      "step": 5810
+    },
+    {
+      "epoch": 2.3095238095238093,
+      "grad_norm": 1.0069470405578613,
+      "learning_rate": 0.00046031746031746033,
+      "loss": 0.2548,
+      "step": 5820
+    },
+    {
+      "epoch": 2.3134920634920633,
+      "grad_norm": 0.7111985087394714,
+      "learning_rate": 0.0004576719576719577,
+      "loss": 0.4336,
+      "step": 5830
+    },
+    {
+      "epoch": 2.317460317460317,
+      "grad_norm": 0.7876987457275391,
+      "learning_rate": 0.000455026455026455,
+      "loss": 0.3417,
+      "step": 5840
+    },
+    {
+      "epoch": 2.3214285714285716,
+      "grad_norm": 1.222811222076416,
+      "learning_rate": 0.00045238095238095237,
+      "loss": 0.2989,
+      "step": 5850
+    },
+    {
+      "epoch": 2.3253968253968256,
+      "grad_norm": 0.6214492321014404,
+      "learning_rate": 0.0004497354497354497,
+      "loss": 0.3119,
+      "step": 5860
+    },
+    {
+      "epoch": 2.3293650793650795,
+      "grad_norm": 1.190848708152771,
+      "learning_rate": 0.0004470899470899471,
+      "loss": 0.3549,
+      "step": 5870
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": 1.6466199159622192,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.3483,
+      "step": 5880
+    },
+    {
+      "epoch": 2.3373015873015874,
+      "grad_norm": 1.195802927017212,
+      "learning_rate": 0.0004417989417989418,
+      "loss": 0.2867,
+      "step": 5890
+    },
+    {
+      "epoch": 2.3412698412698414,
+      "grad_norm": 0.705406665802002,
+      "learning_rate": 0.00043915343915343916,
+      "loss": 0.3974,
+      "step": 5900
+    },
+    {
+      "epoch": 2.3452380952380953,
+      "grad_norm": 1.0674729347229004,
+      "learning_rate": 0.0004365079365079365,
+      "loss": 0.3524,
+      "step": 5910
+    },
+    {
+      "epoch": 2.3492063492063493,
+      "grad_norm": 1.3207943439483643,
+      "learning_rate": 0.00043386243386243385,
+      "loss": 0.309,
+      "step": 5920
+    },
+    {
+      "epoch": 2.3531746031746033,
+      "grad_norm": 0.6910490393638611,
+      "learning_rate": 0.00043121693121693126,
+      "loss": 0.2557,
+      "step": 5930
+    },
+    {
+      "epoch": 2.357142857142857,
+      "grad_norm": 0.7160533666610718,
+      "learning_rate": 0.00042857142857142855,
+      "loss": 0.3447,
+      "step": 5940
+    },
+    {
+      "epoch": 2.361111111111111,
+      "grad_norm": 1.117875576019287,
+      "learning_rate": 0.00042592592592592595,
+      "loss": 0.3796,
+      "step": 5950
+    },
+    {
+      "epoch": 2.365079365079365,
+      "grad_norm": 0.7119603753089905,
+      "learning_rate": 0.00042328042328042324,
+      "loss": 0.3458,
+      "step": 5960
+    },
+    {
+      "epoch": 2.369047619047619,
+      "grad_norm": 0.6892464756965637,
+      "learning_rate": 0.00042063492063492065,
+      "loss": 0.3888,
+      "step": 5970
+    },
+    {
+      "epoch": 2.373015873015873,
+      "grad_norm": 0.8669295310974121,
+      "learning_rate": 0.000417989417989418,
+      "loss": 0.4486,
+      "step": 5980
+    },
+    {
+      "epoch": 2.376984126984127,
+      "grad_norm": 0.3692854642868042,
+      "learning_rate": 0.00041534391534391534,
+      "loss": 0.2848,
+      "step": 5990
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 0.8515878915786743,
+      "learning_rate": 0.0004126984126984127,
+      "loss": 0.3851,
+      "step": 6000
+    },
+    {
+      "epoch": 2.384920634920635,
+      "grad_norm": 0.8710914850234985,
+      "learning_rate": 0.0004100529100529101,
+      "loss": 0.3629,
+      "step": 6010
+    },
+    {
+      "epoch": 2.388888888888889,
+      "grad_norm": 1.1649229526519775,
+      "learning_rate": 0.0004074074074074074,
+      "loss": 0.3405,
+      "step": 6020
+    },
+    {
+      "epoch": 2.392857142857143,
+      "grad_norm": 0.536342442035675,
+      "learning_rate": 0.0004047619047619048,
+      "loss": 0.3541,
+      "step": 6030
+    },
+    {
+      "epoch": 2.3968253968253967,
+      "grad_norm": 0.6506990790367126,
+      "learning_rate": 0.0004021164021164021,
+      "loss": 0.3217,
+      "step": 6040
+    },
+    {
+      "epoch": 2.4007936507936507,
+      "grad_norm": 0.39036527276039124,
+      "learning_rate": 0.0003994708994708995,
+      "loss": 0.3307,
+      "step": 6050
+    },
+    {
+      "epoch": 2.4047619047619047,
+      "grad_norm": 0.5971412658691406,
+      "learning_rate": 0.0003968253968253968,
+      "loss": 0.3523,
+      "step": 6060
+    },
+    {
+      "epoch": 2.4087301587301586,
+      "grad_norm": 1.4851547479629517,
+      "learning_rate": 0.00039417989417989417,
+      "loss": 0.315,
+      "step": 6070
+    },
+    {
+      "epoch": 2.4126984126984126,
+      "grad_norm": 0.7956401705741882,
+      "learning_rate": 0.0003915343915343915,
+      "loss": 0.3218,
+      "step": 6080
+    },
+    {
+      "epoch": 2.4166666666666665,
+      "grad_norm": 0.7292457818984985,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 0.3398,
+      "step": 6090
+    },
+    {
+      "epoch": 2.4206349206349205,
+      "grad_norm": 1.0612292289733887,
+      "learning_rate": 0.0003862433862433862,
+      "loss": 0.3405,
+      "step": 6100
+    },
+    {
+      "epoch": 2.4246031746031744,
+      "grad_norm": 0.7647016644477844,
+      "learning_rate": 0.0003835978835978836,
+      "loss": 0.4365,
+      "step": 6110
+    },
+    {
+      "epoch": 2.4285714285714284,
+      "grad_norm": 0.6238649487495422,
+      "learning_rate": 0.00038095238095238096,
+      "loss": 0.3462,
+      "step": 6120
+    },
+    {
+      "epoch": 2.432539682539683,
+      "grad_norm": 0.7567634582519531,
+      "learning_rate": 0.0003783068783068783,
+      "loss": 0.2732,
+      "step": 6130
+    },
+    {
+      "epoch": 2.4365079365079367,
+      "grad_norm": 0.589939534664154,
+      "learning_rate": 0.00037566137566137566,
+      "loss": 0.4249,
+      "step": 6140
+    },
+    {
+      "epoch": 2.4404761904761907,
+      "grad_norm": 0.9400720596313477,
+      "learning_rate": 0.000373015873015873,
+      "loss": 0.4112,
+      "step": 6150
+    },
+    {
+      "epoch": 2.4444444444444446,
+      "grad_norm": 0.7339090704917908,
+      "learning_rate": 0.00037037037037037035,
+      "loss": 0.3596,
+      "step": 6160
+    },
+    {
+      "epoch": 2.4484126984126986,
+      "grad_norm": 1.508101463317871,
+      "learning_rate": 0.00036772486772486775,
+      "loss": 0.2354,
+      "step": 6170
+    },
+    {
+      "epoch": 2.4523809523809526,
+      "grad_norm": 1.042312741279602,
+      "learning_rate": 0.00036507936507936505,
+      "loss": 0.4263,
+      "step": 6180
+    },
+    {
+      "epoch": 2.4563492063492065,
+      "grad_norm": 1.1017494201660156,
+      "learning_rate": 0.00036243386243386245,
+      "loss": 0.4159,
+      "step": 6190
+    },
+    {
+      "epoch": 2.4603174603174605,
+      "grad_norm": 0.7952788472175598,
+      "learning_rate": 0.0003597883597883598,
+      "loss": 0.2949,
+      "step": 6200
+    },
+    {
+      "epoch": 2.4642857142857144,
+      "grad_norm": 0.652211606502533,
+      "learning_rate": 0.00035714285714285714,
+      "loss": 0.3523,
+      "step": 6210
+    },
+    {
+      "epoch": 2.4682539682539684,
+      "grad_norm": 1.0506590604782104,
+      "learning_rate": 0.0003544973544973545,
+      "loss": 0.4683,
+      "step": 6220
+    },
+    {
+      "epoch": 2.4722222222222223,
+      "grad_norm": 0.7924396991729736,
+      "learning_rate": 0.0003518518518518519,
+      "loss": 0.3162,
+      "step": 6230
+    },
+    {
+      "epoch": 2.4761904761904763,
+      "grad_norm": 0.5057342052459717,
+      "learning_rate": 0.0003492063492063492,
+      "loss": 0.2741,
+      "step": 6240
+    },
+    {
+      "epoch": 2.4801587301587302,
+      "grad_norm": 1.0041768550872803,
+      "learning_rate": 0.0003465608465608466,
+      "loss": 0.3528,
+      "step": 6250
+    },
+    {
+      "epoch": 2.484126984126984,
+      "grad_norm": 1.06671941280365,
+      "learning_rate": 0.0003439153439153439,
+      "loss": 0.3605,
+      "step": 6260
+    },
+    {
+      "epoch": 2.488095238095238,
+      "grad_norm": 0.41186466813087463,
+      "learning_rate": 0.0003412698412698413,
+      "loss": 0.2841,
+      "step": 6270
+    },
+    {
+      "epoch": 2.492063492063492,
+      "grad_norm": 0.3925606906414032,
+      "learning_rate": 0.00033862433862433863,
+      "loss": 0.3427,
+      "step": 6280
+    },
+    {
+      "epoch": 2.496031746031746,
+      "grad_norm": 1.4012260437011719,
+      "learning_rate": 0.000335978835978836,
+      "loss": 0.4803,
+      "step": 6290
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.5710623264312744,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 0.3037,
+      "step": 6300
+    },
+    {
+      "epoch": 2.503968253968254,
+      "grad_norm": 0.9036715030670166,
+      "learning_rate": 0.0003306878306878307,
+      "loss": 0.3943,
+      "step": 6310
+    },
+    {
+      "epoch": 2.507936507936508,
+      "grad_norm": 0.6256608366966248,
+      "learning_rate": 0.000328042328042328,
+      "loss": 0.3014,
+      "step": 6320
+    },
+    {
+      "epoch": 2.511904761904762,
+      "grad_norm": 0.8218435645103455,
+      "learning_rate": 0.0003253968253968254,
+      "loss": 0.4707,
+      "step": 6330
+    },
+    {
+      "epoch": 2.515873015873016,
+      "grad_norm": 0.6735277771949768,
+      "learning_rate": 0.00032275132275132277,
+      "loss": 0.2808,
+      "step": 6340
+    },
+    {
+      "epoch": 2.5198412698412698,
+      "grad_norm": 0.6450037360191345,
+      "learning_rate": 0.0003201058201058201,
+      "loss": 0.3727,
+      "step": 6350
+    },
+    {
+      "epoch": 2.5238095238095237,
+      "grad_norm": 1.246138334274292,
+      "learning_rate": 0.00031746031746031746,
+      "loss": 0.3881,
+      "step": 6360
+    },
+    {
+      "epoch": 2.5277777777777777,
+      "grad_norm": 0.5396189093589783,
+      "learning_rate": 0.0003148148148148148,
+      "loss": 0.2661,
+      "step": 6370
+    },
+    {
+      "epoch": 2.5317460317460316,
+      "grad_norm": 1.2827895879745483,
+      "learning_rate": 0.00031216931216931215,
+      "loss": 0.2871,
+      "step": 6380
+    },
+    {
+      "epoch": 2.5357142857142856,
+      "grad_norm": 0.7319866418838501,
+      "learning_rate": 0.00030952380952380956,
+      "loss": 0.3458,
+      "step": 6390
+    },
+    {
+      "epoch": 2.5396825396825395,
+      "grad_norm": 0.5848907828330994,
+      "learning_rate": 0.00030687830687830685,
+      "loss": 0.3542,
+      "step": 6400
+    },
+    {
+      "epoch": 2.5436507936507935,
+      "grad_norm": 1.022750735282898,
+      "learning_rate": 0.00030423280423280425,
+      "loss": 0.3295,
+      "step": 6410
+    },
+    {
+      "epoch": 2.5476190476190474,
+      "grad_norm": 0.6221028566360474,
+      "learning_rate": 0.0003015873015873016,
+      "loss": 0.3135,
+      "step": 6420
+    },
+    {
+      "epoch": 2.5515873015873014,
+      "grad_norm": 0.7685695886611938,
+      "learning_rate": 0.00029894179894179895,
+      "loss": 0.3074,
+      "step": 6430
+    },
+    {
+      "epoch": 2.5555555555555554,
+      "grad_norm": 1.1064790487289429,
+      "learning_rate": 0.0002962962962962963,
+      "loss": 0.2694,
+      "step": 6440
+    },
+    {
+      "epoch": 2.5595238095238093,
+      "grad_norm": 1.2743747234344482,
+      "learning_rate": 0.0002936507936507937,
+      "loss": 0.3203,
+      "step": 6450
+    },
+    {
+      "epoch": 2.5634920634920633,
+      "grad_norm": 0.8724698424339294,
+      "learning_rate": 0.000291005291005291,
+      "loss": 0.3696,
+      "step": 6460
+    },
+    {
+      "epoch": 2.567460317460317,
+      "grad_norm": 0.5731073617935181,
+      "learning_rate": 0.0002883597883597884,
+      "loss": 0.4232,
+      "step": 6470
+    },
+    {
+      "epoch": 2.571428571428571,
+      "grad_norm": 1.1916602849960327,
+      "learning_rate": 0.0002857142857142857,
+      "loss": 0.3249,
+      "step": 6480
+    },
+    {
+      "epoch": 2.575396825396825,
+      "grad_norm": 0.6559428572654724,
+      "learning_rate": 0.0002830687830687831,
+      "loss": 0.3469,
+      "step": 6490
+    },
+    {
+      "epoch": 2.5793650793650795,
+      "grad_norm": 0.7409236431121826,
+      "learning_rate": 0.00028042328042328043,
+      "loss": 0.2983,
+      "step": 6500
+    },
+    {
+      "epoch": 2.5833333333333335,
+      "grad_norm": 0.9593034982681274,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 0.2303,
+      "step": 6510
+    },
+    {
+      "epoch": 2.5873015873015874,
+      "grad_norm": 1.2059444189071655,
+      "learning_rate": 0.0002751322751322751,
+      "loss": 0.2786,
+      "step": 6520
+    },
+    {
+      "epoch": 2.5912698412698414,
+      "grad_norm": 0.47993749380111694,
+      "learning_rate": 0.0002724867724867725,
+      "loss": 0.2946,
+      "step": 6530
+    },
+    {
+      "epoch": 2.5952380952380953,
+      "grad_norm": 1.1158372163772583,
+      "learning_rate": 0.0002698412698412698,
+      "loss": 0.3429,
+      "step": 6540
+    },
+    {
+      "epoch": 2.5992063492063493,
+      "grad_norm": 0.6710345149040222,
+      "learning_rate": 0.0002671957671957672,
+      "loss": 0.4059,
+      "step": 6550
+    },
+    {
+      "epoch": 2.6031746031746033,
+      "grad_norm": 0.6601153016090393,
+      "learning_rate": 0.00026455026455026457,
+      "loss": 0.4536,
+      "step": 6560
+    },
+    {
+      "epoch": 2.607142857142857,
+      "grad_norm": 1.258527398109436,
+      "learning_rate": 0.0002619047619047619,
+      "loss": 0.3204,
+      "step": 6570
+    },
+    {
+      "epoch": 2.611111111111111,
+      "grad_norm": 0.6397349834442139,
+      "learning_rate": 0.00025925925925925926,
+      "loss": 0.34,
+      "step": 6580
+    },
+    {
+      "epoch": 2.615079365079365,
+      "grad_norm": 0.6242520213127136,
+      "learning_rate": 0.0002566137566137566,
+      "loss": 0.2976,
+      "step": 6590
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 1.0702687501907349,
+      "learning_rate": 0.00025396825396825396,
+      "loss": 0.302,
+      "step": 6600
+    },
+    {
+      "epoch": 2.623015873015873,
+      "grad_norm": 0.38248881697654724,
+      "learning_rate": 0.00025132275132275136,
+      "loss": 0.2235,
+      "step": 6610
+    },
+    {
+      "epoch": 2.626984126984127,
+      "grad_norm": 0.67015141248703,
+      "learning_rate": 0.00024867724867724865,
+      "loss": 0.3465,
+      "step": 6620
+    },
+    {
+      "epoch": 2.630952380952381,
+      "grad_norm": 0.9071609377861023,
+      "learning_rate": 0.000246031746031746,
+      "loss": 0.3543,
+      "step": 6630
+    },
+    {
+      "epoch": 2.634920634920635,
+      "grad_norm": 1.1956970691680908,
+      "learning_rate": 0.00024338624338624337,
+      "loss": 0.3436,
+      "step": 6640
+    },
+    {
+      "epoch": 2.638888888888889,
+      "grad_norm": 0.6545996069908142,
+      "learning_rate": 0.00024074074074074072,
+      "loss": 0.2752,
+      "step": 6650
+    },
+    {
+      "epoch": 2.642857142857143,
+      "grad_norm": 0.8755260705947876,
+      "learning_rate": 0.0002380952380952381,
+      "loss": 0.3287,
+      "step": 6660
+    },
+    {
+      "epoch": 2.6468253968253967,
+      "grad_norm": 0.5090301036834717,
+      "learning_rate": 0.00023544973544973544,
+      "loss": 0.2705,
+      "step": 6670
+    },
+    {
+      "epoch": 2.6507936507936507,
+      "grad_norm": 0.776059091091156,
+      "learning_rate": 0.0002328042328042328,
+      "loss": 0.315,
+      "step": 6680
+    },
+    {
+      "epoch": 2.6547619047619047,
+      "grad_norm": 0.752827525138855,
+      "learning_rate": 0.00023015873015873016,
+      "loss": 0.4351,
+      "step": 6690
+    },
+    {
+      "epoch": 2.6587301587301586,
+      "grad_norm": 0.5817317366600037,
+      "learning_rate": 0.0002275132275132275,
+      "loss": 0.3015,
+      "step": 6700
+    },
+    {
+      "epoch": 2.6626984126984126,
+      "grad_norm": 0.7703492641448975,
+      "learning_rate": 0.00022486772486772486,
+      "loss": 0.3444,
+      "step": 6710
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 1.1149251461029053,
+      "learning_rate": 0.0002222222222222222,
+      "loss": 0.3366,
+      "step": 6720
+    },
+    {
+      "epoch": 2.6706349206349205,
+      "grad_norm": 0.5407519340515137,
+      "learning_rate": 0.00021957671957671958,
+      "loss": 0.3062,
+      "step": 6730
+    },
+    {
+      "epoch": 2.674603174603175,
+      "grad_norm": 0.999150276184082,
+      "learning_rate": 0.00021693121693121693,
+      "loss": 0.3877,
+      "step": 6740
+    },
+    {
+      "epoch": 2.678571428571429,
+      "grad_norm": 1.0281010866165161,
+      "learning_rate": 0.00021428571428571427,
+      "loss": 0.3898,
+      "step": 6750
+    },
+    {
+      "epoch": 2.682539682539683,
+      "grad_norm": 0.5821579098701477,
+      "learning_rate": 0.00021164021164021162,
+      "loss": 0.4434,
+      "step": 6760
+    },
+    {
+      "epoch": 2.6865079365079367,
+      "grad_norm": 0.7311249375343323,
+      "learning_rate": 0.000208994708994709,
+      "loss": 0.2707,
+      "step": 6770
+    },
+    {
+      "epoch": 2.6904761904761907,
+      "grad_norm": 1.1552441120147705,
+      "learning_rate": 0.00020634920634920634,
+      "loss": 0.3594,
+      "step": 6780
+    },
+    {
+      "epoch": 2.6944444444444446,
+      "grad_norm": 0.5154855847358704,
+      "learning_rate": 0.0002037037037037037,
+      "loss": 0.2184,
+      "step": 6790
+    },
+    {
+      "epoch": 2.6984126984126986,
+      "grad_norm": 1.3952319622039795,
+      "learning_rate": 0.00020105820105820104,
+      "loss": 0.3249,
+      "step": 6800
+    },
+    {
+      "epoch": 2.7023809523809526,
+      "grad_norm": 1.300567626953125,
+      "learning_rate": 0.0001984126984126984,
+      "loss": 0.3479,
+      "step": 6810
+    },
+    {
+      "epoch": 2.7063492063492065,
+      "grad_norm": 0.8334280848503113,
+      "learning_rate": 0.00019576719576719576,
+      "loss": 0.2659,
+      "step": 6820
+    },
+    {
+      "epoch": 2.7103174603174605,
+      "grad_norm": 0.6446446776390076,
+      "learning_rate": 0.0001931216931216931,
+      "loss": 0.2593,
+      "step": 6830
+    },
+    {
+      "epoch": 2.7142857142857144,
+      "grad_norm": 0.5977747440338135,
+      "learning_rate": 0.00019047619047619048,
+      "loss": 0.3015,
+      "step": 6840
+    },
+    {
+      "epoch": 2.7182539682539684,
+      "grad_norm": 0.6966296434402466,
+      "learning_rate": 0.00018783068783068783,
+      "loss": 0.2615,
+      "step": 6850
+    },
+    {
+      "epoch": 2.7222222222222223,
+      "grad_norm": 1.3358402252197266,
+      "learning_rate": 0.00018518518518518518,
+      "loss": 0.3439,
+      "step": 6860
+    },
+    {
+      "epoch": 2.7261904761904763,
+      "grad_norm": 1.1806023120880127,
+      "learning_rate": 0.00018253968253968252,
+      "loss": 0.2893,
+      "step": 6870
+    },
+    {
+      "epoch": 2.7301587301587302,
+      "grad_norm": 0.6638475656509399,
+      "learning_rate": 0.0001798941798941799,
+      "loss": 0.236,
+      "step": 6880
+    },
+    {
+      "epoch": 2.734126984126984,
+      "grad_norm": 0.9930281639099121,
+      "learning_rate": 0.00017724867724867724,
+      "loss": 0.3489,
+      "step": 6890
+    },
+    {
+      "epoch": 2.738095238095238,
+      "grad_norm": 0.7173562049865723,
+      "learning_rate": 0.0001746031746031746,
+      "loss": 0.2933,
+      "step": 6900
+    },
+    {
+      "epoch": 2.742063492063492,
+      "grad_norm": 1.0661985874176025,
+      "learning_rate": 0.00017195767195767194,
+      "loss": 0.2888,
+      "step": 6910
+    },
+    {
+      "epoch": 2.746031746031746,
+      "grad_norm": 0.6391360759735107,
+      "learning_rate": 0.00016931216931216931,
+      "loss": 0.1776,
+      "step": 6920
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.8126150965690613,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.3875,
+      "step": 6930
+    },
+    {
+      "epoch": 2.753968253968254,
+      "grad_norm": 0.5906522274017334,
+      "learning_rate": 0.000164021164021164,
+      "loss": 0.2361,
+      "step": 6940
+    },
+    {
+      "epoch": 2.757936507936508,
+      "grad_norm": 0.7056891918182373,
+      "learning_rate": 0.00016137566137566138,
+      "loss": 0.356,
+      "step": 6950
+    },
+    {
+      "epoch": 2.761904761904762,
+      "grad_norm": 0.8128471970558167,
+      "learning_rate": 0.00015873015873015873,
+      "loss": 0.2629,
+      "step": 6960
+    },
+    {
+      "epoch": 2.765873015873016,
+      "grad_norm": 0.76819908618927,
+      "learning_rate": 0.00015608465608465608,
+      "loss": 0.2926,
+      "step": 6970
+    },
+    {
+      "epoch": 2.7698412698412698,
+      "grad_norm": 0.8994793891906738,
+      "learning_rate": 0.00015343915343915342,
+      "loss": 0.4022,
+      "step": 6980
+    },
+    {
+      "epoch": 2.7738095238095237,
+      "grad_norm": 0.6637232303619385,
+      "learning_rate": 0.0001507936507936508,
+      "loss": 0.393,
+      "step": 6990
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "grad_norm": 1.1956053972244263,
+      "learning_rate": 0.00014814814814814815,
+      "loss": 0.3125,
+      "step": 7000
+    },
+    {
+      "epoch": 2.7817460317460316,
+      "grad_norm": 0.8361969590187073,
+      "learning_rate": 0.0001455026455026455,
+      "loss": 0.337,
+      "step": 7010
+    },
+    {
+      "epoch": 2.7857142857142856,
+      "grad_norm": 0.6757733225822449,
+      "learning_rate": 0.00014285714285714284,
+      "loss": 0.266,
+      "step": 7020
+    },
+    {
+      "epoch": 2.7896825396825395,
+      "grad_norm": 0.808002769947052,
+      "learning_rate": 0.00014021164021164022,
+      "loss": 0.3207,
+      "step": 7030
+    },
+    {
+      "epoch": 2.7936507936507935,
+      "grad_norm": 0.670218288898468,
+      "learning_rate": 0.00013756613756613756,
+      "loss": 0.3628,
+      "step": 7040
+    },
+    {
+      "epoch": 2.7976190476190474,
+      "grad_norm": 0.9069200158119202,
+      "learning_rate": 0.0001349206349206349,
+      "loss": 0.4583,
+      "step": 7050
+    },
+    {
+      "epoch": 2.8015873015873014,
+      "grad_norm": 0.7543951869010925,
+      "learning_rate": 0.00013227513227513228,
+      "loss": 0.3408,
+      "step": 7060
+    },
+    {
+      "epoch": 2.8055555555555554,
+      "grad_norm": 0.6418523788452148,
+      "learning_rate": 0.00012962962962962963,
+      "loss": 0.2587,
+      "step": 7070
+    },
+    {
+      "epoch": 2.8095238095238093,
+      "grad_norm": 0.4243696928024292,
+      "learning_rate": 0.00012698412698412698,
+      "loss": 0.2978,
+      "step": 7080
+    },
+    {
+      "epoch": 2.8134920634920633,
+      "grad_norm": 0.8575748801231384,
+      "learning_rate": 0.00012433862433862433,
+      "loss": 0.3119,
+      "step": 7090
+    },
+    {
+      "epoch": 2.817460317460317,
+      "grad_norm": 0.8136184215545654,
+      "learning_rate": 0.00012169312169312169,
+      "loss": 0.245,
+      "step": 7100
+    },
+    {
+      "epoch": 2.821428571428571,
+      "grad_norm": 1.1264744997024536,
+      "learning_rate": 0.00011904761904761905,
+      "loss": 0.3884,
+      "step": 7110
+    },
+    {
+      "epoch": 2.825396825396825,
+      "grad_norm": 0.6529180407524109,
+      "learning_rate": 0.0001164021164021164,
+      "loss": 0.2985,
+      "step": 7120
+    },
+    {
+      "epoch": 2.8293650793650795,
+      "grad_norm": 1.2286404371261597,
+      "learning_rate": 0.00011375661375661376,
+      "loss": 0.3886,
+      "step": 7130
+    },
+    {
+      "epoch": 2.8333333333333335,
+      "grad_norm": 0.46890988945961,
+      "learning_rate": 0.0001111111111111111,
+      "loss": 0.3925,
+      "step": 7140
+    },
+    {
+      "epoch": 2.8373015873015874,
+      "grad_norm": 0.8656564354896545,
+      "learning_rate": 0.00010846560846560846,
+      "loss": 0.2998,
+      "step": 7150
+    },
+    {
+      "epoch": 2.8412698412698414,
+      "grad_norm": 0.6795648336410522,
+      "learning_rate": 0.00010582010582010581,
+      "loss": 0.2654,
+      "step": 7160
+    },
+    {
+      "epoch": 2.8452380952380953,
+      "grad_norm": 0.9066348075866699,
+      "learning_rate": 0.00010317460317460317,
+      "loss": 0.4258,
+      "step": 7170
+    },
+    {
+      "epoch": 2.8492063492063493,
+      "grad_norm": 1.2527462244033813,
+      "learning_rate": 0.00010052910052910052,
+      "loss": 0.3391,
+      "step": 7180
+    },
+    {
+      "epoch": 2.8531746031746033,
+      "grad_norm": 0.767871081829071,
+      "learning_rate": 9.788359788359788e-05,
+      "loss": 0.2898,
+      "step": 7190
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.3189416825771332,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.2147,
+      "step": 7200
+    },
+    {
+      "epoch": 2.861111111111111,
+      "grad_norm": 0.7316614985466003,
+      "learning_rate": 9.259259259259259e-05,
+      "loss": 0.2906,
+      "step": 7210
+    },
+    {
+      "epoch": 2.865079365079365,
+      "grad_norm": 0.6827272772789001,
+      "learning_rate": 8.994708994708995e-05,
+      "loss": 0.2558,
+      "step": 7220
+    },
+    {
+      "epoch": 2.869047619047619,
+      "grad_norm": 0.40644726157188416,
+      "learning_rate": 8.73015873015873e-05,
+      "loss": 0.2516,
+      "step": 7230
+    },
+    {
+      "epoch": 2.873015873015873,
+      "grad_norm": 0.9451491236686707,
+      "learning_rate": 8.465608465608466e-05,
+      "loss": 0.3458,
+      "step": 7240
+    },
+    {
+      "epoch": 2.876984126984127,
+      "grad_norm": 0.5476970672607422,
+      "learning_rate": 8.2010582010582e-05,
+      "loss": 0.3106,
+      "step": 7250
+    },
+    {
+      "epoch": 2.880952380952381,
+      "grad_norm": 0.8001719117164612,
+      "learning_rate": 7.936507936507937e-05,
+      "loss": 0.3429,
+      "step": 7260
+    },
+    {
+      "epoch": 2.884920634920635,
+      "grad_norm": 0.5511224269866943,
+      "learning_rate": 7.671957671957671e-05,
+      "loss": 0.3601,
+      "step": 7270
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 0.6623083353042603,
+      "learning_rate": 7.407407407407407e-05,
+      "loss": 0.3627,
+      "step": 7280
+    },
+    {
+      "epoch": 2.892857142857143,
+      "grad_norm": 0.5939965844154358,
+      "learning_rate": 7.142857142857142e-05,
+      "loss": 0.3334,
+      "step": 7290
+    },
+    {
+      "epoch": 2.8968253968253967,
+      "grad_norm": 0.699934184551239,
+      "learning_rate": 6.878306878306878e-05,
+      "loss": 0.3701,
+      "step": 7300
+    },
+    {
+      "epoch": 2.9007936507936507,
+      "grad_norm": 1.0135327577590942,
+      "learning_rate": 6.613756613756614e-05,
+      "loss": 0.284,
+      "step": 7310
+    },
+    {
+      "epoch": 2.9047619047619047,
+      "grad_norm": 0.7858631014823914,
+      "learning_rate": 6.349206349206349e-05,
+      "loss": 0.3369,
+      "step": 7320
+    },
+    {
+      "epoch": 2.9087301587301586,
+      "grad_norm": 0.8691958785057068,
+      "learning_rate": 6.084656084656084e-05,
+      "loss": 0.3139,
+      "step": 7330
+    },
+    {
+      "epoch": 2.9126984126984126,
+      "grad_norm": 0.6003396511077881,
+      "learning_rate": 5.82010582010582e-05,
+      "loss": 0.246,
+      "step": 7340
+    },
+    {
+      "epoch": 2.9166666666666665,
+      "grad_norm": 0.5547357797622681,
+      "learning_rate": 5.555555555555555e-05,
+      "loss": 0.2271,
+      "step": 7350
+    },
+    {
+      "epoch": 2.9206349206349205,
+      "grad_norm": 1.1599771976470947,
+      "learning_rate": 5.2910052910052905e-05,
+      "loss": 0.3734,
+      "step": 7360
+    },
+    {
+      "epoch": 2.924603174603175,
+      "grad_norm": 0.7528437972068787,
+      "learning_rate": 5.026455026455026e-05,
+      "loss": 0.3274,
+      "step": 7370
+    },
+    {
+      "epoch": 2.928571428571429,
+      "grad_norm": 1.2826586961746216,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 0.3819,
+      "step": 7380
+    },
+    {
+      "epoch": 2.932539682539683,
+      "grad_norm": 1.2130956649780273,
+      "learning_rate": 4.4973544973544974e-05,
+      "loss": 0.2897,
+      "step": 7390
+    },
+    {
+      "epoch": 2.9365079365079367,
+      "grad_norm": 0.4103514552116394,
+      "learning_rate": 4.232804232804233e-05,
+      "loss": 0.245,
+      "step": 7400
+    },
+    {
+      "epoch": 2.9404761904761907,
+      "grad_norm": 1.2988202571868896,
+      "learning_rate": 3.968253968253968e-05,
+      "loss": 0.3479,
+      "step": 7410
+    },
+    {
+      "epoch": 2.9444444444444446,
+      "grad_norm": 0.4846343696117401,
+      "learning_rate": 3.7037037037037037e-05,
+      "loss": 0.2929,
+      "step": 7420
+    },
+    {
+      "epoch": 2.9484126984126986,
+      "grad_norm": 1.0788893699645996,
+      "learning_rate": 3.439153439153439e-05,
+      "loss": 0.3317,
+      "step": 7430
+    },
+    {
+      "epoch": 2.9523809523809526,
+      "grad_norm": 0.5080360174179077,
+      "learning_rate": 3.1746031746031745e-05,
+      "loss": 0.3498,
+      "step": 7440
+    },
+    {
+      "epoch": 2.9563492063492065,
+      "grad_norm": 0.8950350284576416,
+      "learning_rate": 2.91005291005291e-05,
+      "loss": 0.3807,
+      "step": 7450
+    },
+    {
+      "epoch": 2.9603174603174605,
+      "grad_norm": 0.5955391526222229,
+      "learning_rate": 2.6455026455026453e-05,
+      "loss": 0.3587,
+      "step": 7460
+    },
+    {
+      "epoch": 2.9642857142857144,
+      "grad_norm": 0.8612658977508545,
+      "learning_rate": 2.380952380952381e-05,
+      "loss": 0.3857,
+      "step": 7470
+    },
+    {
+      "epoch": 2.9682539682539684,
+      "grad_norm": 0.4796072542667389,
+      "learning_rate": 2.1164021164021164e-05,
+      "loss": 0.2429,
+      "step": 7480
+    },
+    {
+      "epoch": 2.9722222222222223,
+      "grad_norm": 0.6475656032562256,
+      "learning_rate": 1.8518518518518518e-05,
+      "loss": 0.2567,
+      "step": 7490
+    },
+    {
+      "epoch": 2.9761904761904763,
+      "grad_norm": 1.2244335412979126,
+      "learning_rate": 1.5873015873015872e-05,
+      "loss": 0.3446,
+      "step": 7500
+    },
+    {
+      "epoch": 0.9630674531931265,
+      "grad_norm": 2.1661019325256348,
+      "learning_rate": 0.0013579550312045822,
+      "loss": 0.8551,
+      "step": 7510
+    },
+    {
+      "epoch": 0.9643498332905873,
+      "grad_norm": 2.811357021331787,
+      "learning_rate": 0.0013571001111396083,
+      "loss": 1.0575,
+      "step": 7520
+    },
+    {
+      "epoch": 0.9656322133880482,
+      "grad_norm": 1.8580657243728638,
+      "learning_rate": 0.0013562451910746345,
+      "loss": 0.7093,
+      "step": 7530
+    },
+    {
+      "epoch": 0.9669145934855091,
+      "grad_norm": 1.7952332496643066,
+      "learning_rate": 0.0013553902710096606,
+      "loss": 0.8066,
+      "step": 7540
+    },
+    {
+      "epoch": 0.96819697358297,
+      "grad_norm": 1.4091452360153198,
+      "learning_rate": 0.0013545353509446867,
+      "loss": 0.9003,
+      "step": 7550
+    },
+    {
+      "epoch": 0.9694793536804309,
+      "grad_norm": 0.9127289652824402,
+      "learning_rate": 0.0013536804308797129,
+      "loss": 0.607,
+      "step": 7560
+    },
+    {
+      "epoch": 0.9707617337778918,
+      "grad_norm": 1.1701823472976685,
+      "learning_rate": 0.001352825510814739,
+      "loss": 0.8753,
+      "step": 7570
+    },
+    {
+      "epoch": 0.9720441138753526,
+      "grad_norm": 1.0958774089813232,
+      "learning_rate": 0.001351970590749765,
+      "loss": 0.8458,
+      "step": 7580
+    },
+    {
+      "epoch": 0.9733264939728136,
+      "grad_norm": 1.0484057664871216,
+      "learning_rate": 0.0013511156706847909,
+      "loss": 0.8565,
+      "step": 7590
+    },
+    {
+      "epoch": 0.9746088740702744,
+      "grad_norm": 1.4138461351394653,
+      "learning_rate": 0.001350260750619817,
+      "loss": 0.6054,
+      "step": 7600
+    },
+    {
+      "epoch": 0.9758912541677354,
+      "grad_norm": 1.9181944131851196,
+      "learning_rate": 0.0013494058305548431,
+      "loss": 1.0207,
+      "step": 7610
+    },
+    {
+      "epoch": 0.9771736342651962,
+      "grad_norm": 1.6007705926895142,
+      "learning_rate": 0.0013485509104898693,
+      "loss": 0.6417,
+      "step": 7620
+    },
+    {
+      "epoch": 0.9784560143626571,
+      "grad_norm": 1.3225061893463135,
+      "learning_rate": 0.0013476959904248954,
+      "loss": 0.9063,
+      "step": 7630
+    },
+    {
+      "epoch": 0.979738394460118,
+      "grad_norm": 1.6732155084609985,
+      "learning_rate": 0.0013468410703599213,
+      "loss": 1.058,
+      "step": 7640
+    },
+    {
+      "epoch": 0.9810207745575789,
+      "grad_norm": 1.4079992771148682,
+      "learning_rate": 0.0013459861502949475,
+      "loss": 0.9633,
+      "step": 7650
+    },
+    {
+      "epoch": 0.9823031546550397,
+      "grad_norm": 1.1940516233444214,
+      "learning_rate": 0.0013451312302299736,
+      "loss": 0.7728,
+      "step": 7660
+    },
+    {
+      "epoch": 0.9835855347525007,
+      "grad_norm": 1.1965214014053345,
+      "learning_rate": 0.0013442763101649995,
+      "loss": 0.7916,
+      "step": 7670
+    },
+    {
+      "epoch": 0.9848679148499615,
+      "grad_norm": 1.6329299211502075,
+      "learning_rate": 0.0013434213901000257,
+      "loss": 0.702,
+      "step": 7680
+    },
+    {
+      "epoch": 0.9861502949474225,
+      "grad_norm": 0.9614496827125549,
+      "learning_rate": 0.0013425664700350518,
+      "loss": 1.1379,
+      "step": 7690
+    },
+    {
+      "epoch": 0.9874326750448833,
+      "grad_norm": 0.7053365707397461,
+      "learning_rate": 0.001341711549970078,
+      "loss": 0.6611,
+      "step": 7700
+    },
+    {
+      "epoch": 0.9887150551423441,
+      "grad_norm": 1.1425424814224243,
+      "learning_rate": 0.0013408566299051039,
+      "loss": 0.8594,
+      "step": 7710
+    },
+    {
+      "epoch": 0.9899974352398051,
+      "grad_norm": 1.6565475463867188,
+      "learning_rate": 0.00134000170984013,
+      "loss": 0.713,
+      "step": 7720
+    },
+    {
+      "epoch": 0.991279815337266,
+      "grad_norm": 0.6158244609832764,
+      "learning_rate": 0.0013391467897751561,
+      "loss": 0.6972,
+      "step": 7730
+    },
+    {
+      "epoch": 0.9925621954347269,
+      "grad_norm": 1.166113018989563,
+      "learning_rate": 0.001338291869710182,
+      "loss": 0.7004,
+      "step": 7740
+    },
+    {
+      "epoch": 0.9938445755321877,
+      "grad_norm": 1.3203206062316895,
+      "learning_rate": 0.0013374369496452082,
+      "loss": 0.8673,
+      "step": 7750
+    },
+    {
+      "epoch": 0.9951269556296486,
+      "grad_norm": 1.4865373373031616,
+      "learning_rate": 0.0013365820295802343,
+      "loss": 0.9206,
+      "step": 7760
+    },
+    {
+      "epoch": 0.9964093357271095,
+      "grad_norm": 1.5147637128829956,
+      "learning_rate": 0.0013357271095152602,
+      "loss": 0.6281,
+      "step": 7770
+    },
+    {
+      "epoch": 0.9976917158245704,
+      "grad_norm": 2.0644052028656006,
+      "learning_rate": 0.0013348721894502864,
+      "loss": 0.8273,
+      "step": 7780
+    },
+    {
+      "epoch": 0.9989740959220312,
+      "grad_norm": 1.2069566249847412,
+      "learning_rate": 0.0013340172693853125,
+      "loss": 0.8031,
+      "step": 7790
+    },
+    {
+      "epoch": 1.000256476019492,
+      "grad_norm": 0.8759682178497314,
+      "learning_rate": 0.0013331623493203386,
+      "loss": 0.7899,
+      "step": 7800
+    },
+    {
+      "epoch": 1.0015388561169531,
+      "grad_norm": 2.084101438522339,
+      "learning_rate": 0.0013323074292553648,
+      "loss": 0.9902,
+      "step": 7810
+    },
+    {
+      "epoch": 1.002821236214414,
+      "grad_norm": 1.871626377105713,
+      "learning_rate": 0.0013314525091903907,
+      "loss": 0.7242,
+      "step": 7820
+    },
+    {
+      "epoch": 1.0041036163118748,
+      "grad_norm": 1.0887974500656128,
+      "learning_rate": 0.0013305975891254166,
+      "loss": 1.0633,
+      "step": 7830
+    },
+    {
+      "epoch": 1.0053859964093357,
+      "grad_norm": 2.206550359725952,
+      "learning_rate": 0.0013297426690604428,
+      "loss": 0.8527,
+      "step": 7840
+    },
+    {
+      "epoch": 1.0066683765067965,
+      "grad_norm": 1.9103882312774658,
+      "learning_rate": 0.001328887748995469,
+      "loss": 0.9871,
+      "step": 7850
+    },
+    {
+      "epoch": 1.0079507566042576,
+      "grad_norm": 1.2385672330856323,
+      "learning_rate": 0.001328032828930495,
+      "loss": 0.7928,
+      "step": 7860
+    },
+    {
+      "epoch": 1.0092331367017184,
+      "grad_norm": 1.632601261138916,
+      "learning_rate": 0.0013271779088655212,
+      "loss": 0.8596,
+      "step": 7870
+    },
+    {
+      "epoch": 1.0105155167991793,
+      "grad_norm": 0.8926940560340881,
+      "learning_rate": 0.0013263229888005473,
+      "loss": 0.576,
+      "step": 7880
+    },
+    {
+      "epoch": 1.0117978968966401,
+      "grad_norm": 1.632102370262146,
+      "learning_rate": 0.0013254680687355734,
+      "loss": 0.717,
+      "step": 7890
+    },
+    {
+      "epoch": 1.013080276994101,
+      "grad_norm": 1.671094298362732,
+      "learning_rate": 0.0013246131486705991,
+      "loss": 0.7507,
+      "step": 7900
+    },
+    {
+      "epoch": 1.014362657091562,
+      "grad_norm": 1.1313315629959106,
+      "learning_rate": 0.0013237582286056253,
+      "loss": 0.7725,
+      "step": 7910
+    },
+    {
+      "epoch": 1.0156450371890229,
+      "grad_norm": 0.7332689762115479,
+      "learning_rate": 0.0013229033085406514,
+      "loss": 0.6037,
+      "step": 7920
+    },
+    {
+      "epoch": 1.0169274172864837,
+      "grad_norm": 1.1624342203140259,
+      "learning_rate": 0.0013220483884756776,
+      "loss": 0.8739,
+      "step": 7930
+    },
+    {
+      "epoch": 1.0182097973839446,
+      "grad_norm": 1.1926045417785645,
+      "learning_rate": 0.0013211934684107037,
+      "loss": 0.6407,
+      "step": 7940
+    },
+    {
+      "epoch": 1.0194921774814054,
+      "grad_norm": 0.8199732303619385,
+      "learning_rate": 0.0013203385483457298,
+      "loss": 0.7909,
+      "step": 7950
+    },
+    {
+      "epoch": 1.0207745575788665,
+      "grad_norm": 1.0138903856277466,
+      "learning_rate": 0.0013194836282807557,
+      "loss": 0.6335,
+      "step": 7960
+    },
+    {
+      "epoch": 1.0220569376763273,
+      "grad_norm": 1.0864981412887573,
+      "learning_rate": 0.0013186287082157819,
+      "loss": 0.6715,
+      "step": 7970
+    },
+    {
+      "epoch": 1.0233393177737882,
+      "grad_norm": 1.3601443767547607,
+      "learning_rate": 0.0013177737881508078,
+      "loss": 0.7332,
+      "step": 7980
+    },
+    {
+      "epoch": 1.024621697871249,
+      "grad_norm": 1.9008809328079224,
+      "learning_rate": 0.001316918868085834,
+      "loss": 0.8091,
+      "step": 7990
+    },
+    {
+      "epoch": 1.0259040779687099,
+      "grad_norm": 0.7857163548469543,
+      "learning_rate": 0.00131606394802086,
+      "loss": 0.7366,
+      "step": 8000
+    },
+    {
+      "epoch": 1.0271864580661707,
+      "grad_norm": 1.2369189262390137,
+      "learning_rate": 0.0013152090279558862,
+      "loss": 0.6633,
+      "step": 8010
+    },
+    {
+      "epoch": 1.0284688381636318,
+      "grad_norm": 0.9395790696144104,
+      "learning_rate": 0.0013143541078909123,
+      "loss": 0.6482,
+      "step": 8020
+    },
+    {
+      "epoch": 1.0297512182610926,
+      "grad_norm": 0.7295545935630798,
+      "learning_rate": 0.0013134991878259383,
+      "loss": 0.5748,
+      "step": 8030
+    },
+    {
+      "epoch": 1.0310335983585535,
+      "grad_norm": 0.824608564376831,
+      "learning_rate": 0.0013126442677609644,
+      "loss": 0.5573,
+      "step": 8040
+    },
+    {
+      "epoch": 1.0323159784560143,
+      "grad_norm": 0.4935958981513977,
+      "learning_rate": 0.0013117893476959903,
+      "loss": 0.7653,
+      "step": 8050
+    },
+    {
+      "epoch": 1.0335983585534751,
+      "grad_norm": 1.4792139530181885,
+      "learning_rate": 0.0013109344276310165,
+      "loss": 0.6865,
+      "step": 8060
+    },
+    {
+      "epoch": 1.0348807386509362,
+      "grad_norm": 1.3146531581878662,
+      "learning_rate": 0.0013100795075660426,
+      "loss": 1.062,
+      "step": 8070
+    },
+    {
+      "epoch": 1.036163118748397,
+      "grad_norm": 0.7179372906684875,
+      "learning_rate": 0.0013092245875010687,
+      "loss": 0.6821,
+      "step": 8080
+    },
+    {
+      "epoch": 1.037445498845858,
+      "grad_norm": 0.9141503572463989,
+      "learning_rate": 0.0013083696674360947,
+      "loss": 0.7213,
+      "step": 8090
+    },
+    {
+      "epoch": 1.0387278789433187,
+      "grad_norm": 1.0443553924560547,
+      "learning_rate": 0.0013075147473711208,
+      "loss": 0.7408,
+      "step": 8100
+    },
+    {
+      "epoch": 1.0400102590407796,
+      "grad_norm": 1.3247910737991333,
+      "learning_rate": 0.001306659827306147,
+      "loss": 0.7333,
+      "step": 8110
+    },
+    {
+      "epoch": 1.0412926391382407,
+      "grad_norm": 1.7711740732192993,
+      "learning_rate": 0.001305804907241173,
+      "loss": 0.5489,
+      "step": 8120
+    },
+    {
+      "epoch": 1.0425750192357015,
+      "grad_norm": 0.6265628337860107,
+      "learning_rate": 0.001304949987176199,
+      "loss": 0.6677,
+      "step": 8130
+    },
+    {
+      "epoch": 1.0438573993331624,
+      "grad_norm": 0.7401356101036072,
+      "learning_rate": 0.0013040950671112251,
+      "loss": 0.7797,
+      "step": 8140
+    },
+    {
+      "epoch": 1.0451397794306232,
+      "grad_norm": 1.509304165840149,
+      "learning_rate": 0.0013032401470462513,
+      "loss": 0.5999,
+      "step": 8150
+    },
+    {
+      "epoch": 1.046422159528084,
+      "grad_norm": 0.9695005416870117,
+      "learning_rate": 0.0013023852269812772,
+      "loss": 0.7253,
+      "step": 8160
+    },
+    {
+      "epoch": 1.047704539625545,
+      "grad_norm": 0.6109398603439331,
+      "learning_rate": 0.0013015303069163033,
+      "loss": 0.7742,
+      "step": 8170
+    },
+    {
+      "epoch": 1.048986919723006,
+      "grad_norm": 1.064182996749878,
+      "learning_rate": 0.0013006753868513295,
+      "loss": 0.9212,
+      "step": 8180
+    },
+    {
+      "epoch": 1.0502692998204668,
+      "grad_norm": 0.6978124976158142,
+      "learning_rate": 0.0012998204667863556,
+      "loss": 0.4896,
+      "step": 8190
+    },
+    {
+      "epoch": 1.0515516799179276,
+      "grad_norm": 1.6062722206115723,
+      "learning_rate": 0.0012989655467213817,
+      "loss": 0.7899,
+      "step": 8200
+    },
+    {
+      "epoch": 1.0528340600153885,
+      "grad_norm": 1.7857177257537842,
+      "learning_rate": 0.0012981106266564076,
+      "loss": 0.6839,
+      "step": 8210
+    },
+    {
+      "epoch": 1.0541164401128496,
+      "grad_norm": 0.8134258985519409,
+      "learning_rate": 0.0012972557065914336,
+      "loss": 0.7301,
+      "step": 8220
+    },
+    {
+      "epoch": 1.0553988202103104,
+      "grad_norm": 0.9167507886886597,
+      "learning_rate": 0.0012964007865264597,
+      "loss": 0.5785,
+      "step": 8230
+    },
+    {
+      "epoch": 1.0566812003077712,
+      "grad_norm": 1.9447743892669678,
+      "learning_rate": 0.0012955458664614858,
+      "loss": 0.8407,
+      "step": 8240
+    },
+    {
+      "epoch": 1.057963580405232,
+      "grad_norm": 1.4368077516555786,
+      "learning_rate": 0.001294690946396512,
+      "loss": 0.6482,
+      "step": 8250
+    },
+    {
+      "epoch": 1.059245960502693,
+      "grad_norm": 1.1576851606369019,
+      "learning_rate": 0.0012938360263315381,
+      "loss": 0.7483,
+      "step": 8260
+    },
+    {
+      "epoch": 1.060528340600154,
+      "grad_norm": 0.8467000126838684,
+      "learning_rate": 0.0012929811062665642,
+      "loss": 0.5062,
+      "step": 8270
+    },
+    {
+      "epoch": 1.0618107206976148,
+      "grad_norm": 1.393489956855774,
+      "learning_rate": 0.0012921261862015904,
+      "loss": 0.6136,
+      "step": 8280
+    },
+    {
+      "epoch": 1.0630931007950757,
+      "grad_norm": 1.543832540512085,
+      "learning_rate": 0.001291271266136616,
+      "loss": 0.6688,
+      "step": 8290
+    },
+    {
+      "epoch": 1.0643754808925365,
+      "grad_norm": 1.3143365383148193,
+      "learning_rate": 0.0012904163460716422,
+      "loss": 0.7964,
+      "step": 8300
+    },
+    {
+      "epoch": 1.0656578609899974,
+      "grad_norm": 0.8674246072769165,
+      "learning_rate": 0.0012895614260066684,
+      "loss": 0.6107,
+      "step": 8310
+    },
+    {
+      "epoch": 1.0669402410874582,
+      "grad_norm": 1.773195743560791,
+      "learning_rate": 0.0012887065059416945,
+      "loss": 0.5933,
+      "step": 8320
+    },
+    {
+      "epoch": 1.0682226211849193,
+      "grad_norm": 1.678631067276001,
+      "learning_rate": 0.0012878515858767206,
+      "loss": 0.6461,
+      "step": 8330
+    },
+    {
+      "epoch": 1.0695050012823801,
+      "grad_norm": 1.5302932262420654,
+      "learning_rate": 0.0012869966658117468,
+      "loss": 0.6384,
+      "step": 8340
+    },
+    {
+      "epoch": 1.070787381379841,
+      "grad_norm": 2.0298891067504883,
+      "learning_rate": 0.0012861417457467727,
+      "loss": 0.7581,
+      "step": 8350
+    },
+    {
+      "epoch": 1.0720697614773018,
+      "grad_norm": 0.6674436926841736,
+      "learning_rate": 0.0012852868256817988,
+      "loss": 0.5885,
+      "step": 8360
+    },
+    {
+      "epoch": 1.0733521415747627,
+      "grad_norm": 2.016268491744995,
+      "learning_rate": 0.0012844319056168247,
+      "loss": 0.7515,
+      "step": 8370
+    },
+    {
+      "epoch": 1.0746345216722237,
+      "grad_norm": 0.7742639780044556,
+      "learning_rate": 0.0012835769855518509,
+      "loss": 0.8347,
+      "step": 8380
+    },
+    {
+      "epoch": 1.0759169017696846,
+      "grad_norm": 0.7427125573158264,
+      "learning_rate": 0.001282722065486877,
+      "loss": 0.6399,
+      "step": 8390
+    },
+    {
+      "epoch": 1.0771992818671454,
+      "grad_norm": 1.2341949939727783,
+      "learning_rate": 0.0012818671454219032,
+      "loss": 0.7604,
+      "step": 8400
+    },
+    {
+      "epoch": 1.0784816619646063,
+      "grad_norm": 1.360198974609375,
+      "learning_rate": 0.0012810122253569293,
+      "loss": 0.6709,
+      "step": 8410
+    },
+    {
+      "epoch": 1.079764042062067,
+      "grad_norm": 0.9719991087913513,
+      "learning_rate": 0.0012801573052919552,
+      "loss": 0.6708,
+      "step": 8420
+    },
+    {
+      "epoch": 1.0810464221595282,
+      "grad_norm": 1.0374557971954346,
+      "learning_rate": 0.0012793023852269813,
+      "loss": 0.7977,
+      "step": 8430
+    },
+    {
+      "epoch": 1.082328802256989,
+      "grad_norm": 1.6081194877624512,
+      "learning_rate": 0.0012784474651620073,
+      "loss": 0.7365,
+      "step": 8440
+    },
+    {
+      "epoch": 1.0836111823544499,
+      "grad_norm": 1.6259393692016602,
+      "learning_rate": 0.0012775925450970334,
+      "loss": 0.9605,
+      "step": 8450
+    },
+    {
+      "epoch": 1.0848935624519107,
+      "grad_norm": 1.2967445850372314,
+      "learning_rate": 0.0012767376250320595,
+      "loss": 0.6966,
+      "step": 8460
+    },
+    {
+      "epoch": 1.0861759425493716,
+      "grad_norm": 1.928068995475769,
+      "learning_rate": 0.0012758827049670857,
+      "loss": 0.7251,
+      "step": 8470
+    },
+    {
+      "epoch": 1.0874583226468326,
+      "grad_norm": 0.8280344605445862,
+      "learning_rate": 0.0012750277849021116,
+      "loss": 0.6462,
+      "step": 8480
+    },
+    {
+      "epoch": 1.0887407027442935,
+      "grad_norm": 1.297142744064331,
+      "learning_rate": 0.0012741728648371377,
+      "loss": 0.6173,
+      "step": 8490
+    },
+    {
+      "epoch": 1.0900230828417543,
+      "grad_norm": 1.008922815322876,
+      "learning_rate": 0.0012733179447721639,
+      "loss": 0.6061,
+      "step": 8500
+    },
+    {
+      "epoch": 1.0913054629392152,
+      "grad_norm": 1.0894274711608887,
+      "learning_rate": 0.00127246302470719,
+      "loss": 0.553,
+      "step": 8510
+    },
+    {
+      "epoch": 1.092587843036676,
+      "grad_norm": 0.8227475881576538,
+      "learning_rate": 0.001271608104642216,
+      "loss": 0.8644,
+      "step": 8520
+    },
+    {
+      "epoch": 1.0938702231341368,
+      "grad_norm": 1.1827560663223267,
+      "learning_rate": 0.001270753184577242,
+      "loss": 0.6479,
+      "step": 8530
+    },
+    {
+      "epoch": 1.095152603231598,
+      "grad_norm": 0.7578190565109253,
+      "learning_rate": 0.001269898264512268,
+      "loss": 0.6047,
+      "step": 8540
+    },
+    {
+      "epoch": 1.0964349833290588,
+      "grad_norm": 1.0699750185012817,
+      "learning_rate": 0.0012690433444472941,
+      "loss": 0.6204,
+      "step": 8550
+    },
+    {
+      "epoch": 1.0977173634265196,
+      "grad_norm": 0.6562129259109497,
+      "learning_rate": 0.0012681884243823203,
+      "loss": 0.7301,
+      "step": 8560
+    },
+    {
+      "epoch": 1.0989997435239804,
+      "grad_norm": 1.25229811668396,
+      "learning_rate": 0.0012673335043173464,
+      "loss": 0.7684,
+      "step": 8570
+    },
+    {
+      "epoch": 1.1002821236214415,
+      "grad_norm": 2.2153637409210205,
+      "learning_rate": 0.0012664785842523725,
+      "loss": 0.8262,
+      "step": 8580
+    },
+    {
+      "epoch": 1.1015645037189024,
+      "grad_norm": 1.5694715976715088,
+      "learning_rate": 0.0012656236641873987,
+      "loss": 0.793,
+      "step": 8590
+    },
+    {
+      "epoch": 1.1028468838163632,
+      "grad_norm": 1.0267013311386108,
+      "learning_rate": 0.0012647687441224246,
+      "loss": 0.7762,
+      "step": 8600
+    },
+    {
+      "epoch": 1.104129263913824,
+      "grad_norm": 1.9953442811965942,
+      "learning_rate": 0.0012639138240574505,
+      "loss": 0.8645,
+      "step": 8610
+    },
+    {
+      "epoch": 1.1054116440112849,
+      "grad_norm": 1.799996018409729,
+      "learning_rate": 0.0012630589039924766,
+      "loss": 0.8559,
+      "step": 8620
+    },
+    {
+      "epoch": 1.1066940241087457,
+      "grad_norm": 0.7598561644554138,
+      "learning_rate": 0.0012622039839275028,
+      "loss": 0.5443,
+      "step": 8630
+    },
+    {
+      "epoch": 1.1079764042062068,
+      "grad_norm": 1.0869311094284058,
+      "learning_rate": 0.001261349063862529,
+      "loss": 0.6119,
+      "step": 8640
+    },
+    {
+      "epoch": 1.1092587843036676,
+      "grad_norm": 3.7719571590423584,
+      "learning_rate": 0.001260494143797555,
+      "loss": 0.6665,
+      "step": 8650
+    },
+    {
+      "epoch": 1.1105411644011285,
+      "grad_norm": 0.8712837100028992,
+      "learning_rate": 0.0012596392237325812,
+      "loss": 0.6976,
+      "step": 8660
+    },
+    {
+      "epoch": 1.1118235444985893,
+      "grad_norm": 0.7456098794937134,
+      "learning_rate": 0.001258784303667607,
+      "loss": 0.531,
+      "step": 8670
+    },
+    {
+      "epoch": 1.1131059245960502,
+      "grad_norm": 0.8376585245132446,
+      "learning_rate": 0.001257929383602633,
+      "loss": 0.6767,
+      "step": 8680
+    },
+    {
+      "epoch": 1.1143883046935112,
+      "grad_norm": 0.8283999562263489,
+      "learning_rate": 0.0012570744635376592,
+      "loss": 0.7066,
+      "step": 8690
+    },
+    {
+      "epoch": 1.115670684790972,
+      "grad_norm": 1.3523050546646118,
+      "learning_rate": 0.0012562195434726853,
+      "loss": 0.8031,
+      "step": 8700
+    },
+    {
+      "epoch": 1.116953064888433,
+      "grad_norm": 0.8163665533065796,
+      "learning_rate": 0.0012553646234077114,
+      "loss": 0.6172,
+      "step": 8710
+    },
+    {
+      "epoch": 1.1182354449858938,
+      "grad_norm": 0.7833021879196167,
+      "learning_rate": 0.0012545097033427376,
+      "loss": 0.6963,
+      "step": 8720
+    },
+    {
+      "epoch": 1.1195178250833546,
+      "grad_norm": 1.102979302406311,
+      "learning_rate": 0.0012536547832777637,
+      "loss": 0.5429,
+      "step": 8730
+    },
+    {
+      "epoch": 1.1208002051808157,
+      "grad_norm": 0.6765029430389404,
+      "learning_rate": 0.0012527998632127896,
+      "loss": 0.7414,
+      "step": 8740
+    },
+    {
+      "epoch": 1.1220825852782765,
+      "grad_norm": 0.9990458488464355,
+      "learning_rate": 0.0012519449431478155,
+      "loss": 0.7734,
+      "step": 8750
+    },
+    {
+      "epoch": 1.1233649653757374,
+      "grad_norm": 0.533682107925415,
+      "learning_rate": 0.0012510900230828417,
+      "loss": 0.538,
+      "step": 8760
+    },
+    {
+      "epoch": 1.1246473454731982,
+      "grad_norm": 0.8333368301391602,
+      "learning_rate": 0.0012502351030178678,
+      "loss": 0.6792,
+      "step": 8770
+    },
+    {
+      "epoch": 1.125929725570659,
+      "grad_norm": 1.0533137321472168,
+      "learning_rate": 0.001249380182952894,
+      "loss": 0.8741,
+      "step": 8780
+    },
+    {
+      "epoch": 1.1272121056681201,
+      "grad_norm": 0.6982734203338623,
+      "learning_rate": 0.00124852526288792,
+      "loss": 0.487,
+      "step": 8790
+    },
+    {
+      "epoch": 1.128494485765581,
+      "grad_norm": 1.0995063781738281,
+      "learning_rate": 0.001247670342822946,
+      "loss": 0.6424,
+      "step": 8800
+    },
+    {
+      "epoch": 1.1297768658630418,
+      "grad_norm": 1.018072485923767,
+      "learning_rate": 0.0012468154227579722,
+      "loss": 0.5269,
+      "step": 8810
+    },
+    {
+      "epoch": 1.1310592459605027,
+      "grad_norm": 1.1999799013137817,
+      "learning_rate": 0.0012459605026929983,
+      "loss": 0.6517,
+      "step": 8820
+    },
+    {
+      "epoch": 1.1323416260579635,
+      "grad_norm": 1.4877429008483887,
+      "learning_rate": 0.0012451055826280242,
+      "loss": 0.7614,
+      "step": 8830
+    },
+    {
+      "epoch": 1.1336240061554244,
+      "grad_norm": 1.8721554279327393,
+      "learning_rate": 0.0012442506625630503,
+      "loss": 0.7041,
+      "step": 8840
+    },
+    {
+      "epoch": 1.1349063862528854,
+      "grad_norm": 0.9679120779037476,
+      "learning_rate": 0.0012433957424980765,
+      "loss": 0.6513,
+      "step": 8850
+    },
+    {
+      "epoch": 1.1361887663503463,
+      "grad_norm": 1.2489194869995117,
+      "learning_rate": 0.0012425408224331026,
+      "loss": 0.7101,
+      "step": 8860
+    },
+    {
+      "epoch": 1.1374711464478071,
+      "grad_norm": 1.4882025718688965,
+      "learning_rate": 0.0012416859023681285,
+      "loss": 0.8231,
+      "step": 8870
+    },
+    {
+      "epoch": 1.138753526545268,
+      "grad_norm": 1.0859804153442383,
+      "learning_rate": 0.0012408309823031547,
+      "loss": 0.6414,
+      "step": 8880
+    },
+    {
+      "epoch": 1.140035906642729,
+      "grad_norm": 1.027529239654541,
+      "learning_rate": 0.0012399760622381808,
+      "loss": 0.5815,
+      "step": 8890
+    },
+    {
+      "epoch": 1.1413182867401899,
+      "grad_norm": 0.7189488410949707,
+      "learning_rate": 0.001239121142173207,
+      "loss": 0.6481,
+      "step": 8900
+    },
+    {
+      "epoch": 1.1426006668376507,
+      "grad_norm": 0.4419424831867218,
+      "learning_rate": 0.0012382662221082329,
+      "loss": 0.6162,
+      "step": 8910
+    },
+    {
+      "epoch": 1.1438830469351116,
+      "grad_norm": 1.4418524503707886,
+      "learning_rate": 0.001237411302043259,
+      "loss": 0.7172,
+      "step": 8920
+    },
+    {
+      "epoch": 1.1451654270325724,
+      "grad_norm": 1.9095782041549683,
+      "learning_rate": 0.001236556381978285,
+      "loss": 0.691,
+      "step": 8930
+    },
+    {
+      "epoch": 1.1464478071300332,
+      "grad_norm": 1.3859055042266846,
+      "learning_rate": 0.001235701461913311,
+      "loss": 0.7949,
+      "step": 8940
+    },
+    {
+      "epoch": 1.1477301872274943,
+      "grad_norm": 1.1789556741714478,
+      "learning_rate": 0.0012348465418483372,
+      "loss": 0.6125,
+      "step": 8950
+    },
+    {
+      "epoch": 1.1490125673249552,
+      "grad_norm": 1.1849820613861084,
+      "learning_rate": 0.0012339916217833633,
+      "loss": 0.7081,
+      "step": 8960
+    },
+    {
+      "epoch": 1.150294947422416,
+      "grad_norm": 1.8381001949310303,
+      "learning_rate": 0.0012331367017183895,
+      "loss": 0.6178,
+      "step": 8970
+    },
+    {
+      "epoch": 1.1515773275198768,
+      "grad_norm": 1.472754955291748,
+      "learning_rate": 0.0012322817816534156,
+      "loss": 0.6599,
+      "step": 8980
+    },
+    {
+      "epoch": 1.1528597076173377,
+      "grad_norm": 1.1755315065383911,
+      "learning_rate": 0.0012314268615884413,
+      "loss": 0.6519,
+      "step": 8990
+    },
+    {
+      "epoch": 1.1541420877147988,
+      "grad_norm": 1.3931992053985596,
+      "learning_rate": 0.0012305719415234674,
+      "loss": 0.7395,
+      "step": 9000
+    },
+    {
+      "epoch": 1.1554244678122596,
+      "grad_norm": 1.171525001525879,
+      "learning_rate": 0.0012297170214584936,
+      "loss": 0.6334,
+      "step": 9010
+    },
+    {
+      "epoch": 1.1567068479097204,
+      "grad_norm": 0.9669147729873657,
+      "learning_rate": 0.0012288621013935197,
+      "loss": 0.5607,
+      "step": 9020
+    },
+    {
+      "epoch": 1.1579892280071813,
+      "grad_norm": 1.3448598384857178,
+      "learning_rate": 0.0012280071813285459,
+      "loss": 0.6036,
+      "step": 9030
+    },
+    {
+      "epoch": 1.1592716081046421,
+      "grad_norm": 0.9272229671478271,
+      "learning_rate": 0.001227152261263572,
+      "loss": 0.6345,
+      "step": 9040
+    },
+    {
+      "epoch": 1.160553988202103,
+      "grad_norm": 1.4232205152511597,
+      "learning_rate": 0.0012262973411985981,
+      "loss": 0.5926,
+      "step": 9050
+    },
+    {
+      "epoch": 1.161836368299564,
+      "grad_norm": 1.1732438802719116,
+      "learning_rate": 0.001225442421133624,
+      "loss": 0.6894,
+      "step": 9060
+    },
+    {
+      "epoch": 1.163118748397025,
+      "grad_norm": 1.3374831676483154,
+      "learning_rate": 0.00122458750106865,
+      "loss": 0.5419,
+      "step": 9070
+    },
+    {
+      "epoch": 1.1644011284944857,
+      "grad_norm": 1.0163809061050415,
+      "learning_rate": 0.001223732581003676,
+      "loss": 0.537,
+      "step": 9080
+    },
+    {
+      "epoch": 1.1656835085919466,
+      "grad_norm": 1.285212755203247,
+      "learning_rate": 0.0012228776609387022,
+      "loss": 0.6088,
+      "step": 9090
+    },
+    {
+      "epoch": 1.1669658886894076,
+      "grad_norm": 0.41504955291748047,
+      "learning_rate": 0.0012220227408737284,
+      "loss": 0.5944,
+      "step": 9100
+    },
+    {
+      "epoch": 1.1682482687868685,
+      "grad_norm": 1.1668952703475952,
+      "learning_rate": 0.0012211678208087545,
+      "loss": 0.6529,
+      "step": 9110
+    },
+    {
+      "epoch": 1.1695306488843293,
+      "grad_norm": 2.5007708072662354,
+      "learning_rate": 0.0012203129007437804,
+      "loss": 0.7975,
+      "step": 9120
+    },
+    {
+      "epoch": 1.1708130289817902,
+      "grad_norm": 0.4132268726825714,
+      "learning_rate": 0.0012194579806788066,
+      "loss": 0.5383,
+      "step": 9130
+    },
+    {
+      "epoch": 1.172095409079251,
+      "grad_norm": 0.9651444554328918,
+      "learning_rate": 0.0012186030606138325,
+      "loss": 0.6613,
+      "step": 9140
+    },
+    {
+      "epoch": 1.1733777891767119,
+      "grad_norm": 1.2722069025039673,
+      "learning_rate": 0.0012177481405488586,
+      "loss": 0.8106,
+      "step": 9150
+    },
+    {
+      "epoch": 1.174660169274173,
+      "grad_norm": 1.5842227935791016,
+      "learning_rate": 0.0012168932204838848,
+      "loss": 0.5899,
+      "step": 9160
+    },
+    {
+      "epoch": 1.1759425493716338,
+      "grad_norm": 0.7606542110443115,
+      "learning_rate": 0.001216038300418911,
+      "loss": 0.6511,
+      "step": 9170
+    },
+    {
+      "epoch": 1.1772249294690946,
+      "grad_norm": 0.9012206196784973,
+      "learning_rate": 0.001215183380353937,
+      "loss": 0.5919,
+      "step": 9180
+    },
+    {
+      "epoch": 1.1785073095665555,
+      "grad_norm": 1.250051736831665,
+      "learning_rate": 0.001214328460288963,
+      "loss": 0.6909,
+      "step": 9190
+    },
+    {
+      "epoch": 1.1797896896640165,
+      "grad_norm": 1.4063526391983032,
+      "learning_rate": 0.001213473540223989,
+      "loss": 0.5535,
+      "step": 9200
+    },
+    {
+      "epoch": 1.1810720697614774,
+      "grad_norm": 0.7005236148834229,
+      "learning_rate": 0.0012126186201590152,
+      "loss": 0.5309,
+      "step": 9210
+    },
+    {
+      "epoch": 1.1823544498589382,
+      "grad_norm": 1.317863941192627,
+      "learning_rate": 0.0012117637000940411,
+      "loss": 0.8211,
+      "step": 9220
+    },
+    {
+      "epoch": 1.183636829956399,
+      "grad_norm": 1.379496693611145,
+      "learning_rate": 0.0012109087800290673,
+      "loss": 0.8195,
+      "step": 9230
+    },
+    {
+      "epoch": 1.18491921005386,
+      "grad_norm": 0.9941421747207642,
+      "learning_rate": 0.0012100538599640934,
+      "loss": 0.7493,
+      "step": 9240
+    },
+    {
+      "epoch": 1.1862015901513208,
+      "grad_norm": 1.5360379219055176,
+      "learning_rate": 0.0012091989398991193,
+      "loss": 0.7467,
+      "step": 9250
+    },
+    {
+      "epoch": 1.1874839702487818,
+      "grad_norm": 0.7074049711227417,
+      "learning_rate": 0.0012083440198341455,
+      "loss": 0.755,
+      "step": 9260
+    },
+    {
+      "epoch": 1.1887663503462427,
+      "grad_norm": 1.1832996606826782,
+      "learning_rate": 0.0012074890997691716,
+      "loss": 0.6105,
+      "step": 9270
+    },
+    {
+      "epoch": 1.1900487304437035,
+      "grad_norm": 0.9239598512649536,
+      "learning_rate": 0.0012066341797041978,
+      "loss": 0.6783,
+      "step": 9280
+    },
+    {
+      "epoch": 1.1913311105411644,
+      "grad_norm": 1.3701421022415161,
+      "learning_rate": 0.0012057792596392239,
+      "loss": 0.6904,
+      "step": 9290
+    },
+    {
+      "epoch": 1.1926134906386252,
+      "grad_norm": 1.2199441194534302,
+      "learning_rate": 0.0012049243395742498,
+      "loss": 0.5398,
+      "step": 9300
+    },
+    {
+      "epoch": 1.1938958707360863,
+      "grad_norm": 1.273148536682129,
+      "learning_rate": 0.001204069419509276,
+      "loss": 0.5927,
+      "step": 9310
+    },
+    {
+      "epoch": 1.1951782508335471,
+      "grad_norm": 1.4068207740783691,
+      "learning_rate": 0.0012032144994443019,
+      "loss": 0.7114,
+      "step": 9320
+    },
+    {
+      "epoch": 1.196460630931008,
+      "grad_norm": 0.7752937078475952,
+      "learning_rate": 0.001202359579379328,
+      "loss": 0.7165,
+      "step": 9330
+    },
+    {
+      "epoch": 1.1977430110284688,
+      "grad_norm": 0.880491316318512,
+      "learning_rate": 0.0012015046593143541,
+      "loss": 0.6698,
+      "step": 9340
+    },
+    {
+      "epoch": 1.1990253911259297,
+      "grad_norm": 0.8572263121604919,
+      "learning_rate": 0.0012006497392493803,
+      "loss": 0.5999,
+      "step": 9350
+    },
+    {
+      "epoch": 1.2003077712233905,
+      "grad_norm": 1.0356217622756958,
+      "learning_rate": 0.0011997948191844064,
+      "loss": 0.6217,
+      "step": 9360
+    },
+    {
+      "epoch": 1.2015901513208516,
+      "grad_norm": 0.6338940262794495,
+      "learning_rate": 0.0011989398991194325,
+      "loss": 0.5119,
+      "step": 9370
+    },
+    {
+      "epoch": 1.2028725314183124,
+      "grad_norm": 0.7291190028190613,
+      "learning_rate": 0.0011980849790544582,
+      "loss": 0.5494,
+      "step": 9380
+    },
+    {
+      "epoch": 1.2041549115157733,
+      "grad_norm": 1.3608429431915283,
+      "learning_rate": 0.0011972300589894844,
+      "loss": 0.5987,
+      "step": 9390
+    },
+    {
+      "epoch": 1.205437291613234,
+      "grad_norm": 0.8818786144256592,
+      "learning_rate": 0.0011963751389245105,
+      "loss": 0.6058,
+      "step": 9400
+    },
+    {
+      "epoch": 1.2067196717106952,
+      "grad_norm": 0.4697217345237732,
+      "learning_rate": 0.0011955202188595367,
+      "loss": 0.6277,
+      "step": 9410
+    },
+    {
+      "epoch": 1.208002051808156,
+      "grad_norm": 1.4859899282455444,
+      "learning_rate": 0.0011946652987945628,
+      "loss": 0.5474,
+      "step": 9420
+    },
+    {
+      "epoch": 1.2092844319056169,
+      "grad_norm": 1.107643723487854,
+      "learning_rate": 0.001193810378729589,
+      "loss": 0.6741,
+      "step": 9430
+    },
+    {
+      "epoch": 1.2105668120030777,
+      "grad_norm": 1.3313883543014526,
+      "learning_rate": 0.001192955458664615,
+      "loss": 0.6354,
+      "step": 9440
+    },
+    {
+      "epoch": 1.2118491921005385,
+      "grad_norm": 1.3976408243179321,
+      "learning_rate": 0.0011921005385996408,
+      "loss": 0.5456,
+      "step": 9450
+    },
+    {
+      "epoch": 1.2131315721979994,
+      "grad_norm": 0.9394209384918213,
+      "learning_rate": 0.001191245618534667,
+      "loss": 0.5251,
+      "step": 9460
+    },
+    {
+      "epoch": 1.2144139522954605,
+      "grad_norm": 1.3019652366638184,
+      "learning_rate": 0.001190390698469693,
+      "loss": 0.8192,
+      "step": 9470
+    },
+    {
+      "epoch": 1.2156963323929213,
+      "grad_norm": 1.342137098312378,
+      "learning_rate": 0.0011895357784047192,
+      "loss": 0.4957,
+      "step": 9480
+    },
+    {
+      "epoch": 1.2169787124903821,
+      "grad_norm": 0.8409485220909119,
+      "learning_rate": 0.0011886808583397453,
+      "loss": 0.6699,
+      "step": 9490
+    },
+    {
+      "epoch": 1.218261092587843,
+      "grad_norm": 1.7443925142288208,
+      "learning_rate": 0.0011878259382747715,
+      "loss": 0.7271,
+      "step": 9500
+    },
+    {
+      "epoch": 1.2195434726853038,
+      "grad_norm": 1.7577857971191406,
+      "learning_rate": 0.0011869710182097974,
+      "loss": 0.5655,
+      "step": 9510
+    },
+    {
+      "epoch": 1.220825852782765,
+      "grad_norm": 1.430893063545227,
+      "learning_rate": 0.0011861160981448235,
+      "loss": 0.6315,
+      "step": 9520
+    },
+    {
+      "epoch": 1.2221082328802257,
+      "grad_norm": 0.5352253913879395,
+      "learning_rate": 0.0011852611780798494,
+      "loss": 0.6559,
+      "step": 9530
+    },
+    {
+      "epoch": 1.2233906129776866,
+      "grad_norm": 0.7444478869438171,
+      "learning_rate": 0.0011844062580148756,
+      "loss": 0.5961,
+      "step": 9540
+    },
+    {
+      "epoch": 1.2246729930751474,
+      "grad_norm": 1.430808186531067,
+      "learning_rate": 0.0011835513379499017,
+      "loss": 0.6427,
+      "step": 9550
+    },
+    {
+      "epoch": 1.2259553731726083,
+      "grad_norm": 1.0020971298217773,
+      "learning_rate": 0.0011826964178849278,
+      "loss": 0.6509,
+      "step": 9560
+    },
+    {
+      "epoch": 1.2272377532700693,
+      "grad_norm": 0.9940693974494934,
+      "learning_rate": 0.001181841497819954,
+      "loss": 0.5086,
+      "step": 9570
+    },
+    {
+      "epoch": 1.2285201333675302,
+      "grad_norm": 0.8661133050918579,
+      "learning_rate": 0.00118098657775498,
+      "loss": 0.6015,
+      "step": 9580
+    },
+    {
+      "epoch": 1.229802513464991,
+      "grad_norm": 1.14053475856781,
+      "learning_rate": 0.001180131657690006,
+      "loss": 0.544,
+      "step": 9590
+    },
+    {
+      "epoch": 1.2310848935624519,
+      "grad_norm": 0.6881473660469055,
+      "learning_rate": 0.0011792767376250322,
+      "loss": 0.5568,
+      "step": 9600
+    },
+    {
+      "epoch": 1.2323672736599127,
+      "grad_norm": 0.9339885115623474,
+      "learning_rate": 0.001178421817560058,
+      "loss": 0.7278,
+      "step": 9610
+    },
+    {
+      "epoch": 1.2336496537573738,
+      "grad_norm": 0.9663743376731873,
+      "learning_rate": 0.0011775668974950842,
+      "loss": 0.5526,
+      "step": 9620
+    },
+    {
+      "epoch": 1.2349320338548346,
+      "grad_norm": 0.5652614235877991,
+      "learning_rate": 0.0011767119774301104,
+      "loss": 0.6143,
+      "step": 9630
+    },
+    {
+      "epoch": 1.2362144139522955,
+      "grad_norm": 1.0602763891220093,
+      "learning_rate": 0.0011758570573651363,
+      "loss": 0.5072,
+      "step": 9640
+    },
+    {
+      "epoch": 1.2374967940497563,
+      "grad_norm": 1.2798588275909424,
+      "learning_rate": 0.0011750021373001624,
+      "loss": 0.4941,
+      "step": 9650
+    },
+    {
+      "epoch": 1.2387791741472172,
+      "grad_norm": 0.8834647536277771,
+      "learning_rate": 0.0011741472172351886,
+      "loss": 0.7828,
+      "step": 9660
+    },
+    {
+      "epoch": 1.240061554244678,
+      "grad_norm": 0.47825196385383606,
+      "learning_rate": 0.0011732922971702147,
+      "loss": 0.5121,
+      "step": 9670
+    },
+    {
+      "epoch": 1.241343934342139,
+      "grad_norm": 1.1528728008270264,
+      "learning_rate": 0.0011724373771052408,
+      "loss": 0.6023,
+      "step": 9680
+    },
+    {
+      "epoch": 1.2426263144396,
+      "grad_norm": 0.7429089546203613,
+      "learning_rate": 0.0011715824570402667,
+      "loss": 0.6159,
+      "step": 9690
+    },
+    {
+      "epoch": 1.2439086945370608,
+      "grad_norm": 0.700433075428009,
+      "learning_rate": 0.0011707275369752927,
+      "loss": 0.5488,
+      "step": 9700
+    },
+    {
+      "epoch": 1.2451910746345216,
+      "grad_norm": 0.9546358585357666,
+      "learning_rate": 0.0011698726169103188,
+      "loss": 0.6106,
+      "step": 9710
+    },
+    {
+      "epoch": 1.2464734547319827,
+      "grad_norm": 0.6889375448226929,
+      "learning_rate": 0.001169017696845345,
+      "loss": 0.672,
+      "step": 9720
+    },
+    {
+      "epoch": 1.2477558348294435,
+      "grad_norm": 0.6451250314712524,
+      "learning_rate": 0.001168162776780371,
+      "loss": 0.7776,
+      "step": 9730
+    },
+    {
+      "epoch": 1.2490382149269044,
+      "grad_norm": 0.6140780448913574,
+      "learning_rate": 0.0011673078567153972,
+      "loss": 0.6053,
+      "step": 9740
+    },
+    {
+      "epoch": 1.2503205950243652,
+      "grad_norm": 0.8168278932571411,
+      "learning_rate": 0.0011664529366504234,
+      "loss": 0.6648,
+      "step": 9750
+    },
+    {
+      "epoch": 1.251602975121826,
+      "grad_norm": 0.7731073498725891,
+      "learning_rate": 0.0011655980165854495,
+      "loss": 0.7018,
+      "step": 9760
+    },
+    {
+      "epoch": 1.252885355219287,
+      "grad_norm": 1.4403185844421387,
+      "learning_rate": 0.0011647430965204752,
+      "loss": 0.7392,
+      "step": 9770
+    },
+    {
+      "epoch": 1.254167735316748,
+      "grad_norm": 2.162862777709961,
+      "learning_rate": 0.0011638881764555013,
+      "loss": 0.7369,
+      "step": 9780
+    },
+    {
+      "epoch": 1.2554501154142088,
+      "grad_norm": 1.393710970878601,
+      "learning_rate": 0.0011630332563905275,
+      "loss": 0.704,
+      "step": 9790
+    },
+    {
+      "epoch": 1.2567324955116697,
+      "grad_norm": 0.6522937417030334,
+      "learning_rate": 0.0011621783363255536,
+      "loss": 0.522,
+      "step": 9800
+    },
+    {
+      "epoch": 1.2580148756091305,
+      "grad_norm": 0.6343621611595154,
+      "learning_rate": 0.0011613234162605797,
+      "loss": 0.6998,
+      "step": 9810
+    },
+    {
+      "epoch": 1.2592972557065916,
+      "grad_norm": 1.105334758758545,
+      "learning_rate": 0.0011604684961956059,
+      "loss": 0.582,
+      "step": 9820
+    },
+    {
+      "epoch": 1.2605796358040524,
+      "grad_norm": 1.2634021043777466,
+      "learning_rate": 0.0011596135761306318,
+      "loss": 0.5776,
+      "step": 9830
+    },
+    {
+      "epoch": 1.2618620159015133,
+      "grad_norm": 1.232373595237732,
+      "learning_rate": 0.0011587586560656577,
+      "loss": 0.6949,
+      "step": 9840
+    },
+    {
+      "epoch": 1.263144395998974,
+      "grad_norm": 1.2917943000793457,
+      "learning_rate": 0.0011579037360006838,
+      "loss": 0.5866,
+      "step": 9850
+    },
+    {
+      "epoch": 1.264426776096435,
+      "grad_norm": 1.0393379926681519,
+      "learning_rate": 0.00115704881593571,
+      "loss": 0.6554,
+      "step": 9860
+    },
+    {
+      "epoch": 1.2657091561938958,
+      "grad_norm": 0.9786701202392578,
+      "learning_rate": 0.0011561938958707361,
+      "loss": 0.4743,
+      "step": 9870
+    },
+    {
+      "epoch": 1.2669915362913566,
+      "grad_norm": 0.6891704201698303,
+      "learning_rate": 0.0011553389758057623,
+      "loss": 0.6969,
+      "step": 9880
+    },
+    {
+      "epoch": 1.2682739163888177,
+      "grad_norm": 1.0330877304077148,
+      "learning_rate": 0.0011544840557407884,
+      "loss": 0.5176,
+      "step": 9890
+    },
+    {
+      "epoch": 1.2695562964862785,
+      "grad_norm": 1.5313884019851685,
+      "learning_rate": 0.0011536291356758143,
+      "loss": 0.8418,
+      "step": 9900
+    },
+    {
+      "epoch": 1.2708386765837394,
+      "grad_norm": 1.8381309509277344,
+      "learning_rate": 0.0011527742156108405,
+      "loss": 0.5932,
+      "step": 9910
+    },
+    {
+      "epoch": 1.2721210566812002,
+      "grad_norm": 0.8131228685379028,
+      "learning_rate": 0.0011519192955458664,
+      "loss": 0.5982,
+      "step": 9920
+    },
+    {
+      "epoch": 1.2734034367786613,
+      "grad_norm": 0.9269918203353882,
+      "learning_rate": 0.0011510643754808925,
+      "loss": 0.588,
+      "step": 9930
+    },
+    {
+      "epoch": 1.2746858168761221,
+      "grad_norm": 1.5636909008026123,
+      "learning_rate": 0.0011502094554159186,
+      "loss": 0.638,
+      "step": 9940
+    },
+    {
+      "epoch": 1.275968196973583,
+      "grad_norm": 0.8227086067199707,
+      "learning_rate": 0.0011493545353509448,
+      "loss": 0.6211,
+      "step": 9950
+    },
+    {
+      "epoch": 1.2772505770710438,
+      "grad_norm": 0.5944573283195496,
+      "learning_rate": 0.0011484996152859707,
+      "loss": 0.65,
+      "step": 9960
+    },
+    {
+      "epoch": 1.2785329571685047,
+      "grad_norm": 1.4585204124450684,
+      "learning_rate": 0.0011476446952209968,
+      "loss": 0.465,
+      "step": 9970
+    },
+    {
+      "epoch": 1.2798153372659655,
+      "grad_norm": 0.8570374250411987,
+      "learning_rate": 0.001146789775156023,
+      "loss": 0.5458,
+      "step": 9980
+    },
+    {
+      "epoch": 1.2810977173634266,
+      "grad_norm": 1.3235441446304321,
+      "learning_rate": 0.0011459348550910491,
+      "loss": 0.5912,
+      "step": 9990
+    },
+    {
+      "epoch": 1.2823800974608874,
+      "grad_norm": 1.786232352256775,
+      "learning_rate": 0.001145079935026075,
+      "loss": 0.7141,
+      "step": 10000
+    },
+    {
+      "epoch": 1.2836624775583483,
+      "grad_norm": 1.6500744819641113,
+      "learning_rate": 0.0011442250149611012,
+      "loss": 0.7004,
+      "step": 10010
+    },
+    {
+      "epoch": 1.2849448576558091,
+      "grad_norm": 0.9735982418060303,
+      "learning_rate": 0.0011433700948961273,
+      "loss": 0.6243,
+      "step": 10020
+    },
+    {
+      "epoch": 1.2862272377532702,
+      "grad_norm": 1.611070990562439,
+      "learning_rate": 0.0011425151748311532,
+      "loss": 0.7653,
+      "step": 10030
+    },
+    {
+      "epoch": 1.287509617850731,
+      "grad_norm": 0.9978891611099243,
+      "learning_rate": 0.0011416602547661794,
+      "loss": 0.5817,
+      "step": 10040
+    },
+    {
+      "epoch": 1.2887919979481919,
+      "grad_norm": 1.2319824695587158,
+      "learning_rate": 0.0011408053347012055,
+      "loss": 0.5303,
+      "step": 10050
+    },
+    {
+      "epoch": 1.2900743780456527,
+      "grad_norm": 0.8889154195785522,
+      "learning_rate": 0.0011399504146362316,
+      "loss": 0.4797,
+      "step": 10060
+    },
+    {
+      "epoch": 1.2913567581431136,
+      "grad_norm": 0.8058596253395081,
+      "learning_rate": 0.0011390954945712578,
+      "loss": 0.6967,
+      "step": 10070
+    },
+    {
+      "epoch": 1.2926391382405744,
+      "grad_norm": 1.0289708375930786,
+      "learning_rate": 0.0011382405745062837,
+      "loss": 0.6387,
+      "step": 10080
+    },
+    {
+      "epoch": 1.2939215183380353,
+      "grad_norm": 0.7614682912826538,
+      "learning_rate": 0.0011373856544413096,
+      "loss": 0.5186,
+      "step": 10090
+    },
+    {
+      "epoch": 1.2952038984354963,
+      "grad_norm": 1.5079838037490845,
+      "learning_rate": 0.0011365307343763357,
+      "loss": 0.7524,
+      "step": 10100
+    },
+    {
+      "epoch": 1.2964862785329572,
+      "grad_norm": 1.0859569311141968,
+      "learning_rate": 0.0011356758143113619,
+      "loss": 0.669,
+      "step": 10110
+    },
+    {
+      "epoch": 1.297768658630418,
+      "grad_norm": 1.2234021425247192,
+      "learning_rate": 0.001134820894246388,
+      "loss": 0.5603,
+      "step": 10120
+    },
+    {
+      "epoch": 1.299051038727879,
+      "grad_norm": 0.7844352126121521,
+      "learning_rate": 0.0011339659741814142,
+      "loss": 0.5283,
+      "step": 10130
+    },
+    {
+      "epoch": 1.30033341882534,
+      "grad_norm": 0.7370574474334717,
+      "learning_rate": 0.0011331110541164403,
+      "loss": 0.5756,
+      "step": 10140
+    },
+    {
+      "epoch": 1.3016157989228008,
+      "grad_norm": 0.7193623185157776,
+      "learning_rate": 0.001132256134051466,
+      "loss": 0.4713,
+      "step": 10150
+    },
+    {
+      "epoch": 1.3028981790202616,
+      "grad_norm": 0.8968930244445801,
+      "learning_rate": 0.0011314012139864921,
+      "loss": 0.502,
+      "step": 10160
+    },
+    {
+      "epoch": 1.3041805591177225,
+      "grad_norm": 0.7797239422798157,
+      "learning_rate": 0.0011305462939215183,
+      "loss": 0.5768,
+      "step": 10170
+    },
+    {
+      "epoch": 1.3054629392151833,
+      "grad_norm": 1.528817892074585,
+      "learning_rate": 0.0011296913738565444,
+      "loss": 0.7379,
+      "step": 10180
+    },
+    {
+      "epoch": 1.3067453193126441,
+      "grad_norm": 0.5706043839454651,
+      "learning_rate": 0.0011288364537915705,
+      "loss": 0.4838,
+      "step": 10190
+    },
+    {
+      "epoch": 1.3080276994101052,
+      "grad_norm": 0.8248624205589294,
+      "learning_rate": 0.0011279815337265967,
+      "loss": 0.7465,
+      "step": 10200
+    },
+    {
+      "epoch": 1.309310079507566,
+      "grad_norm": 0.7821047306060791,
+      "learning_rate": 0.0011271266136616228,
+      "loss": 0.6183,
+      "step": 10210
+    },
+    {
+      "epoch": 1.310592459605027,
+      "grad_norm": 0.7619379162788391,
+      "learning_rate": 0.0011262716935966487,
+      "loss": 0.6079,
+      "step": 10220
+    },
+    {
+      "epoch": 1.3118748397024877,
+      "grad_norm": 0.5874025225639343,
+      "learning_rate": 0.0011254167735316747,
+      "loss": 0.5995,
+      "step": 10230
+    },
+    {
+      "epoch": 1.3131572197999488,
+      "grad_norm": 1.180526852607727,
+      "learning_rate": 0.0011245618534667008,
+      "loss": 0.567,
+      "step": 10240
+    },
+    {
+      "epoch": 1.3144395998974097,
+      "grad_norm": 1.1229068040847778,
+      "learning_rate": 0.001123706933401727,
+      "loss": 0.3913,
+      "step": 10250
+    },
+    {
+      "epoch": 1.3157219799948705,
+      "grad_norm": 0.6968095898628235,
+      "learning_rate": 0.001122852013336753,
+      "loss": 0.5576,
+      "step": 10260
+    },
+    {
+      "epoch": 1.3170043600923313,
+      "grad_norm": 1.0002440214157104,
+      "learning_rate": 0.0011219970932717792,
+      "loss": 0.7124,
+      "step": 10270
+    },
+    {
+      "epoch": 1.3182867401897922,
+      "grad_norm": 1.1690402030944824,
+      "learning_rate": 0.0011211421732068051,
+      "loss": 0.4824,
+      "step": 10280
+    },
+    {
+      "epoch": 1.319569120287253,
+      "grad_norm": 1.384547472000122,
+      "learning_rate": 0.0011202872531418313,
+      "loss": 0.6647,
+      "step": 10290
+    },
+    {
+      "epoch": 1.320851500384714,
+      "grad_norm": 1.943840503692627,
+      "learning_rate": 0.0011194323330768574,
+      "loss": 0.5974,
+      "step": 10300
+    },
+    {
+      "epoch": 1.322133880482175,
+      "grad_norm": 0.721809983253479,
+      "learning_rate": 0.0011185774130118833,
+      "loss": 0.601,
+      "step": 10310
+    },
+    {
+      "epoch": 1.3234162605796358,
+      "grad_norm": 0.594584584236145,
+      "learning_rate": 0.0011177224929469094,
+      "loss": 0.4628,
+      "step": 10320
+    },
+    {
+      "epoch": 1.3246986406770966,
+      "grad_norm": 1.1963189840316772,
+      "learning_rate": 0.0011168675728819356,
+      "loss": 0.5409,
+      "step": 10330
+    },
+    {
+      "epoch": 1.3259810207745577,
+      "grad_norm": 0.9252663254737854,
+      "learning_rate": 0.0011160126528169617,
+      "loss": 0.4973,
+      "step": 10340
+    },
+    {
+      "epoch": 1.3272634008720186,
+      "grad_norm": 0.7232112288475037,
+      "learning_rate": 0.0011151577327519876,
+      "loss": 0.6411,
+      "step": 10350
+    },
+    {
+      "epoch": 1.3285457809694794,
+      "grad_norm": 1.3147138357162476,
+      "learning_rate": 0.0011143028126870138,
+      "loss": 0.5462,
+      "step": 10360
+    },
+    {
+      "epoch": 1.3298281610669402,
+      "grad_norm": 1.6422502994537354,
+      "learning_rate": 0.00111344789262204,
+      "loss": 0.5364,
+      "step": 10370
+    },
+    {
+      "epoch": 1.331110541164401,
+      "grad_norm": 0.6929153203964233,
+      "learning_rate": 0.001112592972557066,
+      "loss": 0.4947,
+      "step": 10380
+    },
+    {
+      "epoch": 1.332392921261862,
+      "grad_norm": 1.4353240728378296,
+      "learning_rate": 0.001111738052492092,
+      "loss": 0.52,
+      "step": 10390
+    },
+    {
+      "epoch": 1.3336753013593228,
+      "grad_norm": 1.516634225845337,
+      "learning_rate": 0.0011108831324271181,
+      "loss": 0.6082,
+      "step": 10400
+    },
+    {
+      "epoch": 1.3349576814567838,
+      "grad_norm": 1.1383343935012817,
+      "learning_rate": 0.001110028212362144,
+      "loss": 0.663,
+      "step": 10410
+    },
+    {
+      "epoch": 1.3362400615542447,
+      "grad_norm": 1.2249925136566162,
+      "learning_rate": 0.0011091732922971702,
+      "loss": 0.6528,
+      "step": 10420
+    },
+    {
+      "epoch": 1.3375224416517055,
+      "grad_norm": 0.720862865447998,
+      "learning_rate": 0.0011083183722321963,
+      "loss": 0.5235,
+      "step": 10430
+    },
+    {
+      "epoch": 1.3388048217491664,
+      "grad_norm": 0.6571218967437744,
+      "learning_rate": 0.0011074634521672224,
+      "loss": 0.7425,
+      "step": 10440
+    },
+    {
+      "epoch": 1.3400872018466274,
+      "grad_norm": 1.3739579916000366,
+      "learning_rate": 0.0011066085321022486,
+      "loss": 0.7658,
+      "step": 10450
+    },
+    {
+      "epoch": 1.3413695819440883,
+      "grad_norm": 1.992790937423706,
+      "learning_rate": 0.0011057536120372747,
+      "loss": 0.6204,
+      "step": 10460
+    },
+    {
+      "epoch": 1.3426519620415491,
+      "grad_norm": 0.7727292776107788,
+      "learning_rate": 0.0011048986919723006,
+      "loss": 0.5973,
+      "step": 10470
+    },
+    {
+      "epoch": 1.34393434213901,
+      "grad_norm": 0.9260819554328918,
+      "learning_rate": 0.0011040437719073266,
+      "loss": 0.6831,
+      "step": 10480
+    },
+    {
+      "epoch": 1.3452167222364708,
+      "grad_norm": 0.4422336220741272,
+      "learning_rate": 0.0011031888518423527,
+      "loss": 0.5292,
+      "step": 10490
+    },
+    {
+      "epoch": 1.3464991023339317,
+      "grad_norm": 0.5913951992988586,
+      "learning_rate": 0.0011023339317773788,
+      "loss": 0.5232,
+      "step": 10500
+    },
+    {
+      "epoch": 1.3477814824313927,
+      "grad_norm": 1.8508780002593994,
+      "learning_rate": 0.001101479011712405,
+      "loss": 0.6073,
+      "step": 10510
+    },
+    {
+      "epoch": 1.3490638625288536,
+      "grad_norm": 1.7794585227966309,
+      "learning_rate": 0.001100624091647431,
+      "loss": 0.5784,
+      "step": 10520
+    },
+    {
+      "epoch": 1.3503462426263144,
+      "grad_norm": 1.4535781145095825,
+      "learning_rate": 0.0010997691715824572,
+      "loss": 0.6948,
+      "step": 10530
+    },
+    {
+      "epoch": 1.3516286227237753,
+      "grad_norm": 0.6549120545387268,
+      "learning_rate": 0.001098914251517483,
+      "loss": 0.6816,
+      "step": 10540
+    },
+    {
+      "epoch": 1.3529110028212363,
+      "grad_norm": 2.080423355102539,
+      "learning_rate": 0.001098059331452509,
+      "loss": 0.5362,
+      "step": 10550
+    },
+    {
+      "epoch": 1.3541933829186972,
+      "grad_norm": 0.6796220541000366,
+      "learning_rate": 0.0010972044113875352,
+      "loss": 0.5462,
+      "step": 10560
+    },
+    {
+      "epoch": 1.355475763016158,
+      "grad_norm": 0.9593464732170105,
+      "learning_rate": 0.0010963494913225613,
+      "loss": 0.7782,
+      "step": 10570
+    },
+    {
+      "epoch": 1.3567581431136189,
+      "grad_norm": 0.9870818853378296,
+      "learning_rate": 0.0010954945712575875,
+      "loss": 0.5401,
+      "step": 10580
+    },
+    {
+      "epoch": 1.3580405232110797,
+      "grad_norm": 1.1072885990142822,
+      "learning_rate": 0.0010946396511926136,
+      "loss": 0.5625,
+      "step": 10590
+    },
+    {
+      "epoch": 1.3593229033085406,
+      "grad_norm": 0.8635666370391846,
+      "learning_rate": 0.0010937847311276398,
+      "loss": 0.6213,
+      "step": 10600
+    },
+    {
+      "epoch": 1.3606052834060016,
+      "grad_norm": 0.6433390378952026,
+      "learning_rate": 0.0010929298110626657,
+      "loss": 0.5149,
+      "step": 10610
+    },
+    {
+      "epoch": 1.3618876635034625,
+      "grad_norm": 0.9244104623794556,
+      "learning_rate": 0.0010920748909976916,
+      "loss": 0.8149,
+      "step": 10620
+    },
+    {
+      "epoch": 1.3631700436009233,
+      "grad_norm": 1.0596814155578613,
+      "learning_rate": 0.0010912199709327177,
+      "loss": 0.6769,
+      "step": 10630
+    },
+    {
+      "epoch": 1.3644524236983842,
+      "grad_norm": 1.2836452722549438,
+      "learning_rate": 0.0010903650508677439,
+      "loss": 0.7406,
+      "step": 10640
+    },
+    {
+      "epoch": 1.3657348037958452,
+      "grad_norm": 1.069035291671753,
+      "learning_rate": 0.00108951013080277,
+      "loss": 0.5905,
+      "step": 10650
+    },
+    {
+      "epoch": 1.367017183893306,
+      "grad_norm": 0.6436813473701477,
+      "learning_rate": 0.0010886552107377961,
+      "loss": 0.6066,
+      "step": 10660
+    },
+    {
+      "epoch": 1.368299563990767,
+      "grad_norm": 1.8772107362747192,
+      "learning_rate": 0.001087800290672822,
+      "loss": 0.7121,
+      "step": 10670
+    },
+    {
+      "epoch": 1.3695819440882278,
+      "grad_norm": 0.6196737289428711,
+      "learning_rate": 0.0010869453706078482,
+      "loss": 0.6006,
+      "step": 10680
+    },
+    {
+      "epoch": 1.3708643241856886,
+      "grad_norm": 1.3433279991149902,
+      "learning_rate": 0.0010860904505428743,
+      "loss": 0.6368,
+      "step": 10690
+    },
+    {
+      "epoch": 1.3721467042831494,
+      "grad_norm": 0.9667194485664368,
+      "learning_rate": 0.0010852355304779003,
+      "loss": 0.5574,
+      "step": 10700
+    },
+    {
+      "epoch": 1.3734290843806103,
+      "grad_norm": 1.4600547552108765,
+      "learning_rate": 0.0010843806104129264,
+      "loss": 0.5626,
+      "step": 10710
+    },
+    {
+      "epoch": 1.3747114644780714,
+      "grad_norm": 0.7120881676673889,
+      "learning_rate": 0.0010835256903479525,
+      "loss": 0.6258,
+      "step": 10720
+    },
+    {
+      "epoch": 1.3759938445755322,
+      "grad_norm": 1.2124048471450806,
+      "learning_rate": 0.0010826707702829787,
+      "loss": 0.7491,
+      "step": 10730
+    },
+    {
+      "epoch": 1.377276224672993,
+      "grad_norm": 0.9732292294502258,
+      "learning_rate": 0.0010818158502180046,
+      "loss": 0.5647,
+      "step": 10740
+    },
+    {
+      "epoch": 1.3785586047704539,
+      "grad_norm": 0.7741032838821411,
+      "learning_rate": 0.0010809609301530307,
+      "loss": 0.5397,
+      "step": 10750
+    },
+    {
+      "epoch": 1.379840984867915,
+      "grad_norm": 1.0396802425384521,
+      "learning_rate": 0.0010801060100880569,
+      "loss": 0.7297,
+      "step": 10760
+    },
+    {
+      "epoch": 1.3811233649653758,
+      "grad_norm": 1.2885736227035522,
+      "learning_rate": 0.001079251090023083,
+      "loss": 0.532,
+      "step": 10770
+    },
+    {
+      "epoch": 1.3824057450628366,
+      "grad_norm": 0.7599356174468994,
+      "learning_rate": 0.001078396169958109,
+      "loss": 0.7877,
+      "step": 10780
+    },
+    {
+      "epoch": 1.3836881251602975,
+      "grad_norm": 1.040028691291809,
+      "learning_rate": 0.001077541249893135,
+      "loss": 0.5618,
+      "step": 10790
+    },
+    {
+      "epoch": 1.3849705052577583,
+      "grad_norm": 0.859203577041626,
+      "learning_rate": 0.001076686329828161,
+      "loss": 0.5895,
+      "step": 10800
+    },
+    {
+      "epoch": 1.3862528853552192,
+      "grad_norm": 0.6244560480117798,
+      "learning_rate": 0.001075831409763187,
+      "loss": 0.4826,
+      "step": 10810
+    },
+    {
+      "epoch": 1.3875352654526802,
+      "grad_norm": 0.6640686392784119,
+      "learning_rate": 0.0010749764896982132,
+      "loss": 0.6616,
+      "step": 10820
+    },
+    {
+      "epoch": 1.388817645550141,
+      "grad_norm": 1.2225605249404907,
+      "learning_rate": 0.0010741215696332394,
+      "loss": 0.6392,
+      "step": 10830
+    },
+    {
+      "epoch": 1.390100025647602,
+      "grad_norm": 0.7027501463890076,
+      "learning_rate": 0.0010732666495682655,
+      "loss": 0.5071,
+      "step": 10840
+    },
+    {
+      "epoch": 1.3913824057450628,
+      "grad_norm": 0.8924635052680969,
+      "learning_rate": 0.0010724117295032914,
+      "loss": 0.5538,
+      "step": 10850
+    },
+    {
+      "epoch": 1.3926647858425238,
+      "grad_norm": 1.6392470598220825,
+      "learning_rate": 0.0010715568094383174,
+      "loss": 0.7056,
+      "step": 10860
+    },
+    {
+      "epoch": 1.3939471659399847,
+      "grad_norm": 0.6672780513763428,
+      "learning_rate": 0.0010707018893733435,
+      "loss": 0.6533,
+      "step": 10870
+    },
+    {
+      "epoch": 1.3952295460374455,
+      "grad_norm": 0.9473418593406677,
+      "learning_rate": 0.0010698469693083696,
+      "loss": 0.6053,
+      "step": 10880
+    },
+    {
+      "epoch": 1.3965119261349064,
+      "grad_norm": 1.2938871383666992,
+      "learning_rate": 0.0010689920492433958,
+      "loss": 0.6268,
+      "step": 10890
+    },
+    {
+      "epoch": 1.3977943062323672,
+      "grad_norm": 1.0239317417144775,
+      "learning_rate": 0.001068137129178422,
+      "loss": 0.7315,
+      "step": 10900
+    },
+    {
+      "epoch": 1.399076686329828,
+      "grad_norm": 1.4379597902297974,
+      "learning_rate": 0.001067282209113448,
+      "loss": 0.6352,
+      "step": 10910
+    },
+    {
+      "epoch": 1.400359066427289,
+      "grad_norm": 1.5610178709030151,
+      "learning_rate": 0.0010664272890484742,
+      "loss": 0.6641,
+      "step": 10920
+    },
+    {
+      "epoch": 1.40164144652475,
+      "grad_norm": 0.7390224933624268,
+      "learning_rate": 0.0010655723689834999,
+      "loss": 0.5248,
+      "step": 10930
+    },
+    {
+      "epoch": 1.4029238266222108,
+      "grad_norm": 0.9852975606918335,
+      "learning_rate": 0.001064717448918526,
+      "loss": 0.4792,
+      "step": 10940
+    },
+    {
+      "epoch": 1.4042062067196717,
+      "grad_norm": 1.171047568321228,
+      "learning_rate": 0.0010638625288535522,
+      "loss": 0.7306,
+      "step": 10950
+    },
+    {
+      "epoch": 1.4054885868171327,
+      "grad_norm": 0.7043918371200562,
+      "learning_rate": 0.0010630076087885783,
+      "loss": 0.4516,
+      "step": 10960
+    },
+    {
+      "epoch": 1.4067709669145936,
+      "grad_norm": 2.092144250869751,
+      "learning_rate": 0.0010621526887236044,
+      "loss": 0.6593,
+      "step": 10970
+    },
+    {
+      "epoch": 1.4080533470120544,
+      "grad_norm": 0.4322734475135803,
+      "learning_rate": 0.0010612977686586306,
+      "loss": 0.6476,
+      "step": 10980
+    },
+    {
+      "epoch": 1.4093357271095153,
+      "grad_norm": 1.1757038831710815,
+      "learning_rate": 0.0010604428485936565,
+      "loss": 0.4645,
+      "step": 10990
+    },
+    {
+      "epoch": 1.4106181072069761,
+      "grad_norm": 2.142357587814331,
+      "learning_rate": 0.0010595879285286826,
+      "loss": 0.5823,
+      "step": 11000
+    },
+    {
+      "epoch": 1.411900487304437,
+      "grad_norm": 0.8038185834884644,
+      "learning_rate": 0.0010587330084637085,
+      "loss": 0.6456,
+      "step": 11010
+    },
+    {
+      "epoch": 1.4131828674018978,
+      "grad_norm": 0.9236948490142822,
+      "learning_rate": 0.0010578780883987347,
+      "loss": 0.4065,
+      "step": 11020
+    },
+    {
+      "epoch": 1.4144652474993589,
+      "grad_norm": 1.382051706314087,
+      "learning_rate": 0.0010570231683337608,
+      "loss": 0.5759,
+      "step": 11030
+    },
+    {
+      "epoch": 1.4157476275968197,
+      "grad_norm": 1.405614972114563,
+      "learning_rate": 0.001056168248268787,
+      "loss": 0.6169,
+      "step": 11040
+    },
+    {
+      "epoch": 1.4170300076942806,
+      "grad_norm": 0.9285224080085754,
+      "learning_rate": 0.001055313328203813,
+      "loss": 0.5831,
+      "step": 11050
+    },
+    {
+      "epoch": 1.4183123877917414,
+      "grad_norm": 0.7825279235839844,
+      "learning_rate": 0.001054458408138839,
+      "loss": 0.6827,
+      "step": 11060
+    },
+    {
+      "epoch": 1.4195947678892025,
+      "grad_norm": 2.2078566551208496,
+      "learning_rate": 0.0010536034880738651,
+      "loss": 0.6374,
+      "step": 11070
+    },
+    {
+      "epoch": 1.4208771479866633,
+      "grad_norm": 0.5845392942428589,
+      "learning_rate": 0.0010527485680088913,
+      "loss": 0.4996,
+      "step": 11080
+    },
+    {
+      "epoch": 1.4221595280841242,
+      "grad_norm": 1.3388561010360718,
+      "learning_rate": 0.0010518936479439172,
+      "loss": 0.5793,
+      "step": 11090
+    },
+    {
+      "epoch": 1.423441908181585,
+      "grad_norm": 0.7074248790740967,
+      "learning_rate": 0.0010510387278789433,
+      "loss": 0.5553,
+      "step": 11100
+    },
+    {
+      "epoch": 1.4247242882790458,
+      "grad_norm": 0.9576848149299622,
+      "learning_rate": 0.0010501838078139695,
+      "loss": 0.7863,
+      "step": 11110
+    },
+    {
+      "epoch": 1.4260066683765067,
+      "grad_norm": 1.0783685445785522,
+      "learning_rate": 0.0010493288877489954,
+      "loss": 0.5752,
+      "step": 11120
+    },
+    {
+      "epoch": 1.4272890484739678,
+      "grad_norm": 1.143621802330017,
+      "learning_rate": 0.0010484739676840215,
+      "loss": 0.5223,
+      "step": 11130
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 1.0758188962936401,
+      "learning_rate": 0.0010476190476190477,
+      "loss": 0.4724,
+      "step": 11140
+    },
+    {
+      "epoch": 1.4298538086688894,
+      "grad_norm": 0.7286604642868042,
+      "learning_rate": 0.0010467641275540738,
+      "loss": 0.493,
+      "step": 11150
+    },
+    {
+      "epoch": 1.4311361887663503,
+      "grad_norm": 0.7054126858711243,
+      "learning_rate": 0.0010459092074891,
+      "loss": 0.6056,
+      "step": 11160
+    },
+    {
+      "epoch": 1.4324185688638114,
+      "grad_norm": 0.7918633222579956,
+      "learning_rate": 0.0010450542874241259,
+      "loss": 0.5409,
+      "step": 11170
+    },
+    {
+      "epoch": 1.4337009489612722,
+      "grad_norm": 1.105986475944519,
+      "learning_rate": 0.001044199367359152,
+      "loss": 0.5131,
+      "step": 11180
+    },
+    {
+      "epoch": 1.434983329058733,
+      "grad_norm": 1.0960558652877808,
+      "learning_rate": 0.001043344447294178,
+      "loss": 0.6069,
+      "step": 11190
+    },
+    {
+      "epoch": 1.436265709156194,
+      "grad_norm": 0.9920992851257324,
+      "learning_rate": 0.001042489527229204,
+      "loss": 0.6847,
+      "step": 11200
+    },
+    {
+      "epoch": 1.4375480892536547,
+      "grad_norm": 1.1634384393692017,
+      "learning_rate": 0.0010416346071642302,
+      "loss": 0.5016,
+      "step": 11210
+    },
+    {
+      "epoch": 1.4388304693511156,
+      "grad_norm": 0.9485201239585876,
+      "learning_rate": 0.0010407796870992563,
+      "loss": 0.3805,
+      "step": 11220
+    },
+    {
+      "epoch": 1.4401128494485764,
+      "grad_norm": 0.9039535522460938,
+      "learning_rate": 0.0010399247670342825,
+      "loss": 0.4757,
+      "step": 11230
+    },
+    {
+      "epoch": 1.4413952295460375,
+      "grad_norm": 1.6443145275115967,
+      "learning_rate": 0.0010390698469693084,
+      "loss": 0.5901,
+      "step": 11240
+    },
+    {
+      "epoch": 1.4426776096434983,
+      "grad_norm": 0.9684635400772095,
+      "learning_rate": 0.0010382149269043343,
+      "loss": 0.5933,
+      "step": 11250
+    },
+    {
+      "epoch": 1.4439599897409592,
+      "grad_norm": 0.9142680764198303,
+      "learning_rate": 0.0010373600068393604,
+      "loss": 0.5636,
+      "step": 11260
+    },
+    {
+      "epoch": 1.44524236983842,
+      "grad_norm": 1.5742777585983276,
+      "learning_rate": 0.0010365050867743866,
+      "loss": 0.513,
+      "step": 11270
+    },
+    {
+      "epoch": 1.446524749935881,
+      "grad_norm": 1.9768625497817993,
+      "learning_rate": 0.0010356501667094127,
+      "loss": 0.7878,
+      "step": 11280
+    },
+    {
+      "epoch": 1.447807130033342,
+      "grad_norm": 0.7365511655807495,
+      "learning_rate": 0.0010347952466444388,
+      "loss": 0.488,
+      "step": 11290
+    },
+    {
+      "epoch": 1.4490895101308028,
+      "grad_norm": 0.7075834274291992,
+      "learning_rate": 0.001033940326579465,
+      "loss": 0.7751,
+      "step": 11300
+    },
+    {
+      "epoch": 1.4503718902282636,
+      "grad_norm": 0.7053199410438538,
+      "learning_rate": 0.0010330854065144911,
+      "loss": 0.5618,
+      "step": 11310
+    },
+    {
+      "epoch": 1.4516542703257245,
+      "grad_norm": 0.7528406977653503,
+      "learning_rate": 0.0010322304864495168,
+      "loss": 0.4849,
+      "step": 11320
+    },
+    {
+      "epoch": 1.4529366504231853,
+      "grad_norm": 0.96307772397995,
+      "learning_rate": 0.001031375566384543,
+      "loss": 0.5907,
+      "step": 11330
+    },
+    {
+      "epoch": 1.4542190305206464,
+      "grad_norm": 0.5311592817306519,
+      "learning_rate": 0.001030520646319569,
+      "loss": 0.5292,
+      "step": 11340
+    },
+    {
+      "epoch": 1.4555014106181072,
+      "grad_norm": 1.3051635026931763,
+      "learning_rate": 0.0010296657262545952,
+      "loss": 0.5106,
+      "step": 11350
+    },
+    {
+      "epoch": 1.456783790715568,
+      "grad_norm": 1.5041792392730713,
+      "learning_rate": 0.0010288108061896214,
+      "loss": 0.5676,
+      "step": 11360
+    },
+    {
+      "epoch": 1.458066170813029,
+      "grad_norm": 1.6428672075271606,
+      "learning_rate": 0.0010279558861246475,
+      "loss": 0.6384,
+      "step": 11370
+    },
+    {
+      "epoch": 1.45934855091049,
+      "grad_norm": 0.9617105722427368,
+      "learning_rate": 0.0010271009660596734,
+      "loss": 0.4024,
+      "step": 11380
+    },
+    {
+      "epoch": 1.4606309310079508,
+      "grad_norm": 1.0485490560531616,
+      "learning_rate": 0.0010262460459946996,
+      "loss": 0.5619,
+      "step": 11390
+    },
+    {
+      "epoch": 1.4619133111054117,
+      "grad_norm": 1.21506667137146,
+      "learning_rate": 0.0010253911259297255,
+      "loss": 0.4969,
+      "step": 11400
+    },
+    {
+      "epoch": 1.4631956912028725,
+      "grad_norm": 1.1984257698059082,
+      "learning_rate": 0.0010245362058647516,
+      "loss": 0.4939,
+      "step": 11410
+    },
+    {
+      "epoch": 1.4644780713003334,
+      "grad_norm": 0.8625141382217407,
+      "learning_rate": 0.0010236812857997778,
+      "loss": 0.543,
+      "step": 11420
+    },
+    {
+      "epoch": 1.4657604513977942,
+      "grad_norm": 1.3549401760101318,
+      "learning_rate": 0.0010228263657348039,
+      "loss": 0.5781,
+      "step": 11430
+    },
+    {
+      "epoch": 1.4670428314952553,
+      "grad_norm": 1.5862414836883545,
+      "learning_rate": 0.0010219714456698298,
+      "loss": 0.5567,
+      "step": 11440
+    },
+    {
+      "epoch": 1.4683252115927161,
+      "grad_norm": 0.9037706255912781,
+      "learning_rate": 0.001021116525604856,
+      "loss": 0.519,
+      "step": 11450
+    },
+    {
+      "epoch": 1.469607591690177,
+      "grad_norm": 0.9766443967819214,
+      "learning_rate": 0.001020261605539882,
+      "loss": 0.5847,
+      "step": 11460
+    },
+    {
+      "epoch": 1.4708899717876378,
+      "grad_norm": 1.0838594436645508,
+      "learning_rate": 0.0010194066854749082,
+      "loss": 0.5357,
+      "step": 11470
+    },
+    {
+      "epoch": 1.4721723518850989,
+      "grad_norm": 1.4483375549316406,
+      "learning_rate": 0.0010185517654099341,
+      "loss": 0.5475,
+      "step": 11480
+    },
+    {
+      "epoch": 1.4734547319825597,
+      "grad_norm": 0.8537694215774536,
+      "learning_rate": 0.0010176968453449603,
+      "loss": 0.6359,
+      "step": 11490
+    },
+    {
+      "epoch": 1.4747371120800206,
+      "grad_norm": 1.2811238765716553,
+      "learning_rate": 0.0010168419252799864,
+      "loss": 0.4657,
+      "step": 11500
+    },
+    {
+      "epoch": 1.4760194921774814,
+      "grad_norm": 1.3045051097869873,
+      "learning_rate": 0.0010159870052150123,
+      "loss": 0.4271,
+      "step": 11510
+    },
+    {
+      "epoch": 1.4773018722749423,
+      "grad_norm": 0.577139139175415,
+      "learning_rate": 0.0010151320851500385,
+      "loss": 0.5664,
+      "step": 11520
+    },
+    {
+      "epoch": 1.478584252372403,
+      "grad_norm": 0.9219268560409546,
+      "learning_rate": 0.0010142771650850646,
+      "loss": 0.5258,
+      "step": 11530
+    },
+    {
+      "epoch": 1.479866632469864,
+      "grad_norm": 0.6909111142158508,
+      "learning_rate": 0.0010134222450200907,
+      "loss": 0.4782,
+      "step": 11540
+    },
+    {
+      "epoch": 1.481149012567325,
+      "grad_norm": 1.5484191179275513,
+      "learning_rate": 0.0010125673249551167,
+      "loss": 0.6158,
+      "step": 11550
+    },
+    {
+      "epoch": 1.4824313926647859,
+      "grad_norm": 1.1076061725616455,
+      "learning_rate": 0.0010117124048901428,
+      "loss": 0.687,
+      "step": 11560
+    },
+    {
+      "epoch": 1.4837137727622467,
+      "grad_norm": 0.5345991253852844,
+      "learning_rate": 0.0010108574848251687,
+      "loss": 0.4507,
+      "step": 11570
+    },
+    {
+      "epoch": 1.4849961528597075,
+      "grad_norm": 1.8849554061889648,
+      "learning_rate": 0.0010100025647601949,
+      "loss": 0.7036,
+      "step": 11580
+    },
+    {
+      "epoch": 1.4862785329571686,
+      "grad_norm": 1.3229577541351318,
+      "learning_rate": 0.001009147644695221,
+      "loss": 0.6121,
+      "step": 11590
+    },
+    {
+      "epoch": 1.4875609130546295,
+      "grad_norm": 0.5365115404129028,
+      "learning_rate": 0.0010082927246302471,
+      "loss": 0.5233,
+      "step": 11600
+    },
+    {
+      "epoch": 1.4888432931520903,
+      "grad_norm": 1.0795190334320068,
+      "learning_rate": 0.0010074378045652733,
+      "loss": 0.464,
+      "step": 11610
+    },
+    {
+      "epoch": 1.4901256732495511,
+      "grad_norm": 1.2084637880325317,
+      "learning_rate": 0.0010065828845002994,
+      "loss": 0.5965,
+      "step": 11620
+    },
+    {
+      "epoch": 1.491408053347012,
+      "grad_norm": 1.16603684425354,
+      "learning_rate": 0.0010057279644353253,
+      "loss": 0.5806,
+      "step": 11630
+    },
+    {
+      "epoch": 1.4926904334444728,
+      "grad_norm": 1.2406288385391235,
+      "learning_rate": 0.0010048730443703512,
+      "loss": 0.5364,
+      "step": 11640
+    },
+    {
+      "epoch": 1.493972813541934,
+      "grad_norm": 1.1917636394500732,
+      "learning_rate": 0.0010040181243053774,
+      "loss": 0.7354,
+      "step": 11650
+    },
+    {
+      "epoch": 1.4952551936393947,
+      "grad_norm": 0.8824617862701416,
+      "learning_rate": 0.0010031632042404035,
+      "loss": 0.5803,
+      "step": 11660
+    },
+    {
+      "epoch": 1.4965375737368556,
+      "grad_norm": 1.87214195728302,
+      "learning_rate": 0.0010023082841754296,
+      "loss": 0.6652,
+      "step": 11670
+    },
+    {
+      "epoch": 1.4978199538343164,
+      "grad_norm": 1.6992979049682617,
+      "learning_rate": 0.0010014533641104558,
+      "loss": 0.6646,
+      "step": 11680
+    },
+    {
+      "epoch": 1.4991023339317775,
+      "grad_norm": 0.8672876954078674,
+      "learning_rate": 0.001000598444045482,
+      "loss": 0.6627,
+      "step": 11690
+    },
+    {
+      "epoch": 1.5003847140292383,
+      "grad_norm": 0.7643082141876221,
+      "learning_rate": 0.0009997435239805078,
+      "loss": 0.4474,
+      "step": 11700
+    },
+    {
+      "epoch": 1.5016670941266992,
+      "grad_norm": 0.6023688912391663,
+      "learning_rate": 0.000998888603915534,
+      "loss": 0.5694,
+      "step": 11710
+    },
+    {
+      "epoch": 1.50294947422416,
+      "grad_norm": 0.637225866317749,
+      "learning_rate": 0.0009980336838505601,
+      "loss": 0.5371,
+      "step": 11720
+    },
+    {
+      "epoch": 1.5042318543216209,
+      "grad_norm": 1.3987553119659424,
+      "learning_rate": 0.000997178763785586,
+      "loss": 0.5906,
+      "step": 11730
+    },
+    {
+      "epoch": 1.5055142344190817,
+      "grad_norm": 1.1652394533157349,
+      "learning_rate": 0.0009963238437206122,
+      "loss": 0.5353,
+      "step": 11740
+    },
+    {
+      "epoch": 1.5067966145165426,
+      "grad_norm": 0.8522770404815674,
+      "learning_rate": 0.0009954689236556383,
+      "loss": 0.5415,
+      "step": 11750
+    },
+    {
+      "epoch": 1.5080789946140036,
+      "grad_norm": 0.6423736810684204,
+      "learning_rate": 0.0009946140035906644,
+      "loss": 0.4791,
+      "step": 11760
+    },
+    {
+      "epoch": 1.5093613747114645,
+      "grad_norm": 1.1400604248046875,
+      "learning_rate": 0.0009937590835256904,
+      "loss": 0.7027,
+      "step": 11770
+    },
+    {
+      "epoch": 1.5106437548089253,
+      "grad_norm": 1.3665844202041626,
+      "learning_rate": 0.0009929041634607165,
+      "loss": 0.5289,
+      "step": 11780
+    },
+    {
+      "epoch": 1.5119261349063864,
+      "grad_norm": 0.5529056191444397,
+      "learning_rate": 0.0009920492433957426,
+      "loss": 0.4608,
+      "step": 11790
+    },
+    {
+      "epoch": 1.5132085150038472,
+      "grad_norm": 0.7571165561676025,
+      "learning_rate": 0.0009911943233307686,
+      "loss": 0.5328,
+      "step": 11800
+    },
+    {
+      "epoch": 1.514490895101308,
+      "grad_norm": 1.0876634120941162,
+      "learning_rate": 0.0009903394032657947,
+      "loss": 0.6374,
+      "step": 11810
+    },
+    {
+      "epoch": 1.515773275198769,
+      "grad_norm": 0.9497352242469788,
+      "learning_rate": 0.0009894844832008208,
+      "loss": 0.5999,
+      "step": 11820
+    },
+    {
+      "epoch": 1.5170556552962298,
+      "grad_norm": 1.0220736265182495,
+      "learning_rate": 0.0009886295631358467,
+      "loss": 0.5733,
+      "step": 11830
+    },
+    {
+      "epoch": 1.5183380353936906,
+      "grad_norm": 0.8994792699813843,
+      "learning_rate": 0.0009877746430708729,
+      "loss": 0.5991,
+      "step": 11840
+    },
+    {
+      "epoch": 1.5196204154911515,
+      "grad_norm": 0.9045878648757935,
+      "learning_rate": 0.000986919723005899,
+      "loss": 0.4144,
+      "step": 11850
+    },
+    {
+      "epoch": 1.5209027955886123,
+      "grad_norm": 0.8704327344894409,
+      "learning_rate": 0.000986064802940925,
+      "loss": 0.4937,
+      "step": 11860
+    },
+    {
+      "epoch": 1.5221851756860734,
+      "grad_norm": 2.1458346843719482,
+      "learning_rate": 0.000985209882875951,
+      "loss": 0.5259,
+      "step": 11870
+    },
+    {
+      "epoch": 1.5234675557835342,
+      "grad_norm": 0.5980050563812256,
+      "learning_rate": 0.0009843549628109772,
+      "loss": 0.5159,
+      "step": 11880
+    },
+    {
+      "epoch": 1.5247499358809953,
+      "grad_norm": 1.8885890245437622,
+      "learning_rate": 0.0009835000427460034,
+      "loss": 0.6933,
+      "step": 11890
+    },
+    {
+      "epoch": 1.5260323159784561,
+      "grad_norm": 1.1115361452102661,
+      "learning_rate": 0.0009826451226810293,
+      "loss": 0.48,
+      "step": 11900
+    },
+    {
+      "epoch": 1.527314696075917,
+      "grad_norm": 0.6800194978713989,
+      "learning_rate": 0.0009817902026160554,
+      "loss": 0.5799,
+      "step": 11910
+    },
+    {
+      "epoch": 1.5285970761733778,
+      "grad_norm": 0.7408868074417114,
+      "learning_rate": 0.0009809352825510815,
+      "loss": 0.4767,
+      "step": 11920
+    },
+    {
+      "epoch": 1.5298794562708387,
+      "grad_norm": 0.7443408966064453,
+      "learning_rate": 0.0009800803624861075,
+      "loss": 0.7237,
+      "step": 11930
+    },
+    {
+      "epoch": 1.5311618363682995,
+      "grad_norm": 0.7522343993186951,
+      "learning_rate": 0.0009792254424211336,
+      "loss": 0.3609,
+      "step": 11940
+    },
+    {
+      "epoch": 1.5324442164657603,
+      "grad_norm": 1.1541762351989746,
+      "learning_rate": 0.0009783705223561597,
+      "loss": 0.6063,
+      "step": 11950
+    },
+    {
+      "epoch": 1.5337265965632212,
+      "grad_norm": 0.7068589329719543,
+      "learning_rate": 0.0009775156022911857,
+      "loss": 0.4613,
+      "step": 11960
+    },
+    {
+      "epoch": 1.5350089766606823,
+      "grad_norm": 0.4808710515499115,
+      "learning_rate": 0.0009766606822262118,
+      "loss": 0.538,
+      "step": 11970
+    },
+    {
+      "epoch": 1.536291356758143,
+      "grad_norm": 0.7787117958068848,
+      "learning_rate": 0.0009758057621612379,
+      "loss": 0.5906,
+      "step": 11980
+    },
+    {
+      "epoch": 1.5375737368556042,
+      "grad_norm": 0.8264428973197937,
+      "learning_rate": 0.0009749508420962641,
+      "loss": 0.5553,
+      "step": 11990
+    },
+    {
+      "epoch": 1.538856116953065,
+      "grad_norm": 1.344869613647461,
+      "learning_rate": 0.0009740959220312901,
+      "loss": 0.4615,
+      "step": 12000
+    },
+    {
+      "epoch": 1.5401384970505259,
+      "grad_norm": 0.622340977191925,
+      "learning_rate": 0.0009732410019663161,
+      "loss": 0.5527,
+      "step": 12010
+    },
+    {
+      "epoch": 1.5414208771479867,
+      "grad_norm": 0.8521725535392761,
+      "learning_rate": 0.0009723860819013423,
+      "loss": 0.7133,
+      "step": 12020
+    },
+    {
+      "epoch": 1.5427032572454475,
+      "grad_norm": 0.7346990704536438,
+      "learning_rate": 0.0009715311618363684,
+      "loss": 0.4141,
+      "step": 12030
+    },
+    {
+      "epoch": 1.5439856373429084,
+      "grad_norm": 0.42431166768074036,
+      "learning_rate": 0.0009706762417713943,
+      "loss": 0.524,
+      "step": 12040
+    },
+    {
+      "epoch": 1.5452680174403692,
+      "grad_norm": 0.6801153421401978,
+      "learning_rate": 0.0009698213217064205,
+      "loss": 0.5191,
+      "step": 12050
+    },
+    {
+      "epoch": 1.54655039753783,
+      "grad_norm": 1.2390146255493164,
+      "learning_rate": 0.0009689664016414466,
+      "loss": 0.6671,
+      "step": 12060
+    },
+    {
+      "epoch": 1.5478327776352911,
+      "grad_norm": 1.2838438749313354,
+      "learning_rate": 0.0009681114815764726,
+      "loss": 0.5188,
+      "step": 12070
+    },
+    {
+      "epoch": 1.549115157732752,
+      "grad_norm": 1.216489315032959,
+      "learning_rate": 0.0009672565615114986,
+      "loss": 0.6267,
+      "step": 12080
+    },
+    {
+      "epoch": 1.5503975378302128,
+      "grad_norm": 1.3445849418640137,
+      "learning_rate": 0.0009664016414465248,
+      "loss": 0.5555,
+      "step": 12090
+    },
+    {
+      "epoch": 1.551679917927674,
+      "grad_norm": 0.9177038669586182,
+      "learning_rate": 0.0009655467213815508,
+      "loss": 0.5171,
+      "step": 12100
+    },
+    {
+      "epoch": 1.5529622980251347,
+      "grad_norm": 0.784768283367157,
+      "learning_rate": 0.000964691801316577,
+      "loss": 0.5645,
+      "step": 12110
+    },
+    {
+      "epoch": 1.5542446781225956,
+      "grad_norm": 1.3219481706619263,
+      "learning_rate": 0.000963836881251603,
+      "loss": 0.5687,
+      "step": 12120
+    },
+    {
+      "epoch": 1.5555270582200564,
+      "grad_norm": 1.0199742317199707,
+      "learning_rate": 0.0009629819611866291,
+      "loss": 0.4982,
+      "step": 12130
+    },
+    {
+      "epoch": 1.5568094383175173,
+      "grad_norm": 0.6068373322486877,
+      "learning_rate": 0.0009621270411216551,
+      "loss": 0.4458,
+      "step": 12140
+    },
+    {
+      "epoch": 1.5580918184149781,
+      "grad_norm": 0.7337871193885803,
+      "learning_rate": 0.0009612721210566813,
+      "loss": 0.5002,
+      "step": 12150
+    },
+    {
+      "epoch": 1.559374198512439,
+      "grad_norm": 0.9381522536277771,
+      "learning_rate": 0.0009604172009917073,
+      "loss": 0.4884,
+      "step": 12160
+    },
+    {
+      "epoch": 1.5606565786098998,
+      "grad_norm": 1.3527436256408691,
+      "learning_rate": 0.0009595622809267333,
+      "loss": 0.6701,
+      "step": 12170
+    },
+    {
+      "epoch": 1.5619389587073609,
+      "grad_norm": 0.9414606094360352,
+      "learning_rate": 0.0009587073608617595,
+      "loss": 0.6152,
+      "step": 12180
+    },
+    {
+      "epoch": 1.5632213388048217,
+      "grad_norm": 0.8174257874488831,
+      "learning_rate": 0.0009578524407967856,
+      "loss": 0.5274,
+      "step": 12190
+    },
+    {
+      "epoch": 1.5645037189022828,
+      "grad_norm": 1.5723670721054077,
+      "learning_rate": 0.0009569975207318115,
+      "loss": 0.6343,
+      "step": 12200
+    },
+    {
+      "epoch": 1.5657860989997436,
+      "grad_norm": 1.176369309425354,
+      "learning_rate": 0.0009561426006668377,
+      "loss": 0.5032,
+      "step": 12210
+    },
+    {
+      "epoch": 1.5670684790972045,
+      "grad_norm": 0.9606861472129822,
+      "learning_rate": 0.0009552876806018638,
+      "loss": 0.5375,
+      "step": 12220
+    },
+    {
+      "epoch": 1.5683508591946653,
+      "grad_norm": 0.65707927942276,
+      "learning_rate": 0.0009544327605368897,
+      "loss": 0.5424,
+      "step": 12230
+    },
+    {
+      "epoch": 1.5696332392921262,
+      "grad_norm": 1.4007467031478882,
+      "learning_rate": 0.0009535778404719159,
+      "loss": 0.6445,
+      "step": 12240
+    },
+    {
+      "epoch": 1.570915619389587,
+      "grad_norm": 1.0171335935592651,
+      "learning_rate": 0.000952722920406942,
+      "loss": 0.6483,
+      "step": 12250
+    },
+    {
+      "epoch": 1.5721979994870479,
+      "grad_norm": 1.3080893754959106,
+      "learning_rate": 0.0009518680003419681,
+      "loss": 0.6063,
+      "step": 12260
+    },
+    {
+      "epoch": 1.5734803795845087,
+      "grad_norm": 1.2589539289474487,
+      "learning_rate": 0.000951013080276994,
+      "loss": 0.5336,
+      "step": 12270
+    },
+    {
+      "epoch": 1.5747627596819698,
+      "grad_norm": 1.1046485900878906,
+      "learning_rate": 0.0009501581602120202,
+      "loss": 0.5861,
+      "step": 12280
+    },
+    {
+      "epoch": 1.5760451397794306,
+      "grad_norm": 0.5925372242927551,
+      "learning_rate": 0.0009493032401470463,
+      "loss": 0.5389,
+      "step": 12290
+    },
+    {
+      "epoch": 1.5773275198768915,
+      "grad_norm": 0.9463218450546265,
+      "learning_rate": 0.0009484483200820723,
+      "loss": 0.4316,
+      "step": 12300
+    },
+    {
+      "epoch": 1.5786098999743525,
+      "grad_norm": 1.0939500331878662,
+      "learning_rate": 0.0009475934000170984,
+      "loss": 0.5682,
+      "step": 12310
+    },
+    {
+      "epoch": 1.5798922800718134,
+      "grad_norm": 1.314713478088379,
+      "learning_rate": 0.0009467384799521245,
+      "loss": 0.7091,
+      "step": 12320
+    },
+    {
+      "epoch": 1.5811746601692742,
+      "grad_norm": 1.5223866701126099,
+      "learning_rate": 0.0009458835598871505,
+      "loss": 0.6481,
+      "step": 12330
+    },
+    {
+      "epoch": 1.582457040266735,
+      "grad_norm": 0.6593065857887268,
+      "learning_rate": 0.0009450286398221767,
+      "loss": 0.5051,
+      "step": 12340
+    },
+    {
+      "epoch": 1.583739420364196,
+      "grad_norm": 0.6195816993713379,
+      "learning_rate": 0.0009441737197572027,
+      "loss": 0.5446,
+      "step": 12350
+    },
+    {
+      "epoch": 1.5850218004616567,
+      "grad_norm": 0.5355708599090576,
+      "learning_rate": 0.0009433187996922287,
+      "loss": 0.592,
+      "step": 12360
+    },
+    {
+      "epoch": 1.5863041805591176,
+      "grad_norm": 0.5407887697219849,
+      "learning_rate": 0.0009424638796272549,
+      "loss": 0.4079,
+      "step": 12370
+    },
+    {
+      "epoch": 1.5875865606565787,
+      "grad_norm": 0.8656293153762817,
+      "learning_rate": 0.000941608959562281,
+      "loss": 0.6232,
+      "step": 12380
+    },
+    {
+      "epoch": 1.5888689407540395,
+      "grad_norm": 1.0088326930999756,
+      "learning_rate": 0.0009407540394973069,
+      "loss": 0.5646,
+      "step": 12390
+    },
+    {
+      "epoch": 1.5901513208515003,
+      "grad_norm": 1.7522426843643188,
+      "learning_rate": 0.0009398991194323331,
+      "loss": 0.6204,
+      "step": 12400
+    },
+    {
+      "epoch": 1.5914337009489614,
+      "grad_norm": 1.3139151334762573,
+      "learning_rate": 0.0009390441993673592,
+      "loss": 0.5363,
+      "step": 12410
+    },
+    {
+      "epoch": 1.5927160810464223,
+      "grad_norm": 1.4125381708145142,
+      "learning_rate": 0.0009381892793023853,
+      "loss": 0.5504,
+      "step": 12420
+    },
+    {
+      "epoch": 1.593998461143883,
+      "grad_norm": 1.0490381717681885,
+      "learning_rate": 0.0009373343592374113,
+      "loss": 0.5976,
+      "step": 12430
+    },
+    {
+      "epoch": 1.595280841241344,
+      "grad_norm": 0.8534132242202759,
+      "learning_rate": 0.0009364794391724374,
+      "loss": 0.5671,
+      "step": 12440
+    },
+    {
+      "epoch": 1.5965632213388048,
+      "grad_norm": 0.882533848285675,
+      "learning_rate": 0.0009356245191074635,
+      "loss": 0.5291,
+      "step": 12450
+    },
+    {
+      "epoch": 1.5978456014362656,
+      "grad_norm": 0.9412997364997864,
+      "learning_rate": 0.0009347695990424896,
+      "loss": 0.5206,
+      "step": 12460
+    },
+    {
+      "epoch": 1.5991279815337265,
+      "grad_norm": 0.8173903822898865,
+      "learning_rate": 0.0009339146789775156,
+      "loss": 0.6253,
+      "step": 12470
+    },
+    {
+      "epoch": 1.6004103616311873,
+      "grad_norm": 1.2541340589523315,
+      "learning_rate": 0.0009330597589125417,
+      "loss": 0.6774,
+      "step": 12480
+    },
+    {
+      "epoch": 1.6016927417286484,
+      "grad_norm": 0.8679422736167908,
+      "learning_rate": 0.0009322048388475678,
+      "loss": 0.4629,
+      "step": 12490
+    },
+    {
+      "epoch": 1.6029751218261092,
+      "grad_norm": 0.8610258102416992,
+      "learning_rate": 0.0009313499187825939,
+      "loss": 0.575,
+      "step": 12500
+    },
+    {
+      "epoch": 1.6042575019235703,
+      "grad_norm": 0.6528512835502625,
+      "learning_rate": 0.0009304949987176199,
+      "loss": 0.4773,
+      "step": 12510
+    },
+    {
+      "epoch": 1.6055398820210312,
+      "grad_norm": 0.7231767773628235,
+      "learning_rate": 0.0009296400786526459,
+      "loss": 0.643,
+      "step": 12520
+    },
+    {
+      "epoch": 1.606822262118492,
+      "grad_norm": 1.6632248163223267,
+      "learning_rate": 0.0009287851585876721,
+      "loss": 0.7517,
+      "step": 12530
+    },
+    {
+      "epoch": 1.6081046422159528,
+      "grad_norm": 0.7457917332649231,
+      "learning_rate": 0.0009279302385226982,
+      "loss": 0.3932,
+      "step": 12540
+    },
+    {
+      "epoch": 1.6093870223134137,
+      "grad_norm": 0.8939314484596252,
+      "learning_rate": 0.0009270753184577242,
+      "loss": 0.6226,
+      "step": 12550
+    },
+    {
+      "epoch": 1.6106694024108745,
+      "grad_norm": 0.4794626235961914,
+      "learning_rate": 0.0009262203983927503,
+      "loss": 0.5804,
+      "step": 12560
+    },
+    {
+      "epoch": 1.6119517825083354,
+      "grad_norm": 0.7062269449234009,
+      "learning_rate": 0.0009253654783277764,
+      "loss": 0.4038,
+      "step": 12570
+    },
+    {
+      "epoch": 1.6132341626057962,
+      "grad_norm": 0.6264899373054504,
+      "learning_rate": 0.0009245105582628024,
+      "loss": 0.7183,
+      "step": 12580
+    },
+    {
+      "epoch": 1.6145165427032573,
+      "grad_norm": 0.5829173922538757,
+      "learning_rate": 0.0009236556381978285,
+      "loss": 0.61,
+      "step": 12590
+    },
+    {
+      "epoch": 1.6157989228007181,
+      "grad_norm": 0.9000409841537476,
+      "learning_rate": 0.0009228007181328546,
+      "loss": 0.6004,
+      "step": 12600
+    },
+    {
+      "epoch": 1.617081302898179,
+      "grad_norm": 0.5521026849746704,
+      "learning_rate": 0.0009219457980678807,
+      "loss": 0.5799,
+      "step": 12610
+    },
+    {
+      "epoch": 1.61836368299564,
+      "grad_norm": 0.9383637309074402,
+      "learning_rate": 0.0009210908780029067,
+      "loss": 0.4902,
+      "step": 12620
+    },
+    {
+      "epoch": 1.6196460630931009,
+      "grad_norm": 1.1581825017929077,
+      "learning_rate": 0.0009202359579379328,
+      "loss": 0.4182,
+      "step": 12630
+    },
+    {
+      "epoch": 1.6209284431905617,
+      "grad_norm": 2.7146356105804443,
+      "learning_rate": 0.0009193810378729589,
+      "loss": 0.6339,
+      "step": 12640
+    },
+    {
+      "epoch": 1.6222108232880226,
+      "grad_norm": 0.607458233833313,
+      "learning_rate": 0.000918526117807985,
+      "loss": 0.7317,
+      "step": 12650
+    },
+    {
+      "epoch": 1.6234932033854834,
+      "grad_norm": 0.8015214800834656,
+      "learning_rate": 0.000917671197743011,
+      "loss": 0.5006,
+      "step": 12660
+    },
+    {
+      "epoch": 1.6247755834829443,
+      "grad_norm": 0.9352098703384399,
+      "learning_rate": 0.0009168162776780371,
+      "loss": 0.5692,
+      "step": 12670
+    },
+    {
+      "epoch": 1.626057963580405,
+      "grad_norm": 1.3403977155685425,
+      "learning_rate": 0.0009159613576130632,
+      "loss": 0.3569,
+      "step": 12680
+    },
+    {
+      "epoch": 1.6273403436778662,
+      "grad_norm": 0.9648029804229736,
+      "learning_rate": 0.0009151064375480893,
+      "loss": 0.6767,
+      "step": 12690
+    },
+    {
+      "epoch": 1.628622723775327,
+      "grad_norm": 0.7948251962661743,
+      "learning_rate": 0.0009142515174831153,
+      "loss": 0.5911,
+      "step": 12700
+    },
+    {
+      "epoch": 1.6299051038727879,
+      "grad_norm": 0.9088913798332214,
+      "learning_rate": 0.0009133965974181415,
+      "loss": 0.7068,
+      "step": 12710
+    },
+    {
+      "epoch": 1.631187483970249,
+      "grad_norm": 0.6906175017356873,
+      "learning_rate": 0.0009125416773531675,
+      "loss": 0.7111,
+      "step": 12720
+    },
+    {
+      "epoch": 1.6324698640677098,
+      "grad_norm": 1.3398417234420776,
+      "learning_rate": 0.0009116867572881936,
+      "loss": 0.7023,
+      "step": 12730
+    },
+    {
+      "epoch": 1.6337522441651706,
+      "grad_norm": 1.101651906967163,
+      "learning_rate": 0.0009108318372232196,
+      "loss": 0.6276,
+      "step": 12740
+    },
+    {
+      "epoch": 1.6350346242626315,
+      "grad_norm": 1.373275637626648,
+      "learning_rate": 0.0009099769171582457,
+      "loss": 0.5836,
+      "step": 12750
+    },
+    {
+      "epoch": 1.6363170043600923,
+      "grad_norm": 1.2274094820022583,
+      "learning_rate": 0.0009091219970932718,
+      "loss": 0.7078,
+      "step": 12760
+    },
+    {
+      "epoch": 1.6375993844575532,
+      "grad_norm": 1.5747358798980713,
+      "learning_rate": 0.000908267077028298,
+      "loss": 0.5668,
+      "step": 12770
+    },
+    {
+      "epoch": 1.638881764555014,
+      "grad_norm": 0.8394151329994202,
+      "learning_rate": 0.0009074121569633239,
+      "loss": 0.4556,
+      "step": 12780
+    },
+    {
+      "epoch": 1.6401641446524748,
+      "grad_norm": 0.7515396475791931,
+      "learning_rate": 0.00090655723689835,
+      "loss": 0.5956,
+      "step": 12790
+    },
+    {
+      "epoch": 1.641446524749936,
+      "grad_norm": 1.009969711303711,
+      "learning_rate": 0.0009057023168333761,
+      "loss": 0.4052,
+      "step": 12800
+    },
+    {
+      "epoch": 1.6427289048473968,
+      "grad_norm": 1.0753581523895264,
+      "learning_rate": 0.0009048473967684022,
+      "loss": 0.4942,
+      "step": 12810
+    },
+    {
+      "epoch": 1.6440112849448578,
+      "grad_norm": 0.8667649030685425,
+      "learning_rate": 0.0009039924767034282,
+      "loss": 0.5448,
+      "step": 12820
+    },
+    {
+      "epoch": 1.6452936650423187,
+      "grad_norm": 0.7869744300842285,
+      "learning_rate": 0.0009031375566384543,
+      "loss": 0.4488,
+      "step": 12830
+    },
+    {
+      "epoch": 1.6465760451397795,
+      "grad_norm": 0.9994969367980957,
+      "learning_rate": 0.0009022826365734805,
+      "loss": 0.4666,
+      "step": 12840
+    },
+    {
+      "epoch": 1.6478584252372404,
+      "grad_norm": 1.0947333574295044,
+      "learning_rate": 0.0009014277165085065,
+      "loss": 0.5769,
+      "step": 12850
+    },
+    {
+      "epoch": 1.6491408053347012,
+      "grad_norm": 0.44217410683631897,
+      "learning_rate": 0.0009005727964435325,
+      "loss": 0.4351,
+      "step": 12860
+    },
+    {
+      "epoch": 1.650423185432162,
+      "grad_norm": 1.5107439756393433,
+      "learning_rate": 0.0008997178763785587,
+      "loss": 0.4896,
+      "step": 12870
+    },
+    {
+      "epoch": 1.6517055655296229,
+      "grad_norm": 1.3806378841400146,
+      "learning_rate": 0.0008988629563135847,
+      "loss": 0.4819,
+      "step": 12880
+    },
+    {
+      "epoch": 1.6529879456270837,
+      "grad_norm": 1.0684335231781006,
+      "learning_rate": 0.0008980080362486108,
+      "loss": 0.5076,
+      "step": 12890
+    },
+    {
+      "epoch": 1.6542703257245448,
+      "grad_norm": 0.6249770522117615,
+      "learning_rate": 0.0008971531161836369,
+      "loss": 0.5086,
+      "step": 12900
+    },
+    {
+      "epoch": 1.6555527058220056,
+      "grad_norm": 0.8188676238059998,
+      "learning_rate": 0.0008962981961186629,
+      "loss": 0.4622,
+      "step": 12910
+    },
+    {
+      "epoch": 1.6568350859194665,
+      "grad_norm": 0.6623940467834473,
+      "learning_rate": 0.000895443276053689,
+      "loss": 0.5601,
+      "step": 12920
+    },
+    {
+      "epoch": 1.6581174660169276,
+      "grad_norm": 0.7788714170455933,
+      "learning_rate": 0.000894588355988715,
+      "loss": 0.4996,
+      "step": 12930
+    },
+    {
+      "epoch": 1.6593998461143884,
+      "grad_norm": 1.1400748491287231,
+      "learning_rate": 0.0008937334359237411,
+      "loss": 0.6324,
+      "step": 12940
+    },
+    {
+      "epoch": 1.6606822262118492,
+      "grad_norm": 0.873874306678772,
+      "learning_rate": 0.0008928785158587672,
+      "loss": 0.4832,
+      "step": 12950
+    },
+    {
+      "epoch": 1.66196460630931,
+      "grad_norm": 1.9320780038833618,
+      "learning_rate": 0.0008920235957937934,
+      "loss": 0.5149,
+      "step": 12960
+    },
+    {
+      "epoch": 1.663246986406771,
+      "grad_norm": 0.7874430418014526,
+      "learning_rate": 0.0008911686757288193,
+      "loss": 0.5972,
+      "step": 12970
+    },
+    {
+      "epoch": 1.6645293665042318,
+      "grad_norm": 0.5509324073791504,
+      "learning_rate": 0.0008903137556638454,
+      "loss": 0.6655,
+      "step": 12980
+    },
+    {
+      "epoch": 1.6658117466016926,
+      "grad_norm": 1.294395089149475,
+      "learning_rate": 0.0008894588355988715,
+      "loss": 0.5809,
+      "step": 12990
+    },
+    {
+      "epoch": 1.6670941266991535,
+      "grad_norm": 1.0513406991958618,
+      "learning_rate": 0.0008886039155338977,
+      "loss": 0.5312,
+      "step": 13000
+    },
+    {
+      "epoch": 1.6683765067966145,
+      "grad_norm": 0.7653344869613647,
+      "learning_rate": 0.0008877489954689236,
+      "loss": 0.5772,
+      "step": 13010
+    },
+    {
+      "epoch": 1.6696588868940754,
+      "grad_norm": 0.8619967103004456,
+      "learning_rate": 0.0008868940754039497,
+      "loss": 0.5977,
+      "step": 13020
+    },
+    {
+      "epoch": 1.6709412669915364,
+      "grad_norm": 0.5212082862854004,
+      "learning_rate": 0.0008860391553389759,
+      "loss": 0.514,
+      "step": 13030
+    },
+    {
+      "epoch": 1.6722236470889973,
+      "grad_norm": 1.150930404663086,
+      "learning_rate": 0.0008851842352740019,
+      "loss": 0.4911,
+      "step": 13040
+    },
+    {
+      "epoch": 1.6735060271864581,
+      "grad_norm": 1.1434667110443115,
+      "learning_rate": 0.0008843293152090279,
+      "loss": 0.6873,
+      "step": 13050
+    },
+    {
+      "epoch": 1.674788407283919,
+      "grad_norm": 0.8454691171646118,
+      "learning_rate": 0.0008834743951440541,
+      "loss": 0.5696,
+      "step": 13060
+    },
+    {
+      "epoch": 1.6760707873813798,
+      "grad_norm": 0.9413100481033325,
+      "learning_rate": 0.0008826194750790801,
+      "loss": 0.5203,
+      "step": 13070
+    },
+    {
+      "epoch": 1.6773531674788407,
+      "grad_norm": 1.1929399967193604,
+      "learning_rate": 0.0008817645550141062,
+      "loss": 0.6685,
+      "step": 13080
+    },
+    {
+      "epoch": 1.6786355475763015,
+      "grad_norm": 0.7188916206359863,
+      "learning_rate": 0.0008809096349491323,
+      "loss": 0.5879,
+      "step": 13090
+    },
+    {
+      "epoch": 1.6799179276737624,
+      "grad_norm": 1.4014157056808472,
+      "learning_rate": 0.0008800547148841583,
+      "loss": 0.7174,
+      "step": 13100
+    },
+    {
+      "epoch": 1.6812003077712234,
+      "grad_norm": 1.280308485031128,
+      "learning_rate": 0.0008791997948191844,
+      "loss": 0.6408,
+      "step": 13110
+    },
+    {
+      "epoch": 1.6824826878686843,
+      "grad_norm": 0.7468828558921814,
+      "learning_rate": 0.0008783448747542106,
+      "loss": 0.5482,
+      "step": 13120
+    },
+    {
+      "epoch": 1.6837650679661451,
+      "grad_norm": 1.2528510093688965,
+      "learning_rate": 0.0008774899546892366,
+      "loss": 0.5617,
+      "step": 13130
+    },
+    {
+      "epoch": 1.6850474480636062,
+      "grad_norm": 0.8010075092315674,
+      "learning_rate": 0.0008766350346242626,
+      "loss": 0.497,
+      "step": 13140
+    },
+    {
+      "epoch": 1.686329828161067,
+      "grad_norm": 0.6345623731613159,
+      "learning_rate": 0.0008757801145592888,
+      "loss": 0.5155,
+      "step": 13150
+    },
+    {
+      "epoch": 1.6876122082585279,
+      "grad_norm": 0.8836259245872498,
+      "learning_rate": 0.0008749251944943149,
+      "loss": 0.5199,
+      "step": 13160
+    },
+    {
+      "epoch": 1.6888945883559887,
+      "grad_norm": 1.7731555700302124,
+      "learning_rate": 0.0008740702744293408,
+      "loss": 0.7202,
+      "step": 13170
+    },
+    {
+      "epoch": 1.6901769684534496,
+      "grad_norm": 0.9856127500534058,
+      "learning_rate": 0.0008732153543643669,
+      "loss": 0.5554,
+      "step": 13180
+    },
+    {
+      "epoch": 1.6914593485509104,
+      "grad_norm": 0.678236722946167,
+      "learning_rate": 0.0008723604342993931,
+      "loss": 0.5101,
+      "step": 13190
+    },
+    {
+      "epoch": 1.6927417286483712,
+      "grad_norm": 1.093641996383667,
+      "learning_rate": 0.0008715055142344191,
+      "loss": 0.4779,
+      "step": 13200
+    },
+    {
+      "epoch": 1.6940241087458323,
+      "grad_norm": 0.8706515431404114,
+      "learning_rate": 0.0008706505941694451,
+      "loss": 0.5454,
+      "step": 13210
+    },
+    {
+      "epoch": 1.6953064888432932,
+      "grad_norm": 1.9918123483657837,
+      "learning_rate": 0.0008697956741044713,
+      "loss": 0.5804,
+      "step": 13220
+    },
+    {
+      "epoch": 1.696588868940754,
+      "grad_norm": 0.68386310338974,
+      "learning_rate": 0.0008689407540394973,
+      "loss": 0.5132,
+      "step": 13230
+    },
+    {
+      "epoch": 1.697871249038215,
+      "grad_norm": 0.9491919279098511,
+      "learning_rate": 0.0008680858339745234,
+      "loss": 0.5333,
+      "step": 13240
+    },
+    {
+      "epoch": 1.699153629135676,
+      "grad_norm": 1.1484148502349854,
+      "learning_rate": 0.0008672309139095495,
+      "loss": 0.5035,
+      "step": 13250
+    },
+    {
+      "epoch": 1.7004360092331368,
+      "grad_norm": 0.6695376038551331,
+      "learning_rate": 0.0008663759938445755,
+      "loss": 0.4074,
+      "step": 13260
+    },
+    {
+      "epoch": 1.7017183893305976,
+      "grad_norm": 1.1360992193222046,
+      "learning_rate": 0.0008655210737796016,
+      "loss": 0.4778,
+      "step": 13270
+    },
+    {
+      "epoch": 1.7030007694280584,
+      "grad_norm": 0.9604383111000061,
+      "learning_rate": 0.0008646661537146277,
+      "loss": 0.5601,
+      "step": 13280
+    },
+    {
+      "epoch": 1.7042831495255193,
+      "grad_norm": 1.3364579677581787,
+      "learning_rate": 0.0008638112336496538,
+      "loss": 0.4396,
+      "step": 13290
+    },
+    {
+      "epoch": 1.7055655296229801,
+      "grad_norm": 0.6195886731147766,
+      "learning_rate": 0.0008629563135846798,
+      "loss": 0.5143,
+      "step": 13300
+    },
+    {
+      "epoch": 1.706847909720441,
+      "grad_norm": 0.9771988987922668,
+      "learning_rate": 0.000862101393519706,
+      "loss": 0.504,
+      "step": 13310
+    },
+    {
+      "epoch": 1.708130289817902,
+      "grad_norm": 1.2056382894515991,
+      "learning_rate": 0.000861246473454732,
+      "loss": 0.5157,
+      "step": 13320
+    },
+    {
+      "epoch": 1.709412669915363,
+      "grad_norm": 1.0065749883651733,
+      "learning_rate": 0.000860391553389758,
+      "loss": 0.5178,
+      "step": 13330
+    },
+    {
+      "epoch": 1.710695050012824,
+      "grad_norm": 0.5957716703414917,
+      "learning_rate": 0.0008595366333247842,
+      "loss": 0.4264,
+      "step": 13340
+    },
+    {
+      "epoch": 1.7119774301102848,
+      "grad_norm": 0.7101840376853943,
+      "learning_rate": 0.0008586817132598103,
+      "loss": 0.4966,
+      "step": 13350
+    },
+    {
+      "epoch": 1.7132598102077456,
+      "grad_norm": 0.7323270440101624,
+      "learning_rate": 0.0008578267931948362,
+      "loss": 0.427,
+      "step": 13360
+    },
+    {
+      "epoch": 1.7145421903052065,
+      "grad_norm": 0.798598051071167,
+      "learning_rate": 0.0008569718731298623,
+      "loss": 0.3831,
+      "step": 13370
+    },
+    {
+      "epoch": 1.7158245704026673,
+      "grad_norm": 0.7318591475486755,
+      "learning_rate": 0.0008561169530648885,
+      "loss": 0.5691,
+      "step": 13380
+    },
+    {
+      "epoch": 1.7171069505001282,
+      "grad_norm": 0.951507031917572,
+      "learning_rate": 0.0008552620329999145,
+      "loss": 0.8433,
+      "step": 13390
+    },
+    {
+      "epoch": 1.718389330597589,
+      "grad_norm": 0.7206411361694336,
+      "learning_rate": 0.0008544071129349405,
+      "loss": 0.5138,
+      "step": 13400
+    },
+    {
+      "epoch": 1.7196717106950499,
+      "grad_norm": 0.9043763875961304,
+      "learning_rate": 0.0008535521928699667,
+      "loss": 0.5955,
+      "step": 13410
+    },
+    {
+      "epoch": 1.720954090792511,
+      "grad_norm": 2.0460662841796875,
+      "learning_rate": 0.0008526972728049928,
+      "loss": 0.5227,
+      "step": 13420
+    },
+    {
+      "epoch": 1.7222364708899718,
+      "grad_norm": 1.167067289352417,
+      "learning_rate": 0.0008518423527400188,
+      "loss": 0.4719,
+      "step": 13430
+    },
+    {
+      "epoch": 1.7235188509874326,
+      "grad_norm": 0.7188780307769775,
+      "learning_rate": 0.0008509874326750449,
+      "loss": 0.5359,
+      "step": 13440
+    },
+    {
+      "epoch": 1.7248012310848937,
+      "grad_norm": 1.6519628763198853,
+      "learning_rate": 0.000850132512610071,
+      "loss": 0.5869,
+      "step": 13450
+    },
+    {
+      "epoch": 1.7260836111823545,
+      "grad_norm": 1.1188615560531616,
+      "learning_rate": 0.000849277592545097,
+      "loss": 0.6952,
+      "step": 13460
+    },
+    {
+      "epoch": 1.7273659912798154,
+      "grad_norm": 0.7881381511688232,
+      "learning_rate": 0.0008484226724801232,
+      "loss": 0.5809,
+      "step": 13470
+    },
+    {
+      "epoch": 1.7286483713772762,
+      "grad_norm": 0.39665651321411133,
+      "learning_rate": 0.0008475677524151492,
+      "loss": 0.6488,
+      "step": 13480
+    },
+    {
+      "epoch": 1.729930751474737,
+      "grad_norm": 0.7501810193061829,
+      "learning_rate": 0.0008467128323501752,
+      "loss": 0.6566,
+      "step": 13490
+    },
+    {
+      "epoch": 1.731213131572198,
+      "grad_norm": 0.7444621324539185,
+      "learning_rate": 0.0008458579122852014,
+      "loss": 0.559,
+      "step": 13500
+    },
+    {
+      "epoch": 1.7324955116696588,
+      "grad_norm": 1.1984418630599976,
+      "learning_rate": 0.0008450029922202275,
+      "loss": 0.5926,
+      "step": 13510
+    },
+    {
+      "epoch": 1.7337778917671198,
+      "grad_norm": 1.1419836282730103,
+      "learning_rate": 0.0008441480721552534,
+      "loss": 0.4816,
+      "step": 13520
+    },
+    {
+      "epoch": 1.7350602718645807,
+      "grad_norm": 1.021096110343933,
+      "learning_rate": 0.0008432931520902796,
+      "loss": 0.534,
+      "step": 13530
+    },
+    {
+      "epoch": 1.7363426519620415,
+      "grad_norm": 0.5502016544342041,
+      "learning_rate": 0.0008424382320253057,
+      "loss": 0.5077,
+      "step": 13540
+    },
+    {
+      "epoch": 1.7376250320595026,
+      "grad_norm": 1.1149070262908936,
+      "learning_rate": 0.0008415833119603318,
+      "loss": 0.6677,
+      "step": 13550
+    },
+    {
+      "epoch": 1.7389074121569634,
+      "grad_norm": 1.017102837562561,
+      "learning_rate": 0.0008407283918953577,
+      "loss": 0.5076,
+      "step": 13560
+    },
+    {
+      "epoch": 1.7401897922544243,
+      "grad_norm": 1.4042975902557373,
+      "learning_rate": 0.0008398734718303839,
+      "loss": 0.588,
+      "step": 13570
+    },
+    {
+      "epoch": 1.7414721723518851,
+      "grad_norm": 1.2214784622192383,
+      "learning_rate": 0.00083901855176541,
+      "loss": 0.6285,
+      "step": 13580
+    },
+    {
+      "epoch": 1.742754552449346,
+      "grad_norm": 0.9134330153465271,
+      "learning_rate": 0.000838163631700436,
+      "loss": 0.4326,
+      "step": 13590
+    },
+    {
+      "epoch": 1.7440369325468068,
+      "grad_norm": 1.0869004726409912,
+      "learning_rate": 0.0008373087116354621,
+      "loss": 0.4825,
+      "step": 13600
+    },
+    {
+      "epoch": 1.7453193126442677,
+      "grad_norm": 0.8992425799369812,
+      "learning_rate": 0.0008364537915704882,
+      "loss": 0.6364,
+      "step": 13610
+    },
+    {
+      "epoch": 1.7466016927417285,
+      "grad_norm": 1.2545522451400757,
+      "learning_rate": 0.0008355988715055142,
+      "loss": 0.4515,
+      "step": 13620
+    },
+    {
+      "epoch": 1.7478840728391896,
+      "grad_norm": 0.7109204530715942,
+      "learning_rate": 0.0008347439514405403,
+      "loss": 0.5838,
+      "step": 13630
+    },
+    {
+      "epoch": 1.7491664529366504,
+      "grad_norm": 1.2190492153167725,
+      "learning_rate": 0.0008338890313755664,
+      "loss": 0.4962,
+      "step": 13640
+    },
+    {
+      "epoch": 1.7504488330341115,
+      "grad_norm": 0.9201902151107788,
+      "learning_rate": 0.0008330341113105924,
+      "loss": 0.4827,
+      "step": 13650
+    },
+    {
+      "epoch": 1.7517312131315723,
+      "grad_norm": 1.5981885194778442,
+      "learning_rate": 0.0008321791912456186,
+      "loss": 0.6323,
+      "step": 13660
+    },
+    {
+      "epoch": 1.7530135932290332,
+      "grad_norm": 0.8127802014350891,
+      "learning_rate": 0.0008313242711806446,
+      "loss": 0.407,
+      "step": 13670
+    },
+    {
+      "epoch": 1.754295973326494,
+      "grad_norm": 0.7639079689979553,
+      "learning_rate": 0.0008304693511156706,
+      "loss": 0.455,
+      "step": 13680
+    },
+    {
+      "epoch": 1.7555783534239549,
+      "grad_norm": 1.8039822578430176,
+      "learning_rate": 0.0008296144310506968,
+      "loss": 0.53,
+      "step": 13690
+    },
+    {
+      "epoch": 1.7568607335214157,
+      "grad_norm": 1.4500998258590698,
+      "learning_rate": 0.0008287595109857229,
+      "loss": 0.6162,
+      "step": 13700
+    },
+    {
+      "epoch": 1.7581431136188765,
+      "grad_norm": 2.520433187484741,
+      "learning_rate": 0.0008279045909207489,
+      "loss": 0.5441,
+      "step": 13710
+    },
+    {
+      "epoch": 1.7594254937163374,
+      "grad_norm": 1.0140107870101929,
+      "learning_rate": 0.000827049670855775,
+      "loss": 0.4956,
+      "step": 13720
+    },
+    {
+      "epoch": 1.7607078738137985,
+      "grad_norm": 0.4604131281375885,
+      "learning_rate": 0.0008261947507908011,
+      "loss": 0.51,
+      "step": 13730
+    },
+    {
+      "epoch": 1.7619902539112593,
+      "grad_norm": 0.9998058080673218,
+      "learning_rate": 0.0008253398307258272,
+      "loss": 0.5095,
+      "step": 13740
+    },
+    {
+      "epoch": 1.7632726340087201,
+      "grad_norm": 0.8125320076942444,
+      "learning_rate": 0.0008244849106608532,
+      "loss": 0.4844,
+      "step": 13750
+    },
+    {
+      "epoch": 1.7645550141061812,
+      "grad_norm": 2.0400047302246094,
+      "learning_rate": 0.0008236299905958793,
+      "loss": 0.521,
+      "step": 13760
+    },
+    {
+      "epoch": 1.765837394203642,
+      "grad_norm": 1.3145325183868408,
+      "learning_rate": 0.0008227750705309054,
+      "loss": 0.5762,
+      "step": 13770
+    },
+    {
+      "epoch": 1.767119774301103,
+      "grad_norm": 1.6746065616607666,
+      "learning_rate": 0.0008219201504659315,
+      "loss": 0.5174,
+      "step": 13780
+    },
+    {
+      "epoch": 1.7684021543985637,
+      "grad_norm": 1.6866681575775146,
+      "learning_rate": 0.0008210652304009575,
+      "loss": 0.4871,
+      "step": 13790
+    },
+    {
+      "epoch": 1.7696845344960246,
+      "grad_norm": 1.2878737449645996,
+      "learning_rate": 0.0008202103103359836,
+      "loss": 0.6269,
+      "step": 13800
+    },
+    {
+      "epoch": 1.7709669145934854,
+      "grad_norm": 1.4274048805236816,
+      "learning_rate": 0.0008193553902710096,
+      "loss": 0.5801,
+      "step": 13810
+    },
+    {
+      "epoch": 1.7722492946909463,
+      "grad_norm": 0.7440363168716431,
+      "learning_rate": 0.0008185004702060358,
+      "loss": 0.4935,
+      "step": 13820
+    },
+    {
+      "epoch": 1.7735316747884071,
+      "grad_norm": 0.7374436259269714,
+      "learning_rate": 0.0008176455501410618,
+      "loss": 0.4152,
+      "step": 13830
+    },
+    {
+      "epoch": 1.7748140548858682,
+      "grad_norm": 0.7930302619934082,
+      "learning_rate": 0.0008167906300760878,
+      "loss": 0.5713,
+      "step": 13840
+    },
+    {
+      "epoch": 1.776096434983329,
+      "grad_norm": 0.8752467036247253,
+      "learning_rate": 0.000815935710011114,
+      "loss": 0.5714,
+      "step": 13850
+    },
+    {
+      "epoch": 1.77737881508079,
+      "grad_norm": 0.8803384900093079,
+      "learning_rate": 0.0008150807899461401,
+      "loss": 0.4603,
+      "step": 13860
+    },
+    {
+      "epoch": 1.778661195178251,
+      "grad_norm": 0.8935397863388062,
+      "learning_rate": 0.0008142258698811661,
+      "loss": 0.432,
+      "step": 13870
+    },
+    {
+      "epoch": 1.7799435752757118,
+      "grad_norm": 1.1395505666732788,
+      "learning_rate": 0.0008133709498161922,
+      "loss": 0.5874,
+      "step": 13880
+    },
+    {
+      "epoch": 1.7812259553731726,
+      "grad_norm": 1.5835202932357788,
+      "learning_rate": 0.0008125160297512183,
+      "loss": 0.5225,
+      "step": 13890
+    },
+    {
+      "epoch": 1.7825083354706335,
+      "grad_norm": 0.9241839647293091,
+      "learning_rate": 0.0008116611096862444,
+      "loss": 0.4985,
+      "step": 13900
+    },
+    {
+      "epoch": 1.7837907155680943,
+      "grad_norm": 0.7671691179275513,
+      "learning_rate": 0.0008108061896212704,
+      "loss": 0.6306,
+      "step": 13910
+    },
+    {
+      "epoch": 1.7850730956655552,
+      "grad_norm": 0.9022935628890991,
+      "learning_rate": 0.0008099512695562965,
+      "loss": 0.5379,
+      "step": 13920
+    },
+    {
+      "epoch": 1.786355475763016,
+      "grad_norm": 1.420850157737732,
+      "learning_rate": 0.0008090963494913226,
+      "loss": 0.3985,
+      "step": 13930
+    },
+    {
+      "epoch": 1.787637855860477,
+      "grad_norm": 0.733504593372345,
+      "learning_rate": 0.0008082414294263487,
+      "loss": 0.7121,
+      "step": 13940
+    },
+    {
+      "epoch": 1.788920235957938,
+      "grad_norm": 1.4567188024520874,
+      "learning_rate": 0.0008073865093613747,
+      "loss": 0.6869,
+      "step": 13950
+    },
+    {
+      "epoch": 1.7902026160553988,
+      "grad_norm": 0.6763759255409241,
+      "learning_rate": 0.0008065315892964008,
+      "loss": 0.621,
+      "step": 13960
+    },
+    {
+      "epoch": 1.7914849961528598,
+      "grad_norm": 0.8437899947166443,
+      "learning_rate": 0.0008056766692314269,
+      "loss": 0.5373,
+      "step": 13970
+    },
+    {
+      "epoch": 1.7927673762503207,
+      "grad_norm": 0.4620661437511444,
+      "learning_rate": 0.0008048217491664529,
+      "loss": 0.5236,
+      "step": 13980
+    },
+    {
+      "epoch": 1.7940497563477815,
+      "grad_norm": 0.7685003280639648,
+      "learning_rate": 0.000803966829101479,
+      "loss": 0.4337,
+      "step": 13990
+    },
+    {
+      "epoch": 1.7953321364452424,
+      "grad_norm": 1.1188052892684937,
+      "learning_rate": 0.0008031119090365052,
+      "loss": 0.691,
+      "step": 14000
+    },
+    {
+      "epoch": 1.7966145165427032,
+      "grad_norm": 0.8180050849914551,
+      "learning_rate": 0.0008022569889715312,
+      "loss": 0.403,
+      "step": 14010
+    },
+    {
+      "epoch": 1.797896896640164,
+      "grad_norm": 0.6858202219009399,
+      "learning_rate": 0.0008014020689065572,
+      "loss": 0.4301,
+      "step": 14020
+    },
+    {
+      "epoch": 1.799179276737625,
+      "grad_norm": 0.5204628705978394,
+      "learning_rate": 0.0008005471488415833,
+      "loss": 0.5291,
+      "step": 14030
+    },
+    {
+      "epoch": 1.800461656835086,
+      "grad_norm": 1.3663321733474731,
+      "learning_rate": 0.0007996922287766094,
+      "loss": 0.5184,
+      "step": 14040
+    },
+    {
+      "epoch": 1.8017440369325468,
+      "grad_norm": 1.3524236679077148,
+      "learning_rate": 0.0007988373087116355,
+      "loss": 0.5009,
+      "step": 14050
+    },
+    {
+      "epoch": 1.8030264170300077,
+      "grad_norm": 0.8444858193397522,
+      "learning_rate": 0.0007979823886466615,
+      "loss": 0.5588,
+      "step": 14060
+    },
+    {
+      "epoch": 1.8043087971274687,
+      "grad_norm": 1.0178775787353516,
+      "learning_rate": 0.0007971274685816876,
+      "loss": 0.4677,
+      "step": 14070
+    },
+    {
+      "epoch": 1.8055911772249296,
+      "grad_norm": 0.7170798778533936,
+      "learning_rate": 0.0007962725485167137,
+      "loss": 0.6669,
+      "step": 14080
+    },
+    {
+      "epoch": 1.8068735573223904,
+      "grad_norm": 0.8029789328575134,
+      "learning_rate": 0.0007954176284517398,
+      "loss": 0.4776,
+      "step": 14090
+    },
+    {
+      "epoch": 1.8081559374198513,
+      "grad_norm": 1.1515179872512817,
+      "learning_rate": 0.0007945627083867658,
+      "loss": 0.3991,
+      "step": 14100
+    },
+    {
+      "epoch": 1.809438317517312,
+      "grad_norm": 1.060905933380127,
+      "learning_rate": 0.0007937077883217919,
+      "loss": 0.5106,
+      "step": 14110
+    },
+    {
+      "epoch": 1.810720697614773,
+      "grad_norm": 0.6174659132957458,
+      "learning_rate": 0.000792852868256818,
+      "loss": 0.5666,
+      "step": 14120
+    },
+    {
+      "epoch": 1.8120030777122338,
+      "grad_norm": 1.2170026302337646,
+      "learning_rate": 0.0007919979481918442,
+      "loss": 0.6201,
+      "step": 14130
+    },
+    {
+      "epoch": 1.8132854578096946,
+      "grad_norm": 1.1146901845932007,
+      "learning_rate": 0.0007911430281268701,
+      "loss": 0.4525,
+      "step": 14140
+    },
+    {
+      "epoch": 1.8145678379071557,
+      "grad_norm": 0.9617615342140198,
+      "learning_rate": 0.0007902881080618962,
+      "loss": 0.4363,
+      "step": 14150
+    },
+    {
+      "epoch": 1.8158502180046165,
+      "grad_norm": 0.9604726433753967,
+      "learning_rate": 0.0007894331879969224,
+      "loss": 0.4125,
+      "step": 14160
+    },
+    {
+      "epoch": 1.8171325981020776,
+      "grad_norm": 1.3785549402236938,
+      "learning_rate": 0.0007885782679319484,
+      "loss": 0.5336,
+      "step": 14170
+    },
+    {
+      "epoch": 1.8184149781995385,
+      "grad_norm": 1.3045930862426758,
+      "learning_rate": 0.0007877233478669744,
+      "loss": 0.6223,
+      "step": 14180
+    },
+    {
+      "epoch": 1.8196973582969993,
+      "grad_norm": 1.0294426679611206,
+      "learning_rate": 0.0007868684278020006,
+      "loss": 0.567,
+      "step": 14190
+    },
+    {
+      "epoch": 1.8209797383944601,
+      "grad_norm": 0.7799472808837891,
+      "learning_rate": 0.0007860135077370266,
+      "loss": 0.5708,
+      "step": 14200
+    },
+    {
+      "epoch": 1.822262118491921,
+      "grad_norm": 0.9570887684822083,
+      "learning_rate": 0.0007851585876720527,
+      "loss": 0.7051,
+      "step": 14210
+    },
+    {
+      "epoch": 1.8235444985893818,
+      "grad_norm": 1.0479843616485596,
+      "learning_rate": 0.0007843036676070788,
+      "loss": 0.5006,
+      "step": 14220
+    },
+    {
+      "epoch": 1.8248268786868427,
+      "grad_norm": 0.8999461531639099,
+      "learning_rate": 0.0007834487475421048,
+      "loss": 0.5086,
+      "step": 14230
+    },
+    {
+      "epoch": 1.8261092587843035,
+      "grad_norm": 0.9917542934417725,
+      "learning_rate": 0.0007825938274771309,
+      "loss": 0.7195,
+      "step": 14240
+    },
+    {
+      "epoch": 1.8273916388817646,
+      "grad_norm": 0.7102358341217041,
+      "learning_rate": 0.000781738907412157,
+      "loss": 0.3533,
+      "step": 14250
+    },
+    {
+      "epoch": 1.8286740189792254,
+      "grad_norm": 0.8428940176963806,
+      "learning_rate": 0.000780883987347183,
+      "loss": 0.47,
+      "step": 14260
+    },
+    {
+      "epoch": 1.8299563990766863,
+      "grad_norm": 1.4296576976776123,
+      "learning_rate": 0.0007800290672822091,
+      "loss": 0.4745,
+      "step": 14270
+    },
+    {
+      "epoch": 1.8312387791741473,
+      "grad_norm": 0.6871338486671448,
+      "learning_rate": 0.0007791741472172352,
+      "loss": 0.4092,
+      "step": 14280
+    },
+    {
+      "epoch": 1.8325211592716082,
+      "grad_norm": 1.1090123653411865,
+      "learning_rate": 0.0007783192271522614,
+      "loss": 0.6469,
+      "step": 14290
+    },
+    {
+      "epoch": 1.833803539369069,
+      "grad_norm": 0.88601154088974,
+      "learning_rate": 0.0007774643070872873,
+      "loss": 0.55,
+      "step": 14300
+    },
+    {
+      "epoch": 1.8350859194665299,
+      "grad_norm": 1.2753747701644897,
+      "learning_rate": 0.0007766093870223134,
+      "loss": 0.5905,
+      "step": 14310
+    },
+    {
+      "epoch": 1.8363682995639907,
+      "grad_norm": 1.068947196006775,
+      "learning_rate": 0.0007757544669573396,
+      "loss": 0.557,
+      "step": 14320
+    },
+    {
+      "epoch": 1.8376506796614516,
+      "grad_norm": 0.49127456545829773,
+      "learning_rate": 0.0007748995468923655,
+      "loss": 0.5636,
+      "step": 14330
+    },
+    {
+      "epoch": 1.8389330597589124,
+      "grad_norm": 0.5474888682365417,
+      "learning_rate": 0.0007740446268273916,
+      "loss": 0.5462,
+      "step": 14340
+    },
+    {
+      "epoch": 1.8402154398563735,
+      "grad_norm": 0.7848386168479919,
+      "learning_rate": 0.0007731897067624178,
+      "loss": 0.4884,
+      "step": 14350
+    },
+    {
+      "epoch": 1.8414978199538343,
+      "grad_norm": 1.106774091720581,
+      "learning_rate": 0.0007723347866974438,
+      "loss": 0.5223,
+      "step": 14360
+    },
+    {
+      "epoch": 1.8427802000512952,
+      "grad_norm": 1.2404162883758545,
+      "learning_rate": 0.0007714798666324698,
+      "loss": 0.5551,
+      "step": 14370
+    },
+    {
+      "epoch": 1.8440625801487562,
+      "grad_norm": 1.1383230686187744,
+      "learning_rate": 0.000770624946567496,
+      "loss": 0.4654,
+      "step": 14380
+    },
+    {
+      "epoch": 1.845344960246217,
+      "grad_norm": 0.90556800365448,
+      "learning_rate": 0.000769770026502522,
+      "loss": 0.5061,
+      "step": 14390
+    },
+    {
+      "epoch": 1.846627340343678,
+      "grad_norm": 0.922673761844635,
+      "learning_rate": 0.0007689151064375481,
+      "loss": 0.5311,
+      "step": 14400
+    },
+    {
+      "epoch": 1.8479097204411388,
+      "grad_norm": 1.4559677839279175,
+      "learning_rate": 0.0007680601863725742,
+      "loss": 0.5076,
+      "step": 14410
+    },
+    {
+      "epoch": 1.8491921005385996,
+      "grad_norm": 0.6838889718055725,
+      "learning_rate": 0.0007672052663076002,
+      "loss": 0.6319,
+      "step": 14420
+    },
+    {
+      "epoch": 1.8504744806360605,
+      "grad_norm": 0.5238109230995178,
+      "learning_rate": 0.0007663503462426263,
+      "loss": 0.3426,
+      "step": 14430
+    },
+    {
+      "epoch": 1.8517568607335213,
+      "grad_norm": 0.9826338887214661,
+      "learning_rate": 0.0007654954261776525,
+      "loss": 0.6484,
+      "step": 14440
+    },
+    {
+      "epoch": 1.8530392408309821,
+      "grad_norm": 0.6289449334144592,
+      "learning_rate": 0.0007646405061126785,
+      "loss": 0.5969,
+      "step": 14450
+    },
+    {
+      "epoch": 1.8543216209284432,
+      "grad_norm": 0.7887721657752991,
+      "learning_rate": 0.0007637855860477045,
+      "loss": 0.6056,
+      "step": 14460
+    },
+    {
+      "epoch": 1.855604001025904,
+      "grad_norm": 0.9152905344963074,
+      "learning_rate": 0.0007629306659827306,
+      "loss": 0.568,
+      "step": 14470
+    },
+    {
+      "epoch": 1.8568863811233651,
+      "grad_norm": 1.7868821620941162,
+      "learning_rate": 0.0007620757459177568,
+      "loss": 0.5455,
+      "step": 14480
+    },
+    {
+      "epoch": 1.858168761220826,
+      "grad_norm": 0.6593843102455139,
+      "learning_rate": 0.0007612208258527827,
+      "loss": 0.5395,
+      "step": 14490
+    },
+    {
+      "epoch": 1.8594511413182868,
+      "grad_norm": 1.7809274196624756,
+      "learning_rate": 0.0007603659057878088,
+      "loss": 0.5335,
+      "step": 14500
+    },
+    {
+      "epoch": 1.8607335214157477,
+      "grad_norm": 0.5800178050994873,
+      "learning_rate": 0.000759510985722835,
+      "loss": 0.5194,
+      "step": 14510
+    },
+    {
+      "epoch": 1.8620159015132085,
+      "grad_norm": 1.2198340892791748,
+      "learning_rate": 0.000758656065657861,
+      "loss": 0.5906,
+      "step": 14520
+    },
+    {
+      "epoch": 1.8632982816106693,
+      "grad_norm": 0.9660494327545166,
+      "learning_rate": 0.000757801145592887,
+      "loss": 0.6645,
+      "step": 14530
+    },
+    {
+      "epoch": 1.8645806617081302,
+      "grad_norm": 0.9636703729629517,
+      "learning_rate": 0.0007569462255279132,
+      "loss": 0.4877,
+      "step": 14540
+    },
+    {
+      "epoch": 1.865863041805591,
+      "grad_norm": 1.3353137969970703,
+      "learning_rate": 0.0007560913054629392,
+      "loss": 0.5084,
+      "step": 14550
+    },
+    {
+      "epoch": 1.867145421903052,
+      "grad_norm": 0.5856647491455078,
+      "learning_rate": 0.0007552363853979653,
+      "loss": 0.4878,
+      "step": 14560
+    },
+    {
+      "epoch": 1.868427802000513,
+      "grad_norm": 0.8503313064575195,
+      "learning_rate": 0.0007543814653329914,
+      "loss": 0.522,
+      "step": 14570
+    },
+    {
+      "epoch": 1.8697101820979738,
+      "grad_norm": 0.700071394443512,
+      "learning_rate": 0.0007535265452680175,
+      "loss": 0.6389,
+      "step": 14580
+    },
+    {
+      "epoch": 1.8709925621954349,
+      "grad_norm": 0.879586398601532,
+      "learning_rate": 0.0007526716252030435,
+      "loss": 0.5235,
+      "step": 14590
+    },
+    {
+      "epoch": 1.8722749422928957,
+      "grad_norm": 0.9506930708885193,
+      "learning_rate": 0.0007518167051380697,
+      "loss": 0.4805,
+      "step": 14600
+    },
+    {
+      "epoch": 1.8735573223903565,
+      "grad_norm": 1.2647199630737305,
+      "learning_rate": 0.0007509617850730957,
+      "loss": 0.5111,
+      "step": 14610
+    },
+    {
+      "epoch": 1.8748397024878174,
+      "grad_norm": 0.6026537418365479,
+      "learning_rate": 0.0007501068650081217,
+      "loss": 0.3779,
+      "step": 14620
+    },
+    {
+      "epoch": 1.8761220825852782,
+      "grad_norm": 1.2891452312469482,
+      "learning_rate": 0.0007492519449431479,
+      "loss": 0.5285,
+      "step": 14630
+    },
+    {
+      "epoch": 1.877404462682739,
+      "grad_norm": 0.638653039932251,
+      "learning_rate": 0.000748397024878174,
+      "loss": 0.6281,
+      "step": 14640
+    },
+    {
+      "epoch": 1.8786868427802,
+      "grad_norm": 0.8396057486534119,
+      "learning_rate": 0.0007475421048131999,
+      "loss": 0.5179,
+      "step": 14650
+    },
+    {
+      "epoch": 1.8799692228776608,
+      "grad_norm": 0.5984233021736145,
+      "learning_rate": 0.000746687184748226,
+      "loss": 0.3829,
+      "step": 14660
+    },
+    {
+      "epoch": 1.8812516029751218,
+      "grad_norm": 0.9279236793518066,
+      "learning_rate": 0.0007458322646832522,
+      "loss": 0.4392,
+      "step": 14670
+    },
+    {
+      "epoch": 1.8825339830725827,
+      "grad_norm": 0.736960768699646,
+      "learning_rate": 0.0007449773446182781,
+      "loss": 0.5099,
+      "step": 14680
+    },
+    {
+      "epoch": 1.8838163631700438,
+      "grad_norm": 2.048767566680908,
+      "learning_rate": 0.0007441224245533042,
+      "loss": 0.5781,
+      "step": 14690
+    },
+    {
+      "epoch": 1.8850987432675046,
+      "grad_norm": 0.7400988340377808,
+      "learning_rate": 0.0007432675044883304,
+      "loss": 0.4852,
+      "step": 14700
+    },
+    {
+      "epoch": 1.8863811233649654,
+      "grad_norm": 1.009475827217102,
+      "learning_rate": 0.0007424125844233565,
+      "loss": 0.5739,
+      "step": 14710
+    },
+    {
+      "epoch": 1.8876635034624263,
+      "grad_norm": 0.7888931035995483,
+      "learning_rate": 0.0007415576643583824,
+      "loss": 0.6543,
+      "step": 14720
+    },
+    {
+      "epoch": 1.8889458835598871,
+      "grad_norm": 1.3015084266662598,
+      "learning_rate": 0.0007407027442934086,
+      "loss": 0.673,
+      "step": 14730
+    },
+    {
+      "epoch": 1.890228263657348,
+      "grad_norm": 0.8470888137817383,
+      "learning_rate": 0.0007398478242284347,
+      "loss": 0.5653,
+      "step": 14740
+    },
+    {
+      "epoch": 1.8915106437548088,
+      "grad_norm": 1.296543002128601,
+      "learning_rate": 0.0007389929041634607,
+      "loss": 0.4443,
+      "step": 14750
+    },
+    {
+      "epoch": 1.8927930238522697,
+      "grad_norm": 0.8180189728736877,
+      "learning_rate": 0.0007381379840984868,
+      "loss": 0.5177,
+      "step": 14760
+    },
+    {
+      "epoch": 1.8940754039497307,
+      "grad_norm": 1.1298378705978394,
+      "learning_rate": 0.0007372830640335129,
+      "loss": 0.4627,
+      "step": 14770
+    },
+    {
+      "epoch": 1.8953577840471916,
+      "grad_norm": 1.1105875968933105,
+      "learning_rate": 0.0007364281439685389,
+      "loss": 0.6339,
+      "step": 14780
+    },
+    {
+      "epoch": 1.8966401641446526,
+      "grad_norm": 0.8860158324241638,
+      "learning_rate": 0.0007355732239035651,
+      "loss": 0.4824,
+      "step": 14790
+    },
+    {
+      "epoch": 1.8979225442421135,
+      "grad_norm": 1.0457231998443604,
+      "learning_rate": 0.0007347183038385911,
+      "loss": 0.625,
+      "step": 14800
+    },
+    {
+      "epoch": 1.8992049243395743,
+      "grad_norm": 1.6747428178787231,
+      "learning_rate": 0.0007338633837736171,
+      "loss": 0.6475,
+      "step": 14810
+    },
+    {
+      "epoch": 1.9004873044370352,
+      "grad_norm": 0.7799145579338074,
+      "learning_rate": 0.0007330084637086433,
+      "loss": 0.6254,
+      "step": 14820
+    },
+    {
+      "epoch": 1.901769684534496,
+      "grad_norm": 0.7187017798423767,
+      "learning_rate": 0.0007321535436436694,
+      "loss": 0.5614,
+      "step": 14830
+    },
+    {
+      "epoch": 1.9030520646319569,
+      "grad_norm": 1.1263870000839233,
+      "learning_rate": 0.0007312986235786953,
+      "loss": 0.5503,
+      "step": 14840
+    },
+    {
+      "epoch": 1.9043344447294177,
+      "grad_norm": 0.8937992453575134,
+      "learning_rate": 0.0007304437035137215,
+      "loss": 0.6243,
+      "step": 14850
+    },
+    {
+      "epoch": 1.9056168248268786,
+      "grad_norm": 0.9904497861862183,
+      "learning_rate": 0.0007295887834487476,
+      "loss": 0.616,
+      "step": 14860
+    },
+    {
+      "epoch": 1.9068992049243396,
+      "grad_norm": 1.0227149724960327,
+      "learning_rate": 0.0007287338633837737,
+      "loss": 0.4517,
+      "step": 14870
+    },
+    {
+      "epoch": 1.9081815850218005,
+      "grad_norm": 0.5922604203224182,
+      "learning_rate": 0.0007278789433187996,
+      "loss": 0.3924,
+      "step": 14880
+    },
+    {
+      "epoch": 1.9094639651192613,
+      "grad_norm": 0.9521912336349487,
+      "learning_rate": 0.0007270240232538258,
+      "loss": 0.3611,
+      "step": 14890
+    },
+    {
+      "epoch": 1.9107463452167224,
+      "grad_norm": 1.0910950899124146,
+      "learning_rate": 0.0007261691031888519,
+      "loss": 0.5036,
+      "step": 14900
+    },
+    {
+      "epoch": 1.9120287253141832,
+      "grad_norm": 0.8863834738731384,
+      "learning_rate": 0.000725314183123878,
+      "loss": 0.5765,
+      "step": 14910
+    },
+    {
+      "epoch": 1.913311105411644,
+      "grad_norm": 0.6470763683319092,
+      "learning_rate": 0.000724459263058904,
+      "loss": 0.518,
+      "step": 14920
+    },
+    {
+      "epoch": 1.914593485509105,
+      "grad_norm": 1.0323649644851685,
+      "learning_rate": 0.0007236043429939301,
+      "loss": 0.4739,
+      "step": 14930
+    },
+    {
+      "epoch": 1.9158758656065658,
+      "grad_norm": 1.0393568277359009,
+      "learning_rate": 0.0007227494229289561,
+      "loss": 0.5144,
+      "step": 14940
+    },
+    {
+      "epoch": 1.9171582457040266,
+      "grad_norm": 0.9331060647964478,
+      "learning_rate": 0.0007218945028639823,
+      "loss": 0.4523,
+      "step": 14950
+    },
+    {
+      "epoch": 1.9184406258014874,
+      "grad_norm": 0.44560134410858154,
+      "learning_rate": 0.0007210395827990083,
+      "loss": 0.5049,
+      "step": 14960
+    },
+    {
+      "epoch": 1.9197230058989483,
+      "grad_norm": 0.3747738003730774,
+      "learning_rate": 0.0007201846627340343,
+      "loss": 0.5326,
+      "step": 14970
+    },
+    {
+      "epoch": 1.9210053859964094,
+      "grad_norm": 1.22909414768219,
+      "learning_rate": 0.0007193297426690605,
+      "loss": 0.6298,
+      "step": 14980
+    },
+    {
+      "epoch": 1.9222877660938702,
+      "grad_norm": 0.871557354927063,
+      "learning_rate": 0.0007184748226040866,
+      "loss": 0.4603,
+      "step": 14990
+    },
+    {
+      "epoch": 1.9235701461913313,
+      "grad_norm": 0.933385968208313,
+      "learning_rate": 0.0007176199025391125,
+      "loss": 0.4374,
+      "step": 15000
+    },
+    {
+      "epoch": 1.924852526288792,
+      "grad_norm": 1.254412293434143,
+      "learning_rate": 0.0007167649824741387,
+      "loss": 0.5107,
+      "step": 15010
+    },
+    {
+      "epoch": 1.926134906386253,
+      "grad_norm": 0.7056450247764587,
+      "learning_rate": 0.0007159100624091648,
+      "loss": 0.6199,
+      "step": 15020
+    },
+    {
+      "epoch": 1.9274172864837138,
+      "grad_norm": 1.064945936203003,
+      "learning_rate": 0.0007150551423441908,
+      "loss": 0.6342,
+      "step": 15030
+    },
+    {
+      "epoch": 1.9286996665811746,
+      "grad_norm": 1.5574430227279663,
+      "learning_rate": 0.0007142002222792169,
+      "loss": 0.452,
+      "step": 15040
+    },
+    {
+      "epoch": 1.9299820466786355,
+      "grad_norm": 1.3377269506454468,
+      "learning_rate": 0.000713345302214243,
+      "loss": 0.6947,
+      "step": 15050
+    },
+    {
+      "epoch": 1.9312644267760963,
+      "grad_norm": 2.000349760055542,
+      "learning_rate": 0.0007124903821492691,
+      "loss": 0.555,
+      "step": 15060
+    },
+    {
+      "epoch": 1.9325468068735572,
+      "grad_norm": 1.7576501369476318,
+      "learning_rate": 0.000711635462084295,
+      "loss": 0.5503,
+      "step": 15070
+    },
+    {
+      "epoch": 1.9338291869710182,
+      "grad_norm": 0.6069478392601013,
+      "learning_rate": 0.0007107805420193212,
+      "loss": 0.4408,
+      "step": 15080
+    },
+    {
+      "epoch": 1.935111567068479,
+      "grad_norm": 0.8294945955276489,
+      "learning_rate": 0.0007099256219543473,
+      "loss": 0.4914,
+      "step": 15090
+    },
+    {
+      "epoch": 1.93639394716594,
+      "grad_norm": 0.6512126922607422,
+      "learning_rate": 0.0007090707018893733,
+      "loss": 0.5977,
+      "step": 15100
+    },
+    {
+      "epoch": 1.937676327263401,
+      "grad_norm": 0.736539363861084,
+      "learning_rate": 0.0007082157818243994,
+      "loss": 0.4151,
+      "step": 15110
+    },
+    {
+      "epoch": 1.9389587073608618,
+      "grad_norm": 0.33729881048202515,
+      "learning_rate": 0.0007073608617594255,
+      "loss": 0.5454,
+      "step": 15120
+    },
+    {
+      "epoch": 1.9402410874583227,
+      "grad_norm": 0.603800356388092,
+      "learning_rate": 0.0007065059416944515,
+      "loss": 0.3752,
+      "step": 15130
+    },
+    {
+      "epoch": 1.9415234675557835,
+      "grad_norm": 1.2846564054489136,
+      "learning_rate": 0.0007056510216294777,
+      "loss": 0.4826,
+      "step": 15140
+    },
+    {
+      "epoch": 1.9428058476532444,
+      "grad_norm": 0.5370314717292786,
+      "learning_rate": 0.0007047961015645037,
+      "loss": 0.4963,
+      "step": 15150
+    },
+    {
+      "epoch": 1.9440882277507052,
+      "grad_norm": 1.2183728218078613,
+      "learning_rate": 0.0007039411814995298,
+      "loss": 0.6168,
+      "step": 15160
+    },
+    {
+      "epoch": 1.945370607848166,
+      "grad_norm": 1.1323776245117188,
+      "learning_rate": 0.0007030862614345559,
+      "loss": 0.5229,
+      "step": 15170
+    },
+    {
+      "epoch": 1.9466529879456271,
+      "grad_norm": 0.6309476494789124,
+      "learning_rate": 0.000702231341369582,
+      "loss": 0.5992,
+      "step": 15180
+    },
+    {
+      "epoch": 1.947935368043088,
+      "grad_norm": 1.0059658288955688,
+      "learning_rate": 0.000701376421304608,
+      "loss": 0.6053,
+      "step": 15190
+    },
+    {
+      "epoch": 1.9492177481405488,
+      "grad_norm": 1.3484851121902466,
+      "learning_rate": 0.0007005215012396341,
+      "loss": 0.6799,
+      "step": 15200
+    },
+    {
+      "epoch": 1.95050012823801,
+      "grad_norm": 1.7294602394104004,
+      "learning_rate": 0.0006996665811746602,
+      "loss": 0.5543,
+      "step": 15210
+    },
+    {
+      "epoch": 1.9517825083354707,
+      "grad_norm": 0.3680081367492676,
+      "learning_rate": 0.0006988116611096863,
+      "loss": 0.6018,
+      "step": 15220
+    },
+    {
+      "epoch": 1.9530648884329316,
+      "grad_norm": 0.649849534034729,
+      "learning_rate": 0.0006979567410447123,
+      "loss": 0.4314,
+      "step": 15230
+    },
+    {
+      "epoch": 1.9543472685303924,
+      "grad_norm": 1.2836802005767822,
+      "learning_rate": 0.0006971018209797384,
+      "loss": 0.5788,
+      "step": 15240
+    },
+    {
+      "epoch": 1.9556296486278533,
+      "grad_norm": 0.961693525314331,
+      "learning_rate": 0.0006962469009147645,
+      "loss": 0.4917,
+      "step": 15250
+    },
+    {
+      "epoch": 1.9569120287253141,
+      "grad_norm": 0.6490185856819153,
+      "learning_rate": 0.0006953919808497906,
+      "loss": 0.5938,
+      "step": 15260
+    },
+    {
+      "epoch": 1.958194408822775,
+      "grad_norm": 1.116169810295105,
+      "learning_rate": 0.0006945370607848166,
+      "loss": 0.4643,
+      "step": 15270
+    },
+    {
+      "epoch": 1.9594767889202358,
+      "grad_norm": 0.6350299715995789,
+      "learning_rate": 0.0006936821407198427,
+      "loss": 0.3787,
+      "step": 15280
+    },
+    {
+      "epoch": 1.9607591690176969,
+      "grad_norm": 1.2164093255996704,
+      "learning_rate": 0.0006928272206548689,
+      "loss": 0.5109,
+      "step": 15290
+    },
+    {
+      "epoch": 1.9620415491151577,
+      "grad_norm": 0.6592891812324524,
+      "learning_rate": 0.0006919723005898949,
+      "loss": 0.4356,
+      "step": 15300
+    },
+    {
+      "epoch": 1.9633239292126188,
+      "grad_norm": 0.9608144760131836,
+      "learning_rate": 0.0006911173805249209,
+      "loss": 0.4848,
+      "step": 15310
+    },
+    {
+      "epoch": 1.9646063093100796,
+      "grad_norm": 1.3843706846237183,
+      "learning_rate": 0.000690262460459947,
+      "loss": 0.6653,
+      "step": 15320
+    },
+    {
+      "epoch": 1.9658886894075405,
+      "grad_norm": 0.7894043922424316,
+      "learning_rate": 0.0006894075403949731,
+      "loss": 0.5874,
+      "step": 15330
+    },
+    {
+      "epoch": 1.9671710695050013,
+      "grad_norm": 0.7226264476776123,
+      "learning_rate": 0.0006885526203299992,
+      "loss": 0.4726,
+      "step": 15340
+    },
+    {
+      "epoch": 1.9684534496024622,
+      "grad_norm": 1.4548835754394531,
+      "learning_rate": 0.0006876977002650252,
+      "loss": 0.525,
+      "step": 15350
+    },
+    {
+      "epoch": 1.969735829699923,
+      "grad_norm": 0.6473925709724426,
+      "learning_rate": 0.0006868427802000513,
+      "loss": 0.4194,
+      "step": 15360
+    },
+    {
+      "epoch": 1.9710182097973838,
+      "grad_norm": 0.42092105746269226,
+      "learning_rate": 0.0006859878601350774,
+      "loss": 0.3743,
+      "step": 15370
+    },
+    {
+      "epoch": 1.9723005898948447,
+      "grad_norm": 0.8969188332557678,
+      "learning_rate": 0.0006851329400701034,
+      "loss": 0.5525,
+      "step": 15380
+    },
+    {
+      "epoch": 1.9735829699923058,
+      "grad_norm": 0.8764629364013672,
+      "learning_rate": 0.0006842780200051295,
+      "loss": 0.6307,
+      "step": 15390
+    },
+    {
+      "epoch": 1.9748653500897666,
+      "grad_norm": 0.4493338167667389,
+      "learning_rate": 0.0006834230999401556,
+      "loss": 0.5157,
+      "step": 15400
+    },
+    {
+      "epoch": 1.9761477301872274,
+      "grad_norm": 1.2919282913208008,
+      "learning_rate": 0.0006825681798751817,
+      "loss": 0.6005,
+      "step": 15410
+    },
+    {
+      "epoch": 1.9774301102846885,
+      "grad_norm": 0.78176349401474,
+      "learning_rate": 0.0006817132598102077,
+      "loss": 0.4165,
+      "step": 15420
+    },
+    {
+      "epoch": 1.9787124903821494,
+      "grad_norm": 0.7286581993103027,
+      "learning_rate": 0.0006808583397452338,
+      "loss": 0.481,
+      "step": 15430
+    },
+    {
+      "epoch": 1.9799948704796102,
+      "grad_norm": 0.9931614995002747,
+      "learning_rate": 0.0006800034196802599,
+      "loss": 0.5327,
+      "step": 15440
+    },
+    {
+      "epoch": 1.981277250577071,
+      "grad_norm": 0.9504096508026123,
+      "learning_rate": 0.0006791484996152861,
+      "loss": 0.5105,
+      "step": 15450
+    },
+    {
+      "epoch": 1.982559630674532,
+      "grad_norm": 1.473724365234375,
+      "learning_rate": 0.000678293579550312,
+      "loss": 0.5,
+      "step": 15460
+    },
+    {
+      "epoch": 1.9838420107719927,
+      "grad_norm": 0.9803527593612671,
+      "learning_rate": 0.0006774386594853381,
+      "loss": 0.4608,
+      "step": 15470
+    },
+    {
+      "epoch": 1.9851243908694536,
+      "grad_norm": 0.7079563736915588,
+      "learning_rate": 0.0006765837394203643,
+      "loss": 0.3944,
+      "step": 15480
+    },
+    {
+      "epoch": 1.9864067709669146,
+      "grad_norm": 1.4155352115631104,
+      "learning_rate": 0.0006757288193553903,
+      "loss": 0.5133,
+      "step": 15490
+    },
+    {
+      "epoch": 1.9876891510643755,
+      "grad_norm": 1.1894326210021973,
+      "learning_rate": 0.0006748738992904163,
+      "loss": 0.4759,
+      "step": 15500
+    },
+    {
+      "epoch": 1.9889715311618363,
+      "grad_norm": 0.5845767259597778,
+      "learning_rate": 0.0006740189792254425,
+      "loss": 0.4065,
+      "step": 15510
+    },
+    {
+      "epoch": 1.9902539112592974,
+      "grad_norm": 0.3843328654766083,
+      "learning_rate": 0.0006731640591604685,
+      "loss": 0.3548,
+      "step": 15520
+    },
+    {
+      "epoch": 1.9915362913567582,
+      "grad_norm": 1.3628671169281006,
+      "learning_rate": 0.0006723091390954946,
+      "loss": 0.3994,
+      "step": 15530
+    },
+    {
+      "epoch": 1.992818671454219,
+      "grad_norm": 0.7082588076591492,
+      "learning_rate": 0.0006714542190305206,
+      "loss": 0.3953,
+      "step": 15540
+    },
+    {
+      "epoch": 1.99410105155168,
+      "grad_norm": 0.56044602394104,
+      "learning_rate": 0.0006705992989655467,
+      "loss": 0.4706,
+      "step": 15550
+    },
+    {
+      "epoch": 1.9953834316491408,
+      "grad_norm": 0.6746466159820557,
+      "learning_rate": 0.0006697443789005728,
+      "loss": 0.4086,
+      "step": 15560
+    },
+    {
+      "epoch": 1.9966658117466016,
+      "grad_norm": 0.8921716213226318,
+      "learning_rate": 0.000668889458835599,
+      "loss": 0.4959,
+      "step": 15570
+    },
+    {
+      "epoch": 1.9979481918440625,
+      "grad_norm": 1.0937660932540894,
+      "learning_rate": 0.0006680345387706249,
+      "loss": 0.4432,
+      "step": 15580
+    },
+    {
+      "epoch": 1.9992305719415233,
+      "grad_norm": 0.7332781553268433,
+      "learning_rate": 0.000667179618705651,
+      "loss": 0.4807,
+      "step": 15590
+    },
+    {
+      "epoch": 2.000512952038984,
+      "grad_norm": 0.775030791759491,
+      "learning_rate": 0.0006663246986406771,
+      "loss": 0.4139,
+      "step": 15600
+    },
+    {
+      "epoch": 2.0017953321364454,
+      "grad_norm": 0.6231206059455872,
+      "learning_rate": 0.0006654697785757033,
+      "loss": 0.5095,
+      "step": 15610
+    },
+    {
+      "epoch": 2.0030777122339063,
+      "grad_norm": 0.7950479388237,
+      "learning_rate": 0.0006646148585107292,
+      "loss": 0.4397,
+      "step": 15620
+    },
+    {
+      "epoch": 2.004360092331367,
+      "grad_norm": 0.970693051815033,
+      "learning_rate": 0.0006637599384457553,
+      "loss": 0.4875,
+      "step": 15630
+    },
+    {
+      "epoch": 2.005642472428828,
+      "grad_norm": 0.5207669138908386,
+      "learning_rate": 0.0006629050183807815,
+      "loss": 0.4713,
+      "step": 15640
+    },
+    {
+      "epoch": 2.006924852526289,
+      "grad_norm": 0.8894481062889099,
+      "learning_rate": 0.0006620500983158075,
+      "loss": 0.3892,
+      "step": 15650
+    },
+    {
+      "epoch": 2.0082072326237497,
+      "grad_norm": 0.9765975475311279,
+      "learning_rate": 0.0006611951782508335,
+      "loss": 0.5046,
+      "step": 15660
+    },
+    {
+      "epoch": 2.0094896127212105,
+      "grad_norm": 0.6186564564704895,
+      "learning_rate": 0.0006603402581858597,
+      "loss": 0.435,
+      "step": 15670
+    },
+    {
+      "epoch": 2.0107719928186714,
+      "grad_norm": 1.0802409648895264,
+      "learning_rate": 0.0006594853381208857,
+      "loss": 0.379,
+      "step": 15680
+    },
+    {
+      "epoch": 2.012054372916132,
+      "grad_norm": 0.519303560256958,
+      "learning_rate": 0.0006586304180559118,
+      "loss": 0.3067,
+      "step": 15690
+    },
+    {
+      "epoch": 2.013336753013593,
+      "grad_norm": 0.6543425917625427,
+      "learning_rate": 0.0006577754979909379,
+      "loss": 0.4907,
+      "step": 15700
+    },
+    {
+      "epoch": 2.0146191331110543,
+      "grad_norm": 1.0013840198516846,
+      "learning_rate": 0.0006569205779259639,
+      "loss": 0.3811,
+      "step": 15710
+    },
+    {
+      "epoch": 2.015901513208515,
+      "grad_norm": 1.0863186120986938,
+      "learning_rate": 0.00065606565786099,
+      "loss": 0.4327,
+      "step": 15720
+    },
+    {
+      "epoch": 2.017183893305976,
+      "grad_norm": 0.8166930079460144,
+      "learning_rate": 0.000655210737796016,
+      "loss": 0.4129,
+      "step": 15730
+    },
+    {
+      "epoch": 2.018466273403437,
+      "grad_norm": 0.8001251220703125,
+      "learning_rate": 0.0006543558177310422,
+      "loss": 0.3394,
+      "step": 15740
+    },
+    {
+      "epoch": 2.0197486535008977,
+      "grad_norm": 1.3382858037948608,
+      "learning_rate": 0.0006535008976660682,
+      "loss": 0.4998,
+      "step": 15750
+    },
+    {
+      "epoch": 2.0210310335983586,
+      "grad_norm": 0.8801462054252625,
+      "learning_rate": 0.0006526459776010944,
+      "loss": 0.4464,
+      "step": 15760
+    },
+    {
+      "epoch": 2.0223134136958194,
+      "grad_norm": 0.940180778503418,
+      "learning_rate": 0.0006517910575361204,
+      "loss": 0.4152,
+      "step": 15770
+    },
+    {
+      "epoch": 2.0235957937932803,
+      "grad_norm": 0.6335304379463196,
+      "learning_rate": 0.0006509361374711464,
+      "loss": 0.3804,
+      "step": 15780
+    },
+    {
+      "epoch": 2.024878173890741,
+      "grad_norm": 0.5638919472694397,
+      "learning_rate": 0.0006500812174061725,
+      "loss": 0.4404,
+      "step": 15790
+    },
+    {
+      "epoch": 2.026160553988202,
+      "grad_norm": 1.3646224737167358,
+      "learning_rate": 0.0006492262973411987,
+      "loss": 0.4917,
+      "step": 15800
+    },
+    {
+      "epoch": 2.027442934085663,
+      "grad_norm": 0.39091867208480835,
+      "learning_rate": 0.0006483713772762246,
+      "loss": 0.4169,
+      "step": 15810
+    },
+    {
+      "epoch": 2.028725314183124,
+      "grad_norm": 1.3595271110534668,
+      "learning_rate": 0.0006475164572112507,
+      "loss": 0.4892,
+      "step": 15820
+    },
+    {
+      "epoch": 2.030007694280585,
+      "grad_norm": 0.7239012718200684,
+      "learning_rate": 0.0006466615371462769,
+      "loss": 0.4574,
+      "step": 15830
+    },
+    {
+      "epoch": 2.0312900743780458,
+      "grad_norm": 1.1888518333435059,
+      "learning_rate": 0.0006458066170813029,
+      "loss": 0.3847,
+      "step": 15840
+    },
+    {
+      "epoch": 2.0325724544755066,
+      "grad_norm": 0.48686522245407104,
+      "learning_rate": 0.0006449516970163289,
+      "loss": 0.2997,
+      "step": 15850
+    },
+    {
+      "epoch": 2.0338548345729675,
+      "grad_norm": 0.963004469871521,
+      "learning_rate": 0.0006440967769513551,
+      "loss": 0.4397,
+      "step": 15860
+    },
+    {
+      "epoch": 2.0351372146704283,
+      "grad_norm": 0.45735833048820496,
+      "learning_rate": 0.0006432418568863811,
+      "loss": 0.461,
+      "step": 15870
+    },
+    {
+      "epoch": 2.036419594767889,
+      "grad_norm": 1.019104242324829,
+      "learning_rate": 0.0006423869368214072,
+      "loss": 0.4187,
+      "step": 15880
+    },
+    {
+      "epoch": 2.03770197486535,
+      "grad_norm": 0.6047408580780029,
+      "learning_rate": 0.0006415320167564333,
+      "loss": 0.4521,
+      "step": 15890
+    },
+    {
+      "epoch": 2.038984354962811,
+      "grad_norm": 1.1490784883499146,
+      "learning_rate": 0.0006406770966914594,
+      "loss": 0.378,
+      "step": 15900
+    },
+    {
+      "epoch": 2.0402667350602717,
+      "grad_norm": 1.2890042066574097,
+      "learning_rate": 0.0006398221766264854,
+      "loss": 0.3913,
+      "step": 15910
+    },
+    {
+      "epoch": 2.041549115157733,
+      "grad_norm": 0.7499234676361084,
+      "learning_rate": 0.0006389672565615116,
+      "loss": 0.3937,
+      "step": 15920
+    },
+    {
+      "epoch": 2.042831495255194,
+      "grad_norm": 0.600645899772644,
+      "learning_rate": 0.0006381123364965376,
+      "loss": 0.3343,
+      "step": 15930
+    },
+    {
+      "epoch": 2.0441138753526547,
+      "grad_norm": 1.029549479484558,
+      "learning_rate": 0.0006372574164315636,
+      "loss": 0.4275,
+      "step": 15940
+    },
+    {
+      "epoch": 2.0453962554501155,
+      "grad_norm": 1.660400629043579,
+      "learning_rate": 0.0006364024963665898,
+      "loss": 0.5022,
+      "step": 15950
+    },
+    {
+      "epoch": 2.0466786355475763,
+      "grad_norm": 0.932639479637146,
+      "learning_rate": 0.0006355475763016159,
+      "loss": 0.3318,
+      "step": 15960
+    },
+    {
+      "epoch": 2.047961015645037,
+      "grad_norm": 0.5352082252502441,
+      "learning_rate": 0.0006346926562366418,
+      "loss": 0.459,
+      "step": 15970
+    },
+    {
+      "epoch": 2.049243395742498,
+      "grad_norm": 0.41553980112075806,
+      "learning_rate": 0.000633837736171668,
+      "loss": 0.4066,
+      "step": 15980
+    },
+    {
+      "epoch": 2.050525775839959,
+      "grad_norm": 0.6084936261177063,
+      "learning_rate": 0.0006329828161066941,
+      "loss": 0.3279,
+      "step": 15990
+    },
+    {
+      "epoch": 2.0518081559374197,
+      "grad_norm": 1.441450834274292,
+      "learning_rate": 0.0006321278960417201,
+      "loss": 0.4136,
+      "step": 16000
+    },
+    {
+      "epoch": 2.0530905360348806,
+      "grad_norm": 0.9884285926818848,
+      "learning_rate": 0.0006312729759767461,
+      "loss": 0.3913,
+      "step": 16010
+    },
+    {
+      "epoch": 2.0543729161323414,
+      "grad_norm": 1.6738002300262451,
+      "learning_rate": 0.0006304180559117723,
+      "loss": 0.4175,
+      "step": 16020
+    },
+    {
+      "epoch": 2.0556552962298027,
+      "grad_norm": 1.0428452491760254,
+      "learning_rate": 0.0006295631358467984,
+      "loss": 0.4279,
+      "step": 16030
+    },
+    {
+      "epoch": 2.0569376763272635,
+      "grad_norm": 1.673563838005066,
+      "learning_rate": 0.0006287082157818244,
+      "loss": 0.4554,
+      "step": 16040
+    },
+    {
+      "epoch": 2.0582200564247244,
+      "grad_norm": 0.5701791048049927,
+      "learning_rate": 0.0006278532957168505,
+      "loss": 0.3846,
+      "step": 16050
+    },
+    {
+      "epoch": 2.0595024365221852,
+      "grad_norm": 0.9378145337104797,
+      "learning_rate": 0.0006269983756518766,
+      "loss": 0.4478,
+      "step": 16060
+    },
+    {
+      "epoch": 2.060784816619646,
+      "grad_norm": 0.7080726623535156,
+      "learning_rate": 0.0006261434555869026,
+      "loss": 0.3637,
+      "step": 16070
+    },
+    {
+      "epoch": 2.062067196717107,
+      "grad_norm": 1.104427456855774,
+      "learning_rate": 0.0006252885355219287,
+      "loss": 0.3889,
+      "step": 16080
+    },
+    {
+      "epoch": 2.0633495768145678,
+      "grad_norm": 0.49368712306022644,
+      "learning_rate": 0.0006244336154569548,
+      "loss": 0.3342,
+      "step": 16090
+    },
+    {
+      "epoch": 2.0646319569120286,
+      "grad_norm": 0.5924476385116577,
+      "learning_rate": 0.0006235786953919808,
+      "loss": 0.383,
+      "step": 16100
+    },
+    {
+      "epoch": 2.0659143370094895,
+      "grad_norm": 0.8648740649223328,
+      "learning_rate": 0.000622723775327007,
+      "loss": 0.3354,
+      "step": 16110
+    },
+    {
+      "epoch": 2.0671967171069503,
+      "grad_norm": 0.9857394695281982,
+      "learning_rate": 0.000621868855262033,
+      "loss": 0.2874,
+      "step": 16120
+    },
+    {
+      "epoch": 2.0684790972044116,
+      "grad_norm": 0.6319371461868286,
+      "learning_rate": 0.000621013935197059,
+      "loss": 0.3847,
+      "step": 16130
+    },
+    {
+      "epoch": 2.0697614773018724,
+      "grad_norm": 1.4830057621002197,
+      "learning_rate": 0.0006201590151320852,
+      "loss": 0.3945,
+      "step": 16140
+    },
+    {
+      "epoch": 2.0710438573993333,
+      "grad_norm": 1.0306016206741333,
+      "learning_rate": 0.0006193040950671113,
+      "loss": 0.4081,
+      "step": 16150
+    },
+    {
+      "epoch": 2.072326237496794,
+      "grad_norm": 0.6749256253242493,
+      "learning_rate": 0.0006184491750021372,
+      "loss": 0.314,
+      "step": 16160
+    },
+    {
+      "epoch": 2.073608617594255,
+      "grad_norm": 0.7656669020652771,
+      "learning_rate": 0.0006175942549371633,
+      "loss": 0.3983,
+      "step": 16170
+    },
+    {
+      "epoch": 2.074890997691716,
+      "grad_norm": 0.7537424564361572,
+      "learning_rate": 0.0006167393348721895,
+      "loss": 0.3204,
+      "step": 16180
+    },
+    {
+      "epoch": 2.0761733777891767,
+      "grad_norm": 0.45361366868019104,
+      "learning_rate": 0.0006158844148072156,
+      "loss": 0.4101,
+      "step": 16190
+    },
+    {
+      "epoch": 2.0774557578866375,
+      "grad_norm": 1.6658540964126587,
+      "learning_rate": 0.0006150294947422415,
+      "loss": 0.4625,
+      "step": 16200
+    },
+    {
+      "epoch": 2.0787381379840983,
+      "grad_norm": 0.9616145491600037,
+      "learning_rate": 0.0006141745746772677,
+      "loss": 0.343,
+      "step": 16210
+    },
+    {
+      "epoch": 2.080020518081559,
+      "grad_norm": 1.1583889722824097,
+      "learning_rate": 0.0006133196546122938,
+      "loss": 0.3612,
+      "step": 16220
+    },
+    {
+      "epoch": 2.08130289817902,
+      "grad_norm": 0.46162256598472595,
+      "learning_rate": 0.0006124647345473198,
+      "loss": 0.3976,
+      "step": 16230
+    },
+    {
+      "epoch": 2.0825852782764813,
+      "grad_norm": 0.5580847859382629,
+      "learning_rate": 0.0006116098144823459,
+      "loss": 0.3302,
+      "step": 16240
+    },
+    {
+      "epoch": 2.083867658373942,
+      "grad_norm": 0.9140333533287048,
+      "learning_rate": 0.000610754894417372,
+      "loss": 0.4096,
+      "step": 16250
+    },
+    {
+      "epoch": 2.085150038471403,
+      "grad_norm": 1.2011090517044067,
+      "learning_rate": 0.000609899974352398,
+      "loss": 0.4436,
+      "step": 16260
+    },
+    {
+      "epoch": 2.086432418568864,
+      "grad_norm": 0.5058355331420898,
+      "learning_rate": 0.0006090450542874242,
+      "loss": 0.4824,
+      "step": 16270
+    },
+    {
+      "epoch": 2.0877147986663247,
+      "grad_norm": 0.9225788712501526,
+      "learning_rate": 0.0006081901342224502,
+      "loss": 0.3696,
+      "step": 16280
+    },
+    {
+      "epoch": 2.0889971787637855,
+      "grad_norm": 0.8377031683921814,
+      "learning_rate": 0.0006073352141574762,
+      "loss": 0.4635,
+      "step": 16290
+    },
+    {
+      "epoch": 2.0902795588612464,
+      "grad_norm": 1.528792142868042,
+      "learning_rate": 0.0006064802940925024,
+      "loss": 0.4176,
+      "step": 16300
+    },
+    {
+      "epoch": 2.0915619389587072,
+      "grad_norm": 0.798425555229187,
+      "learning_rate": 0.0006056253740275285,
+      "loss": 0.3938,
+      "step": 16310
+    },
+    {
+      "epoch": 2.092844319056168,
+      "grad_norm": 0.49224352836608887,
+      "learning_rate": 0.0006047704539625545,
+      "loss": 0.3357,
+      "step": 16320
+    },
+    {
+      "epoch": 2.094126699153629,
+      "grad_norm": 0.5816643238067627,
+      "learning_rate": 0.0006039155338975806,
+      "loss": 0.3523,
+      "step": 16330
+    },
+    {
+      "epoch": 2.09540907925109,
+      "grad_norm": 0.7259325385093689,
+      "learning_rate": 0.0006030606138326067,
+      "loss": 0.4355,
+      "step": 16340
+    },
+    {
+      "epoch": 2.096691459348551,
+      "grad_norm": 0.8192687630653381,
+      "learning_rate": 0.0006022056937676328,
+      "loss": 0.4217,
+      "step": 16350
+    },
+    {
+      "epoch": 2.097973839446012,
+      "grad_norm": 1.0315042734146118,
+      "learning_rate": 0.0006013507737026588,
+      "loss": 0.4108,
+      "step": 16360
+    },
+    {
+      "epoch": 2.0992562195434727,
+      "grad_norm": 0.7295234203338623,
+      "learning_rate": 0.0006004958536376849,
+      "loss": 0.3817,
+      "step": 16370
+    },
+    {
+      "epoch": 2.1005385996409336,
+      "grad_norm": 0.8055635094642639,
+      "learning_rate": 0.000599640933572711,
+      "loss": 0.4461,
+      "step": 16380
+    },
+    {
+      "epoch": 2.1018209797383944,
+      "grad_norm": 0.9838399887084961,
+      "learning_rate": 0.000598786013507737,
+      "loss": 0.4033,
+      "step": 16390
+    },
+    {
+      "epoch": 2.1031033598358553,
+      "grad_norm": 0.9164043068885803,
+      "learning_rate": 0.0005979310934427631,
+      "loss": 0.4508,
+      "step": 16400
+    },
+    {
+      "epoch": 2.104385739933316,
+      "grad_norm": 1.4616590738296509,
+      "learning_rate": 0.0005970761733777892,
+      "loss": 0.4832,
+      "step": 16410
+    },
+    {
+      "epoch": 2.105668120030777,
+      "grad_norm": 0.7076154351234436,
+      "learning_rate": 0.0005962212533128152,
+      "loss": 0.3766,
+      "step": 16420
+    },
+    {
+      "epoch": 2.106950500128238,
+      "grad_norm": 0.9713407754898071,
+      "learning_rate": 0.0005953663332478413,
+      "loss": 0.4856,
+      "step": 16430
+    },
+    {
+      "epoch": 2.108232880225699,
+      "grad_norm": 0.5862424373626709,
+      "learning_rate": 0.0005945114131828674,
+      "loss": 0.3552,
+      "step": 16440
+    },
+    {
+      "epoch": 2.10951526032316,
+      "grad_norm": 1.3312978744506836,
+      "learning_rate": 0.0005936564931178934,
+      "loss": 0.4499,
+      "step": 16450
+    },
+    {
+      "epoch": 2.110797640420621,
+      "grad_norm": 0.790224552154541,
+      "learning_rate": 0.0005928015730529196,
+      "loss": 0.4647,
+      "step": 16460
+    },
+    {
+      "epoch": 2.1120800205180816,
+      "grad_norm": 0.6152584552764893,
+      "learning_rate": 0.0005919466529879456,
+      "loss": 0.4688,
+      "step": 16470
+    },
+    {
+      "epoch": 2.1133624006155425,
+      "grad_norm": 0.586744487285614,
+      "learning_rate": 0.0005910917329229717,
+      "loss": 0.3254,
+      "step": 16480
+    },
+    {
+      "epoch": 2.1146447807130033,
+      "grad_norm": 0.9276888370513916,
+      "learning_rate": 0.0005902368128579978,
+      "loss": 0.4067,
+      "step": 16490
+    },
+    {
+      "epoch": 2.115927160810464,
+      "grad_norm": 0.5232440829277039,
+      "learning_rate": 0.0005893818927930239,
+      "loss": 0.2955,
+      "step": 16500
+    },
+    {
+      "epoch": 2.117209540907925,
+      "grad_norm": 1.1610713005065918,
+      "learning_rate": 0.0005885269727280499,
+      "loss": 0.3482,
+      "step": 16510
+    },
+    {
+      "epoch": 2.118491921005386,
+      "grad_norm": 0.8713477849960327,
+      "learning_rate": 0.000587672052663076,
+      "loss": 0.3607,
+      "step": 16520
+    },
+    {
+      "epoch": 2.1197743011028467,
+      "grad_norm": 1.2299057245254517,
+      "learning_rate": 0.0005868171325981021,
+      "loss": 0.489,
+      "step": 16530
+    },
+    {
+      "epoch": 2.121056681200308,
+      "grad_norm": 0.8449939489364624,
+      "learning_rate": 0.0005859622125331282,
+      "loss": 0.4024,
+      "step": 16540
+    },
+    {
+      "epoch": 2.122339061297769,
+      "grad_norm": 1.0268441438674927,
+      "learning_rate": 0.0005851072924681542,
+      "loss": 0.4341,
+      "step": 16550
+    },
+    {
+      "epoch": 2.1236214413952297,
+      "grad_norm": 0.7868974804878235,
+      "learning_rate": 0.0005842523724031803,
+      "loss": 0.3437,
+      "step": 16560
+    },
+    {
+      "epoch": 2.1249038214926905,
+      "grad_norm": 0.45466360449790955,
+      "learning_rate": 0.0005833974523382064,
+      "loss": 0.3059,
+      "step": 16570
+    },
+    {
+      "epoch": 2.1261862015901514,
+      "grad_norm": 0.604418933391571,
+      "learning_rate": 0.0005825425322732325,
+      "loss": 0.4058,
+      "step": 16580
+    },
+    {
+      "epoch": 2.127468581687612,
+      "grad_norm": 1.0346992015838623,
+      "learning_rate": 0.0005816876122082585,
+      "loss": 0.3903,
+      "step": 16590
+    },
+    {
+      "epoch": 2.128750961785073,
+      "grad_norm": 0.8088748455047607,
+      "learning_rate": 0.0005808326921432846,
+      "loss": 0.4438,
+      "step": 16600
+    },
+    {
+      "epoch": 2.130033341882534,
+      "grad_norm": 1.0457253456115723,
+      "learning_rate": 0.0005799777720783108,
+      "loss": 0.444,
+      "step": 16610
+    },
+    {
+      "epoch": 2.1313157219799947,
+      "grad_norm": 1.0352778434753418,
+      "learning_rate": 0.0005791228520133368,
+      "loss": 0.3832,
+      "step": 16620
+    },
+    {
+      "epoch": 2.1325981020774556,
+      "grad_norm": 0.9149858355522156,
+      "learning_rate": 0.0005782679319483628,
+      "loss": 0.3834,
+      "step": 16630
+    },
+    {
+      "epoch": 2.1338804821749164,
+      "grad_norm": 0.8805481791496277,
+      "learning_rate": 0.000577413011883389,
+      "loss": 0.4027,
+      "step": 16640
+    },
+    {
+      "epoch": 2.1351628622723777,
+      "grad_norm": 1.2850439548492432,
+      "learning_rate": 0.000576558091818415,
+      "loss": 0.5138,
+      "step": 16650
+    },
+    {
+      "epoch": 2.1364452423698386,
+      "grad_norm": 1.2789738178253174,
+      "learning_rate": 0.0005757031717534411,
+      "loss": 0.3797,
+      "step": 16660
+    },
+    {
+      "epoch": 2.1377276224672994,
+      "grad_norm": 1.1163911819458008,
+      "learning_rate": 0.0005748482516884671,
+      "loss": 0.4236,
+      "step": 16670
+    },
+    {
+      "epoch": 2.1390100025647603,
+      "grad_norm": 1.351048469543457,
+      "learning_rate": 0.0005739933316234932,
+      "loss": 0.5308,
+      "step": 16680
+    },
+    {
+      "epoch": 2.140292382662221,
+      "grad_norm": 0.5716691613197327,
+      "learning_rate": 0.0005731384115585193,
+      "loss": 0.4708,
+      "step": 16690
+    },
+    {
+      "epoch": 2.141574762759682,
+      "grad_norm": 0.4324432909488678,
+      "learning_rate": 0.0005722834914935454,
+      "loss": 0.4613,
+      "step": 16700
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 1.034508228302002,
+      "learning_rate": 0.0005714285714285714,
+      "loss": 0.4185,
+      "step": 16710
+    },
+    {
+      "epoch": 2.1441395229546036,
+      "grad_norm": 0.8945391774177551,
+      "learning_rate": 0.0005705736513635975,
+      "loss": 0.3495,
+      "step": 16720
+    },
+    {
+      "epoch": 2.1454219030520645,
+      "grad_norm": 1.2209150791168213,
+      "learning_rate": 0.0005697187312986236,
+      "loss": 0.3843,
+      "step": 16730
+    },
+    {
+      "epoch": 2.1467042831495253,
+      "grad_norm": 0.5386025309562683,
+      "learning_rate": 0.0005688638112336498,
+      "loss": 0.4124,
+      "step": 16740
+    },
+    {
+      "epoch": 2.1479866632469866,
+      "grad_norm": 0.7066617012023926,
+      "learning_rate": 0.0005680088911686757,
+      "loss": 0.3361,
+      "step": 16750
+    },
+    {
+      "epoch": 2.1492690433444475,
+      "grad_norm": 0.8300710916519165,
+      "learning_rate": 0.0005671539711037018,
+      "loss": 0.4056,
+      "step": 16760
+    },
+    {
+      "epoch": 2.1505514234419083,
+      "grad_norm": 1.522805094718933,
+      "learning_rate": 0.000566299051038728,
+      "loss": 0.4305,
+      "step": 16770
+    },
+    {
+      "epoch": 2.151833803539369,
+      "grad_norm": 0.5003076791763306,
+      "learning_rate": 0.0005654441309737539,
+      "loss": 0.3918,
+      "step": 16780
+    },
+    {
+      "epoch": 2.15311618363683,
+      "grad_norm": 1.1118338108062744,
+      "learning_rate": 0.00056458921090878,
+      "loss": 0.4591,
+      "step": 16790
+    },
+    {
+      "epoch": 2.154398563734291,
+      "grad_norm": 0.6473600268363953,
+      "learning_rate": 0.0005637342908438062,
+      "loss": 0.3684,
+      "step": 16800
+    },
+    {
+      "epoch": 2.1556809438317517,
+      "grad_norm": 0.5967467427253723,
+      "learning_rate": 0.0005628793707788322,
+      "loss": 0.4068,
+      "step": 16810
+    },
+    {
+      "epoch": 2.1569633239292125,
+      "grad_norm": 1.1594890356063843,
+      "learning_rate": 0.0005620244507138582,
+      "loss": 0.3506,
+      "step": 16820
+    },
+    {
+      "epoch": 2.1582457040266734,
+      "grad_norm": 0.6853704452514648,
+      "learning_rate": 0.0005611695306488844,
+      "loss": 0.4528,
+      "step": 16830
+    },
+    {
+      "epoch": 2.159528084124134,
+      "grad_norm": 0.7889552116394043,
+      "learning_rate": 0.0005603146105839104,
+      "loss": 0.4236,
+      "step": 16840
+    },
+    {
+      "epoch": 2.160810464221595,
+      "grad_norm": 1.3520945310592651,
+      "learning_rate": 0.0005594596905189365,
+      "loss": 0.521,
+      "step": 16850
+    },
+    {
+      "epoch": 2.1620928443190564,
+      "grad_norm": 1.283141851425171,
+      "learning_rate": 0.0005586047704539625,
+      "loss": 0.3847,
+      "step": 16860
+    },
+    {
+      "epoch": 2.163375224416517,
+      "grad_norm": 0.6394121050834656,
+      "learning_rate": 0.0005577498503889886,
+      "loss": 0.4256,
+      "step": 16870
+    },
+    {
+      "epoch": 2.164657604513978,
+      "grad_norm": 0.9717941880226135,
+      "learning_rate": 0.0005568949303240147,
+      "loss": 0.3616,
+      "step": 16880
+    },
+    {
+      "epoch": 2.165939984611439,
+      "grad_norm": 1.2002935409545898,
+      "learning_rate": 0.0005560400102590408,
+      "loss": 0.3632,
+      "step": 16890
+    },
+    {
+      "epoch": 2.1672223647088997,
+      "grad_norm": 1.209804654121399,
+      "learning_rate": 0.0005551850901940669,
+      "loss": 0.3692,
+      "step": 16900
+    },
+    {
+      "epoch": 2.1685047448063606,
+      "grad_norm": 1.1191928386688232,
+      "learning_rate": 0.0005543301701290929,
+      "loss": 0.3921,
+      "step": 16910
+    },
+    {
+      "epoch": 2.1697871249038214,
+      "grad_norm": 1.0837756395339966,
+      "learning_rate": 0.000553475250064119,
+      "loss": 0.3838,
+      "step": 16920
+    },
+    {
+      "epoch": 2.1710695050012823,
+      "grad_norm": 0.950324296951294,
+      "learning_rate": 0.0005526203299991452,
+      "loss": 0.4226,
+      "step": 16930
+    },
+    {
+      "epoch": 2.172351885098743,
+      "grad_norm": 0.4663751423358917,
+      "learning_rate": 0.0005517654099341711,
+      "loss": 0.412,
+      "step": 16940
+    },
+    {
+      "epoch": 2.173634265196204,
+      "grad_norm": 1.0437508821487427,
+      "learning_rate": 0.0005509104898691972,
+      "loss": 0.3732,
+      "step": 16950
+    },
+    {
+      "epoch": 2.1749166452936652,
+      "grad_norm": 0.5461912155151367,
+      "learning_rate": 0.0005500555698042234,
+      "loss": 0.4139,
+      "step": 16960
+    },
+    {
+      "epoch": 2.176199025391126,
+      "grad_norm": 0.7421871423721313,
+      "learning_rate": 0.0005492006497392494,
+      "loss": 0.3798,
+      "step": 16970
+    },
+    {
+      "epoch": 2.177481405488587,
+      "grad_norm": 0.5213621854782104,
+      "learning_rate": 0.0005483457296742754,
+      "loss": 0.4407,
+      "step": 16980
+    },
+    {
+      "epoch": 2.1787637855860478,
+      "grad_norm": 1.0616869926452637,
+      "learning_rate": 0.0005474908096093016,
+      "loss": 0.4209,
+      "step": 16990
+    },
+    {
+      "epoch": 2.1800461656835086,
+      "grad_norm": 0.861587405204773,
+      "learning_rate": 0.0005466358895443276,
+      "loss": 0.3885,
+      "step": 17000
+    },
+    {
+      "epoch": 2.1813285457809695,
+      "grad_norm": 0.4381602704524994,
+      "learning_rate": 0.0005457809694793537,
+      "loss": 0.5041,
+      "step": 17010
+    },
+    {
+      "epoch": 2.1826109258784303,
+      "grad_norm": 0.8949865102767944,
+      "learning_rate": 0.0005449260494143798,
+      "loss": 0.5102,
+      "step": 17020
+    },
+    {
+      "epoch": 2.183893305975891,
+      "grad_norm": 0.879464328289032,
+      "learning_rate": 0.0005440711293494058,
+      "loss": 0.536,
+      "step": 17030
+    },
+    {
+      "epoch": 2.185175686073352,
+      "grad_norm": 0.9006835222244263,
+      "learning_rate": 0.0005432162092844319,
+      "loss": 0.3617,
+      "step": 17040
+    },
+    {
+      "epoch": 2.186458066170813,
+      "grad_norm": 1.1125102043151855,
+      "learning_rate": 0.0005423612892194581,
+      "loss": 0.3968,
+      "step": 17050
+    },
+    {
+      "epoch": 2.1877404462682737,
+      "grad_norm": 0.6102015376091003,
+      "learning_rate": 0.0005415063691544841,
+      "loss": 0.2718,
+      "step": 17060
+    },
+    {
+      "epoch": 2.189022826365735,
+      "grad_norm": 1.4588760137557983,
+      "learning_rate": 0.0005406514490895101,
+      "loss": 0.5102,
+      "step": 17070
+    },
+    {
+      "epoch": 2.190305206463196,
+      "grad_norm": 1.0599772930145264,
+      "learning_rate": 0.0005397965290245362,
+      "loss": 0.3631,
+      "step": 17080
+    },
+    {
+      "epoch": 2.1915875865606567,
+      "grad_norm": 1.4130918979644775,
+      "learning_rate": 0.0005389416089595624,
+      "loss": 0.3659,
+      "step": 17090
+    },
+    {
+      "epoch": 2.1928699666581175,
+      "grad_norm": 0.6802207231521606,
+      "learning_rate": 0.0005380866888945883,
+      "loss": 0.3489,
+      "step": 17100
+    },
+    {
+      "epoch": 2.1941523467555784,
+      "grad_norm": 0.5897672176361084,
+      "learning_rate": 0.0005372317688296144,
+      "loss": 0.3961,
+      "step": 17110
+    },
+    {
+      "epoch": 2.195434726853039,
+      "grad_norm": 1.033302664756775,
+      "learning_rate": 0.0005363768487646406,
+      "loss": 0.4143,
+      "step": 17120
+    },
+    {
+      "epoch": 2.1967171069505,
+      "grad_norm": 0.8548007011413574,
+      "learning_rate": 0.0005355219286996665,
+      "loss": 0.3485,
+      "step": 17130
+    },
+    {
+      "epoch": 2.197999487047961,
+      "grad_norm": 1.1116507053375244,
+      "learning_rate": 0.0005346670086346926,
+      "loss": 0.444,
+      "step": 17140
+    },
+    {
+      "epoch": 2.1992818671454217,
+      "grad_norm": 1.2781219482421875,
+      "learning_rate": 0.0005338120885697188,
+      "loss": 0.4845,
+      "step": 17150
+    },
+    {
+      "epoch": 2.200564247242883,
+      "grad_norm": 1.2359662055969238,
+      "learning_rate": 0.0005329571685047448,
+      "loss": 0.4639,
+      "step": 17160
+    },
+    {
+      "epoch": 2.201846627340344,
+      "grad_norm": 1.5580798387527466,
+      "learning_rate": 0.0005321022484397708,
+      "loss": 0.4575,
+      "step": 17170
+    },
+    {
+      "epoch": 2.2031290074378047,
+      "grad_norm": 1.4028860330581665,
+      "learning_rate": 0.000531247328374797,
+      "loss": 0.324,
+      "step": 17180
+    },
+    {
+      "epoch": 2.2044113875352656,
+      "grad_norm": 0.6842575669288635,
+      "learning_rate": 0.0005303924083098231,
+      "loss": 0.4808,
+      "step": 17190
+    },
+    {
+      "epoch": 2.2056937676327264,
+      "grad_norm": 1.1696909666061401,
+      "learning_rate": 0.0005295374882448491,
+      "loss": 0.5438,
+      "step": 17200
+    },
+    {
+      "epoch": 2.2069761477301872,
+      "grad_norm": 0.7407712936401367,
+      "learning_rate": 0.0005286825681798752,
+      "loss": 0.3779,
+      "step": 17210
+    },
+    {
+      "epoch": 2.208258527827648,
+      "grad_norm": 1.0011707544326782,
+      "learning_rate": 0.0005278276481149013,
+      "loss": 0.5065,
+      "step": 17220
+    },
+    {
+      "epoch": 2.209540907925109,
+      "grad_norm": 0.871257483959198,
+      "learning_rate": 0.0005269727280499273,
+      "loss": 0.3252,
+      "step": 17230
+    },
+    {
+      "epoch": 2.2108232880225698,
+      "grad_norm": 0.9432925581932068,
+      "learning_rate": 0.0005261178079849535,
+      "loss": 0.344,
+      "step": 17240
+    },
+    {
+      "epoch": 2.2121056681200306,
+      "grad_norm": 0.726510763168335,
+      "learning_rate": 0.0005252628879199795,
+      "loss": 0.431,
+      "step": 17250
+    },
+    {
+      "epoch": 2.2133880482174915,
+      "grad_norm": 0.698881983757019,
+      "learning_rate": 0.0005244079678550055,
+      "loss": 0.4012,
+      "step": 17260
+    },
+    {
+      "epoch": 2.2146704283149528,
+      "grad_norm": 1.3157625198364258,
+      "learning_rate": 0.0005235530477900317,
+      "loss": 0.4714,
+      "step": 17270
+    },
+    {
+      "epoch": 2.2159528084124136,
+      "grad_norm": 1.106425166130066,
+      "learning_rate": 0.0005226981277250578,
+      "loss": 0.4249,
+      "step": 17280
+    },
+    {
+      "epoch": 2.2172351885098744,
+      "grad_norm": 1.1882113218307495,
+      "learning_rate": 0.0005218432076600837,
+      "loss": 0.4074,
+      "step": 17290
+    },
+    {
+      "epoch": 2.2185175686073353,
+      "grad_norm": 1.2039605379104614,
+      "learning_rate": 0.0005209882875951098,
+      "loss": 0.5073,
+      "step": 17300
+    },
+    {
+      "epoch": 2.219799948704796,
+      "grad_norm": 1.7524374723434448,
+      "learning_rate": 0.000520133367530136,
+      "loss": 0.4022,
+      "step": 17310
+    },
+    {
+      "epoch": 2.221082328802257,
+      "grad_norm": 0.8379983901977539,
+      "learning_rate": 0.0005192784474651621,
+      "loss": 0.4739,
+      "step": 17320
+    },
+    {
+      "epoch": 2.222364708899718,
+      "grad_norm": 1.3615164756774902,
+      "learning_rate": 0.000518423527400188,
+      "loss": 0.4061,
+      "step": 17330
+    },
+    {
+      "epoch": 2.2236470889971787,
+      "grad_norm": 1.1694985628128052,
+      "learning_rate": 0.0005175686073352142,
+      "loss": 0.3913,
+      "step": 17340
+    },
+    {
+      "epoch": 2.2249294690946395,
+      "grad_norm": 0.9127678871154785,
+      "learning_rate": 0.0005167136872702403,
+      "loss": 0.3678,
+      "step": 17350
+    },
+    {
+      "epoch": 2.2262118491921004,
+      "grad_norm": 1.2487945556640625,
+      "learning_rate": 0.0005158587672052663,
+      "loss": 0.4025,
+      "step": 17360
+    },
+    {
+      "epoch": 2.2274942292895616,
+      "grad_norm": 0.7297146916389465,
+      "learning_rate": 0.0005150038471402924,
+      "loss": 0.3391,
+      "step": 17370
+    },
+    {
+      "epoch": 2.2287766093870225,
+      "grad_norm": 0.7297811508178711,
+      "learning_rate": 0.0005141489270753185,
+      "loss": 0.404,
+      "step": 17380
+    },
+    {
+      "epoch": 2.2300589894844833,
+      "grad_norm": 1.2100460529327393,
+      "learning_rate": 0.0005132940070103445,
+      "loss": 0.5015,
+      "step": 17390
+    },
+    {
+      "epoch": 2.231341369581944,
+      "grad_norm": 1.232190728187561,
+      "learning_rate": 0.0005124390869453707,
+      "loss": 0.4982,
+      "step": 17400
+    },
+    {
+      "epoch": 2.232623749679405,
+      "grad_norm": 1.462148904800415,
+      "learning_rate": 0.0005115841668803967,
+      "loss": 0.4461,
+      "step": 17410
+    },
+    {
+      "epoch": 2.233906129776866,
+      "grad_norm": 0.9447479844093323,
+      "learning_rate": 0.0005107292468154227,
+      "loss": 0.3381,
+      "step": 17420
+    },
+    {
+      "epoch": 2.2351885098743267,
+      "grad_norm": 1.2533239126205444,
+      "learning_rate": 0.0005098743267504489,
+      "loss": 0.3932,
+      "step": 17430
+    },
+    {
+      "epoch": 2.2364708899717876,
+      "grad_norm": 0.4960061013698578,
+      "learning_rate": 0.000509019406685475,
+      "loss": 0.4423,
+      "step": 17440
+    },
+    {
+      "epoch": 2.2377532700692484,
+      "grad_norm": 1.033347487449646,
+      "learning_rate": 0.0005081644866205009,
+      "loss": 0.3857,
+      "step": 17450
+    },
+    {
+      "epoch": 2.2390356501667092,
+      "grad_norm": 0.45185425877571106,
+      "learning_rate": 0.000507309566555527,
+      "loss": 0.3485,
+      "step": 17460
+    },
+    {
+      "epoch": 2.24031803026417,
+      "grad_norm": 0.7259741425514221,
+      "learning_rate": 0.0005064546464905532,
+      "loss": 0.4259,
+      "step": 17470
+    },
+    {
+      "epoch": 2.2416004103616314,
+      "grad_norm": 1.2143189907073975,
+      "learning_rate": 0.0005055997264255792,
+      "loss": 0.439,
+      "step": 17480
+    },
+    {
+      "epoch": 2.2428827904590922,
+      "grad_norm": 0.7752086520195007,
+      "learning_rate": 0.0005047448063606052,
+      "loss": 0.3788,
+      "step": 17490
+    },
+    {
+      "epoch": 2.244165170556553,
+      "grad_norm": 1.4273003339767456,
+      "learning_rate": 0.0005038898862956314,
+      "loss": 0.4544,
+      "step": 17500
+    },
+    {
+      "epoch": 2.245447550654014,
+      "grad_norm": 0.5938236713409424,
+      "learning_rate": 0.0005030349662306575,
+      "loss": 0.3911,
+      "step": 17510
+    },
+    {
+      "epoch": 2.2467299307514748,
+      "grad_norm": 1.0833735466003418,
+      "learning_rate": 0.0005021800461656834,
+      "loss": 0.4827,
+      "step": 17520
+    },
+    {
+      "epoch": 2.2480123108489356,
+      "grad_norm": 0.9137888550758362,
+      "learning_rate": 0.0005013251261007096,
+      "loss": 0.4302,
+      "step": 17530
+    },
+    {
+      "epoch": 2.2492946909463964,
+      "grad_norm": 1.2359901666641235,
+      "learning_rate": 0.0005004702060357357,
+      "loss": 0.4328,
+      "step": 17540
+    },
+    {
+      "epoch": 2.2505770710438573,
+      "grad_norm": 0.5860967636108398,
+      "learning_rate": 0.0004996152859707617,
+      "loss": 0.3769,
+      "step": 17550
+    },
+    {
+      "epoch": 2.251859451141318,
+      "grad_norm": 0.7964845299720764,
+      "learning_rate": 0.0004987603659057879,
+      "loss": 0.4499,
+      "step": 17560
+    },
+    {
+      "epoch": 2.253141831238779,
+      "grad_norm": 0.6681275367736816,
+      "learning_rate": 0.0004979054458408139,
+      "loss": 0.5417,
+      "step": 17570
+    },
+    {
+      "epoch": 2.2544242113362403,
+      "grad_norm": 0.5192536115646362,
+      "learning_rate": 0.0004970505257758399,
+      "loss": 0.3263,
+      "step": 17580
+    },
+    {
+      "epoch": 2.255706591433701,
+      "grad_norm": 0.7628294229507446,
+      "learning_rate": 0.0004961956057108661,
+      "loss": 0.2887,
+      "step": 17590
+    },
+    {
+      "epoch": 2.256988971531162,
+      "grad_norm": 0.8533459901809692,
+      "learning_rate": 0.0004953406856458921,
+      "loss": 0.3149,
+      "step": 17600
+    },
+    {
+      "epoch": 2.258271351628623,
+      "grad_norm": 0.5388279557228088,
+      "learning_rate": 0.0004944857655809181,
+      "loss": 0.4372,
+      "step": 17610
+    },
+    {
+      "epoch": 2.2595537317260836,
+      "grad_norm": 0.8363872766494751,
+      "learning_rate": 0.0004936308455159443,
+      "loss": 0.5383,
+      "step": 17620
+    },
+    {
+      "epoch": 2.2608361118235445,
+      "grad_norm": 1.2380322217941284,
+      "learning_rate": 0.0004927759254509703,
+      "loss": 0.3947,
+      "step": 17630
+    },
+    {
+      "epoch": 2.2621184919210053,
+      "grad_norm": 0.5750362277030945,
+      "learning_rate": 0.0004919210053859964,
+      "loss": 0.3112,
+      "step": 17640
+    },
+    {
+      "epoch": 2.263400872018466,
+      "grad_norm": 1.3540990352630615,
+      "learning_rate": 0.0004910660853210225,
+      "loss": 0.4291,
+      "step": 17650
+    },
+    {
+      "epoch": 2.264683252115927,
+      "grad_norm": 1.2334551811218262,
+      "learning_rate": 0.0004902111652560486,
+      "loss": 0.3534,
+      "step": 17660
+    },
+    {
+      "epoch": 2.265965632213388,
+      "grad_norm": 1.0018736124038696,
+      "learning_rate": 0.0004893562451910746,
+      "loss": 0.3656,
+      "step": 17670
+    },
+    {
+      "epoch": 2.2672480123108487,
+      "grad_norm": 1.0932631492614746,
+      "learning_rate": 0.0004885013251261008,
+      "loss": 0.3613,
+      "step": 17680
+    },
+    {
+      "epoch": 2.26853039240831,
+      "grad_norm": 0.900193989276886,
+      "learning_rate": 0.0004876464050611268,
+      "loss": 0.3352,
+      "step": 17690
+    },
+    {
+      "epoch": 2.269812772505771,
+      "grad_norm": 0.511600136756897,
+      "learning_rate": 0.00048679148499615287,
+      "loss": 0.5532,
+      "step": 17700
+    },
+    {
+      "epoch": 2.2710951526032317,
+      "grad_norm": 1.1176284551620483,
+      "learning_rate": 0.00048593656493117895,
+      "loss": 0.2997,
+      "step": 17710
+    },
+    {
+      "epoch": 2.2723775327006925,
+      "grad_norm": 1.379473090171814,
+      "learning_rate": 0.00048508164486620503,
+      "loss": 0.3758,
+      "step": 17720
+    },
+    {
+      "epoch": 2.2736599127981534,
+      "grad_norm": 0.7329534888267517,
+      "learning_rate": 0.0004842267248012311,
+      "loss": 0.3541,
+      "step": 17730
+    },
+    {
+      "epoch": 2.2749422928956142,
+      "grad_norm": 1.0883692502975464,
+      "learning_rate": 0.0004833718047362572,
+      "loss": 0.4156,
+      "step": 17740
+    },
+    {
+      "epoch": 2.276224672993075,
+      "grad_norm": 1.1010819673538208,
+      "learning_rate": 0.0004825168846712832,
+      "loss": 0.5474,
+      "step": 17750
+    },
+    {
+      "epoch": 2.277507053090536,
+      "grad_norm": 1.4709731340408325,
+      "learning_rate": 0.00048166196460630936,
+      "loss": 0.3586,
+      "step": 17760
+    },
+    {
+      "epoch": 2.2787894331879968,
+      "grad_norm": 1.0419952869415283,
+      "learning_rate": 0.0004808070445413354,
+      "loss": 0.3637,
+      "step": 17770
+    },
+    {
+      "epoch": 2.280071813285458,
+      "grad_norm": 0.6669880747795105,
+      "learning_rate": 0.00047995212447636147,
+      "loss": 0.4534,
+      "step": 17780
+    },
+    {
+      "epoch": 2.281354193382919,
+      "grad_norm": 0.7150077223777771,
+      "learning_rate": 0.00047909720441138755,
+      "loss": 0.3625,
+      "step": 17790
+    },
+    {
+      "epoch": 2.2826365734803797,
+      "grad_norm": 0.8918224573135376,
+      "learning_rate": 0.00047824228434641364,
+      "loss": 0.469,
+      "step": 17800
+    },
+    {
+      "epoch": 2.2839189535778406,
+      "grad_norm": 1.1246883869171143,
+      "learning_rate": 0.0004773873642814397,
+      "loss": 0.3144,
+      "step": 17810
+    },
+    {
+      "epoch": 2.2852013336753014,
+      "grad_norm": 0.7975451946258545,
+      "learning_rate": 0.00047653244421646575,
+      "loss": 0.3007,
+      "step": 17820
+    },
+    {
+      "epoch": 2.2864837137727623,
+      "grad_norm": 1.3306605815887451,
+      "learning_rate": 0.00047567752415149183,
+      "loss": 0.5171,
+      "step": 17830
+    },
+    {
+      "epoch": 2.287766093870223,
+      "grad_norm": 0.8955139517784119,
+      "learning_rate": 0.0004748226040865179,
+      "loss": 0.4836,
+      "step": 17840
+    },
+    {
+      "epoch": 2.289048473967684,
+      "grad_norm": 1.8671926259994507,
+      "learning_rate": 0.000473967684021544,
+      "loss": 0.4806,
+      "step": 17850
+    },
+    {
+      "epoch": 2.290330854065145,
+      "grad_norm": 0.8943301439285278,
+      "learning_rate": 0.00047311276395657,
+      "loss": 0.3481,
+      "step": 17860
+    },
+    {
+      "epoch": 2.2916132341626057,
+      "grad_norm": 0.938799262046814,
+      "learning_rate": 0.00047225784389159616,
+      "loss": 0.3384,
+      "step": 17870
+    },
+    {
+      "epoch": 2.2928956142600665,
+      "grad_norm": 0.9175413846969604,
+      "learning_rate": 0.0004714029238266222,
+      "loss": 0.3455,
+      "step": 17880
+    },
+    {
+      "epoch": 2.2941779943575273,
+      "grad_norm": 0.8490305542945862,
+      "learning_rate": 0.0004705480037616483,
+      "loss": 0.4001,
+      "step": 17890
+    },
+    {
+      "epoch": 2.2954603744549886,
+      "grad_norm": 0.525170087814331,
+      "learning_rate": 0.00046969308369667435,
+      "loss": 0.3755,
+      "step": 17900
+    },
+    {
+      "epoch": 2.2967427545524495,
+      "grad_norm": 0.45375433564186096,
+      "learning_rate": 0.00046883816363170043,
+      "loss": 0.4539,
+      "step": 17910
+    },
+    {
+      "epoch": 2.2980251346499103,
+      "grad_norm": 0.6057801246643066,
+      "learning_rate": 0.0004679832435667265,
+      "loss": 0.3656,
+      "step": 17920
+    },
+    {
+      "epoch": 2.299307514747371,
+      "grad_norm": 1.6983225345611572,
+      "learning_rate": 0.0004671283235017526,
+      "loss": 0.4801,
+      "step": 17930
+    },
+    {
+      "epoch": 2.300589894844832,
+      "grad_norm": 0.8477333188056946,
+      "learning_rate": 0.0004662734034367787,
+      "loss": 0.3411,
+      "step": 17940
+    },
+    {
+      "epoch": 2.301872274942293,
+      "grad_norm": 1.024043321609497,
+      "learning_rate": 0.00046541848337180476,
+      "loss": 0.3007,
+      "step": 17950
+    },
+    {
+      "epoch": 2.3031546550397537,
+      "grad_norm": 1.2260679006576538,
+      "learning_rate": 0.0004645635633068308,
+      "loss": 0.4926,
+      "step": 17960
+    },
+    {
+      "epoch": 2.3044370351372145,
+      "grad_norm": 0.626004159450531,
+      "learning_rate": 0.00046370864324185693,
+      "loss": 0.3906,
+      "step": 17970
+    },
+    {
+      "epoch": 2.3057194152346754,
+      "grad_norm": 0.8693203330039978,
+      "learning_rate": 0.00046285372317688296,
+      "loss": 0.3919,
+      "step": 17980
+    },
+    {
+      "epoch": 2.3070017953321367,
+      "grad_norm": 0.8525885343551636,
+      "learning_rate": 0.00046199880311190904,
+      "loss": 0.4073,
+      "step": 17990
+    },
+    {
+      "epoch": 2.3082841754295975,
+      "grad_norm": 0.7898913025856018,
+      "learning_rate": 0.0004611438830469351,
+      "loss": 0.3701,
+      "step": 18000
+    },
+    {
+      "epoch": 2.3095665555270584,
+      "grad_norm": 0.6249486804008484,
+      "learning_rate": 0.0004602889629819612,
+      "loss": 0.4573,
+      "step": 18010
+    },
+    {
+      "epoch": 2.310848935624519,
+      "grad_norm": 0.5609285831451416,
+      "learning_rate": 0.0004594340429169873,
+      "loss": 0.2935,
+      "step": 18020
+    },
+    {
+      "epoch": 2.31213131572198,
+      "grad_norm": 0.6433789730072021,
+      "learning_rate": 0.00045857912285201337,
+      "loss": 0.386,
+      "step": 18030
+    },
+    {
+      "epoch": 2.313413695819441,
+      "grad_norm": 1.4051841497421265,
+      "learning_rate": 0.0004577242027870394,
+      "loss": 0.4438,
+      "step": 18040
+    },
+    {
+      "epoch": 2.3146960759169017,
+      "grad_norm": 0.8757970929145813,
+      "learning_rate": 0.00045686928272206553,
+      "loss": 0.3941,
+      "step": 18050
+    },
+    {
+      "epoch": 2.3159784560143626,
+      "grad_norm": 0.6573584675788879,
+      "learning_rate": 0.00045601436265709156,
+      "loss": 0.412,
+      "step": 18060
+    },
+    {
+      "epoch": 2.3172608361118234,
+      "grad_norm": 0.6750732064247131,
+      "learning_rate": 0.00045515944259211764,
+      "loss": 0.33,
+      "step": 18070
+    },
+    {
+      "epoch": 2.3185432162092843,
+      "grad_norm": 0.9263201951980591,
+      "learning_rate": 0.0004543045225271437,
+      "loss": 0.4086,
+      "step": 18080
+    },
+    {
+      "epoch": 2.319825596306745,
+      "grad_norm": 0.9872358441352844,
+      "learning_rate": 0.0004534496024621698,
+      "loss": 0.4036,
+      "step": 18090
+    },
+    {
+      "epoch": 2.321107976404206,
+      "grad_norm": 1.5108319520950317,
+      "learning_rate": 0.0004525946823971959,
+      "loss": 0.3106,
+      "step": 18100
+    },
+    {
+      "epoch": 2.3223903565016673,
+      "grad_norm": 0.9161720871925354,
+      "learning_rate": 0.00045173976233222197,
+      "loss": 0.3777,
+      "step": 18110
+    },
+    {
+      "epoch": 2.323672736599128,
+      "grad_norm": 1.0512194633483887,
+      "learning_rate": 0.000450884842267248,
+      "loss": 0.4419,
+      "step": 18120
+    },
+    {
+      "epoch": 2.324955116696589,
+      "grad_norm": 0.6393684148788452,
+      "learning_rate": 0.00045002992220227414,
+      "loss": 0.4628,
+      "step": 18130
+    },
+    {
+      "epoch": 2.32623749679405,
+      "grad_norm": 0.9643192887306213,
+      "learning_rate": 0.00044917500213730017,
+      "loss": 0.4549,
+      "step": 18140
+    },
+    {
+      "epoch": 2.3275198768915106,
+      "grad_norm": 1.658616542816162,
+      "learning_rate": 0.00044832008207232625,
+      "loss": 0.3435,
+      "step": 18150
+    },
+    {
+      "epoch": 2.3288022569889715,
+      "grad_norm": 0.7164269685745239,
+      "learning_rate": 0.00044746516200735233,
+      "loss": 0.2776,
+      "step": 18160
+    },
+    {
+      "epoch": 2.3300846370864323,
+      "grad_norm": 1.204102873802185,
+      "learning_rate": 0.00044661024194237836,
+      "loss": 0.399,
+      "step": 18170
+    },
+    {
+      "epoch": 2.331367017183893,
+      "grad_norm": 0.719174325466156,
+      "learning_rate": 0.0004457553218774045,
+      "loss": 0.3717,
+      "step": 18180
+    },
+    {
+      "epoch": 2.332649397281354,
+      "grad_norm": 0.8231685757637024,
+      "learning_rate": 0.0004449004018124305,
+      "loss": 0.3388,
+      "step": 18190
+    },
+    {
+      "epoch": 2.3339317773788153,
+      "grad_norm": 0.542766809463501,
+      "learning_rate": 0.0004440454817474566,
+      "loss": 0.3687,
+      "step": 18200
+    },
+    {
+      "epoch": 2.335214157476276,
+      "grad_norm": 0.7932581305503845,
+      "learning_rate": 0.0004431905616824827,
+      "loss": 0.4434,
+      "step": 18210
+    },
+    {
+      "epoch": 2.336496537573737,
+      "grad_norm": 1.064727544784546,
+      "learning_rate": 0.00044233564161750877,
+      "loss": 0.4495,
+      "step": 18220
+    },
+    {
+      "epoch": 2.337778917671198,
+      "grad_norm": 0.7613261342048645,
+      "learning_rate": 0.00044148072155253485,
+      "loss": 0.4192,
+      "step": 18230
+    },
+    {
+      "epoch": 2.3390612977686587,
+      "grad_norm": 1.3468183279037476,
+      "learning_rate": 0.00044062580148756093,
+      "loss": 0.454,
+      "step": 18240
+    },
+    {
+      "epoch": 2.3403436778661195,
+      "grad_norm": 1.017491102218628,
+      "learning_rate": 0.00043977088142258696,
+      "loss": 0.3561,
+      "step": 18250
+    },
+    {
+      "epoch": 2.3416260579635804,
+      "grad_norm": 1.4051862955093384,
+      "learning_rate": 0.0004389159613576131,
+      "loss": 0.4603,
+      "step": 18260
+    },
+    {
+      "epoch": 2.342908438061041,
+      "grad_norm": 0.8021685481071472,
+      "learning_rate": 0.00043806104129263913,
+      "loss": 0.3409,
+      "step": 18270
+    },
+    {
+      "epoch": 2.344190818158502,
+      "grad_norm": 0.889196515083313,
+      "learning_rate": 0.0004372061212276652,
+      "loss": 0.3659,
+      "step": 18280
+    },
+    {
+      "epoch": 2.345473198255963,
+      "grad_norm": 1.0410467386245728,
+      "learning_rate": 0.0004363512011626913,
+      "loss": 0.3478,
+      "step": 18290
+    },
+    {
+      "epoch": 2.3467555783534237,
+      "grad_norm": 0.5652367472648621,
+      "learning_rate": 0.0004354962810977174,
+      "loss": 0.3194,
+      "step": 18300
+    },
+    {
+      "epoch": 2.348037958450885,
+      "grad_norm": 1.7215555906295776,
+      "learning_rate": 0.00043464136103274346,
+      "loss": 0.39,
+      "step": 18310
+    },
+    {
+      "epoch": 2.349320338548346,
+      "grad_norm": 0.96045982837677,
+      "learning_rate": 0.00043378644096776954,
+      "loss": 0.3795,
+      "step": 18320
+    },
+    {
+      "epoch": 2.3506027186458067,
+      "grad_norm": 1.5710773468017578,
+      "learning_rate": 0.00043293152090279557,
+      "loss": 0.39,
+      "step": 18330
+    },
+    {
+      "epoch": 2.3518850987432676,
+      "grad_norm": 1.176043152809143,
+      "learning_rate": 0.0004320766008378217,
+      "loss": 0.3328,
+      "step": 18340
+    },
+    {
+      "epoch": 2.3531674788407284,
+      "grad_norm": 1.4193735122680664,
+      "learning_rate": 0.00043122168077284773,
+      "loss": 0.36,
+      "step": 18350
+    },
+    {
+      "epoch": 2.3544498589381893,
+      "grad_norm": 0.6019266247749329,
+      "learning_rate": 0.0004303667607078738,
+      "loss": 0.3065,
+      "step": 18360
+    },
+    {
+      "epoch": 2.35573223903565,
+      "grad_norm": 0.5137869715690613,
+      "learning_rate": 0.0004295118406428999,
+      "loss": 0.4731,
+      "step": 18370
+    },
+    {
+      "epoch": 2.357014619133111,
+      "grad_norm": 1.5411295890808105,
+      "learning_rate": 0.000428656920577926,
+      "loss": 0.4936,
+      "step": 18380
+    },
+    {
+      "epoch": 2.358296999230572,
+      "grad_norm": 0.8280097842216492,
+      "learning_rate": 0.00042780200051295206,
+      "loss": 0.4289,
+      "step": 18390
+    },
+    {
+      "epoch": 2.359579379328033,
+      "grad_norm": 0.6101049184799194,
+      "learning_rate": 0.00042694708044797814,
+      "loss": 0.2947,
+      "step": 18400
+    },
+    {
+      "epoch": 2.360861759425494,
+      "grad_norm": 1.0666029453277588,
+      "learning_rate": 0.00042609216038300417,
+      "loss": 0.3594,
+      "step": 18410
+    },
+    {
+      "epoch": 2.3621441395229548,
+      "grad_norm": 0.8030332326889038,
+      "learning_rate": 0.0004252372403180303,
+      "loss": 0.4313,
+      "step": 18420
+    },
+    {
+      "epoch": 2.3634265196204156,
+      "grad_norm": 1.3051592111587524,
+      "learning_rate": 0.00042438232025305634,
+      "loss": 0.3878,
+      "step": 18430
+    },
+    {
+      "epoch": 2.3647088997178765,
+      "grad_norm": 0.7515511512756348,
+      "learning_rate": 0.0004235274001880824,
+      "loss": 0.4099,
+      "step": 18440
+    },
+    {
+      "epoch": 2.3659912798153373,
+      "grad_norm": 0.8009350895881653,
+      "learning_rate": 0.0004226724801231085,
+      "loss": 0.5788,
+      "step": 18450
+    },
+    {
+      "epoch": 2.367273659912798,
+      "grad_norm": 0.7808216214179993,
+      "learning_rate": 0.0004218175600581346,
+      "loss": 0.3801,
+      "step": 18460
+    },
+    {
+      "epoch": 2.368556040010259,
+      "grad_norm": 0.9818991422653198,
+      "learning_rate": 0.00042096263999316067,
+      "loss": 0.4815,
+      "step": 18470
+    },
+    {
+      "epoch": 2.36983842010772,
+      "grad_norm": 0.838982343673706,
+      "learning_rate": 0.00042010771992818675,
+      "loss": 0.4868,
+      "step": 18480
+    },
+    {
+      "epoch": 2.3711208002051807,
+      "grad_norm": 1.2091493606567383,
+      "learning_rate": 0.0004192527998632128,
+      "loss": 0.4504,
+      "step": 18490
+    },
+    {
+      "epoch": 2.3724031803026415,
+      "grad_norm": 0.793835461139679,
+      "learning_rate": 0.0004183978797982389,
+      "loss": 0.3619,
+      "step": 18500
+    },
+    {
+      "epoch": 2.3736855604001024,
+      "grad_norm": 0.6502864956855774,
+      "learning_rate": 0.00041754295973326494,
+      "loss": 0.313,
+      "step": 18510
+    },
+    {
+      "epoch": 2.3749679404975637,
+      "grad_norm": 0.6209380626678467,
+      "learning_rate": 0.000416688039668291,
+      "loss": 0.466,
+      "step": 18520
+    },
+    {
+      "epoch": 2.3762503205950245,
+      "grad_norm": 0.6486326456069946,
+      "learning_rate": 0.0004158331196033171,
+      "loss": 0.3264,
+      "step": 18530
+    },
+    {
+      "epoch": 2.3775327006924853,
+      "grad_norm": 1.1120644807815552,
+      "learning_rate": 0.00041497819953834313,
+      "loss": 0.4322,
+      "step": 18540
+    },
+    {
+      "epoch": 2.378815080789946,
+      "grad_norm": 0.805433452129364,
+      "learning_rate": 0.00041412327947336927,
+      "loss": 0.3712,
+      "step": 18550
+    },
+    {
+      "epoch": 2.380097460887407,
+      "grad_norm": 1.1664881706237793,
+      "learning_rate": 0.0004132683594083953,
+      "loss": 0.3844,
+      "step": 18560
+    },
+    {
+      "epoch": 2.381379840984868,
+      "grad_norm": 0.5431153178215027,
+      "learning_rate": 0.0004124134393434214,
+      "loss": 0.419,
+      "step": 18570
+    },
+    {
+      "epoch": 2.3826622210823287,
+      "grad_norm": 1.0935227870941162,
+      "learning_rate": 0.00041155851927844746,
+      "loss": 0.3942,
+      "step": 18580
+    },
+    {
+      "epoch": 2.3839446011797896,
+      "grad_norm": 0.9874739050865173,
+      "learning_rate": 0.00041070359921347355,
+      "loss": 0.4566,
+      "step": 18590
+    },
+    {
+      "epoch": 2.3852269812772504,
+      "grad_norm": 1.6212762594223022,
+      "learning_rate": 0.00040984867914849963,
+      "loss": 0.5304,
+      "step": 18600
+    },
+    {
+      "epoch": 2.3865093613747117,
+      "grad_norm": 0.9659703969955444,
+      "learning_rate": 0.0004089937590835257,
+      "loss": 0.3745,
+      "step": 18610
+    },
+    {
+      "epoch": 2.3877917414721725,
+      "grad_norm": 1.1413301229476929,
+      "learning_rate": 0.00040813883901855174,
+      "loss": 0.3498,
+      "step": 18620
+    },
+    {
+      "epoch": 2.3890741215696334,
+      "grad_norm": 0.9907665848731995,
+      "learning_rate": 0.0004072839189535779,
+      "loss": 0.3246,
+      "step": 18630
+    },
+    {
+      "epoch": 2.3903565016670942,
+      "grad_norm": 1.7018821239471436,
+      "learning_rate": 0.0004064289988886039,
+      "loss": 0.4562,
+      "step": 18640
+    },
+    {
+      "epoch": 2.391638881764555,
+      "grad_norm": 0.7171698808670044,
+      "learning_rate": 0.00040557407882363,
+      "loss": 0.3188,
+      "step": 18650
+    },
+    {
+      "epoch": 2.392921261862016,
+      "grad_norm": 1.6024487018585205,
+      "learning_rate": 0.00040471915875865607,
+      "loss": 0.4274,
+      "step": 18660
+    },
+    {
+      "epoch": 2.3942036419594768,
+      "grad_norm": 0.6559688448905945,
+      "learning_rate": 0.00040386423869368215,
+      "loss": 0.3052,
+      "step": 18670
+    },
+    {
+      "epoch": 2.3954860220569376,
+      "grad_norm": 0.2720082402229309,
+      "learning_rate": 0.00040300931862870823,
+      "loss": 0.309,
+      "step": 18680
+    },
+    {
+      "epoch": 2.3967684021543985,
+      "grad_norm": 1.082115650177002,
+      "learning_rate": 0.0004021543985637343,
+      "loss": 0.3961,
+      "step": 18690
+    },
+    {
+      "epoch": 2.3980507822518593,
+      "grad_norm": 1.2949116230010986,
+      "learning_rate": 0.00040129947849876034,
+      "loss": 0.4343,
+      "step": 18700
+    },
+    {
+      "epoch": 2.39933316234932,
+      "grad_norm": 1.1575446128845215,
+      "learning_rate": 0.0004004445584337865,
+      "loss": 0.3872,
+      "step": 18710
+    },
+    {
+      "epoch": 2.400615542446781,
+      "grad_norm": 1.3714033365249634,
+      "learning_rate": 0.0003995896383688125,
+      "loss": 0.403,
+      "step": 18720
+    },
+    {
+      "epoch": 2.4018979225442423,
+      "grad_norm": 0.7358514070510864,
+      "learning_rate": 0.0003987347183038386,
+      "loss": 0.3598,
+      "step": 18730
+    },
+    {
+      "epoch": 2.403180302641703,
+      "grad_norm": 0.6895415186882019,
+      "learning_rate": 0.0003978797982388647,
+      "loss": 0.3644,
+      "step": 18740
+    },
+    {
+      "epoch": 2.404462682739164,
+      "grad_norm": 0.7910656332969666,
+      "learning_rate": 0.00039702487817389076,
+      "loss": 0.3489,
+      "step": 18750
+    },
+    {
+      "epoch": 2.405745062836625,
+      "grad_norm": 0.6187024712562561,
+      "learning_rate": 0.00039616995810891684,
+      "loss": 0.4018,
+      "step": 18760
+    },
+    {
+      "epoch": 2.4070274429340857,
+      "grad_norm": 1.0988044738769531,
+      "learning_rate": 0.0003953150380439429,
+      "loss": 0.4284,
+      "step": 18770
+    },
+    {
+      "epoch": 2.4083098230315465,
+      "grad_norm": 1.2347112894058228,
+      "learning_rate": 0.00039446011797896895,
+      "loss": 0.4418,
+      "step": 18780
+    },
+    {
+      "epoch": 2.4095922031290073,
+      "grad_norm": 0.756648600101471,
+      "learning_rate": 0.0003936051979139951,
+      "loss": 0.2948,
+      "step": 18790
+    },
+    {
+      "epoch": 2.410874583226468,
+      "grad_norm": 0.7087267637252808,
+      "learning_rate": 0.0003927502778490211,
+      "loss": 0.3494,
+      "step": 18800
+    },
+    {
+      "epoch": 2.412156963323929,
+      "grad_norm": 0.8558051586151123,
+      "learning_rate": 0.00039189535778404725,
+      "loss": 0.3838,
+      "step": 18810
+    },
+    {
+      "epoch": 2.4134393434213903,
+      "grad_norm": 0.669138491153717,
+      "learning_rate": 0.0003910404377190733,
+      "loss": 0.3141,
+      "step": 18820
+    },
+    {
+      "epoch": 2.414721723518851,
+      "grad_norm": 0.7983182072639465,
+      "learning_rate": 0.00039018551765409936,
+      "loss": 0.3701,
+      "step": 18830
+    },
+    {
+      "epoch": 2.416004103616312,
+      "grad_norm": 0.9110289812088013,
+      "learning_rate": 0.00038933059758912544,
+      "loss": 0.3892,
+      "step": 18840
+    },
+    {
+      "epoch": 2.417286483713773,
+      "grad_norm": 0.7137938141822815,
+      "learning_rate": 0.0003884756775241515,
+      "loss": 0.4112,
+      "step": 18850
+    },
+    {
+      "epoch": 2.4185688638112337,
+      "grad_norm": 1.2632485628128052,
+      "learning_rate": 0.00038762075745917755,
+      "loss": 0.5279,
+      "step": 18860
+    },
+    {
+      "epoch": 2.4198512439086945,
+      "grad_norm": 0.7221540212631226,
+      "learning_rate": 0.00038676583739420364,
+      "loss": 0.3697,
+      "step": 18870
+    },
+    {
+      "epoch": 2.4211336240061554,
+      "grad_norm": 0.3167746365070343,
+      "learning_rate": 0.0003859109173292297,
+      "loss": 0.2561,
+      "step": 18880
+    },
+    {
+      "epoch": 2.4224160041036162,
+      "grad_norm": 1.2461453676223755,
+      "learning_rate": 0.0003850559972642558,
+      "loss": 0.4454,
+      "step": 18890
+    },
+    {
+      "epoch": 2.423698384201077,
+      "grad_norm": 1.2429416179656982,
+      "learning_rate": 0.0003842010771992819,
+      "loss": 0.3414,
+      "step": 18900
+    },
+    {
+      "epoch": 2.424980764298538,
+      "grad_norm": 0.8229495882987976,
+      "learning_rate": 0.0003833461571343079,
+      "loss": 0.5697,
+      "step": 18910
+    },
+    {
+      "epoch": 2.4262631443959988,
+      "grad_norm": 1.0524449348449707,
+      "learning_rate": 0.00038249123706933405,
+      "loss": 0.4613,
+      "step": 18920
+    },
+    {
+      "epoch": 2.4275455244934596,
+      "grad_norm": 1.0772918462753296,
+      "learning_rate": 0.0003816363170043601,
+      "loss": 0.3401,
+      "step": 18930
+    },
+    {
+      "epoch": 2.428827904590921,
+      "grad_norm": 1.0349977016448975,
+      "learning_rate": 0.00038078139693938616,
+      "loss": 0.4301,
+      "step": 18940
+    },
+    {
+      "epoch": 2.4301102846883817,
+      "grad_norm": 1.188043236732483,
+      "learning_rate": 0.00037992647687441224,
+      "loss": 0.4321,
+      "step": 18950
+    },
+    {
+      "epoch": 2.4313926647858426,
+      "grad_norm": 0.5111313462257385,
+      "learning_rate": 0.0003790715568094383,
+      "loss": 0.406,
+      "step": 18960
+    },
+    {
+      "epoch": 2.4326750448833034,
+      "grad_norm": 0.7800171375274658,
+      "learning_rate": 0.0003782166367444644,
+      "loss": 0.336,
+      "step": 18970
+    },
+    {
+      "epoch": 2.4339574249807643,
+      "grad_norm": 1.0893301963806152,
+      "learning_rate": 0.0003773617166794905,
+      "loss": 0.3267,
+      "step": 18980
+    },
+    {
+      "epoch": 2.435239805078225,
+      "grad_norm": 1.028470754623413,
+      "learning_rate": 0.0003765067966145165,
+      "loss": 0.3977,
+      "step": 18990
+    },
+    {
+      "epoch": 2.436522185175686,
+      "grad_norm": 1.0852724313735962,
+      "learning_rate": 0.00037565187654954265,
+      "loss": 0.3528,
+      "step": 19000
+    },
+    {
+      "epoch": 2.437804565273147,
+      "grad_norm": 0.8436377644538879,
+      "learning_rate": 0.0003747969564845687,
+      "loss": 0.4934,
+      "step": 19010
+    },
+    {
+      "epoch": 2.4390869453706077,
+      "grad_norm": 0.8028691411018372,
+      "learning_rate": 0.00037394203641959476,
+      "loss": 0.2514,
+      "step": 19020
+    },
+    {
+      "epoch": 2.440369325468069,
+      "grad_norm": 0.6978164911270142,
+      "learning_rate": 0.00037308711635462084,
+      "loss": 0.2909,
+      "step": 19030
+    },
+    {
+      "epoch": 2.44165170556553,
+      "grad_norm": 0.9961578249931335,
+      "learning_rate": 0.0003722321962896469,
+      "loss": 0.5097,
+      "step": 19040
+    },
+    {
+      "epoch": 2.4429340856629906,
+      "grad_norm": 0.8044784069061279,
+      "learning_rate": 0.000371377276224673,
+      "loss": 0.3696,
+      "step": 19050
+    },
+    {
+      "epoch": 2.4442164657604515,
+      "grad_norm": 0.9142523407936096,
+      "learning_rate": 0.0003705223561596991,
+      "loss": 0.2975,
+      "step": 19060
+    },
+    {
+      "epoch": 2.4454988458579123,
+      "grad_norm": 0.6743261814117432,
+      "learning_rate": 0.0003696674360947251,
+      "loss": 0.3438,
+      "step": 19070
+    },
+    {
+      "epoch": 2.446781225955373,
+      "grad_norm": 0.9086779356002808,
+      "learning_rate": 0.00036881251602975126,
+      "loss": 0.3507,
+      "step": 19080
+    },
+    {
+      "epoch": 2.448063606052834,
+      "grad_norm": 0.8643527030944824,
+      "learning_rate": 0.0003679575959647773,
+      "loss": 0.4654,
+      "step": 19090
+    },
+    {
+      "epoch": 2.449345986150295,
+      "grad_norm": 0.6658887267112732,
+      "learning_rate": 0.0003671026758998034,
+      "loss": 0.2952,
+      "step": 19100
+    },
+    {
+      "epoch": 2.4506283662477557,
+      "grad_norm": 1.4154678583145142,
+      "learning_rate": 0.00036624775583482945,
+      "loss": 0.4116,
+      "step": 19110
+    },
+    {
+      "epoch": 2.4519107463452166,
+      "grad_norm": 0.9834240674972534,
+      "learning_rate": 0.00036539283576985553,
+      "loss": 0.4582,
+      "step": 19120
+    },
+    {
+      "epoch": 2.4531931264426774,
+      "grad_norm": 1.1444348096847534,
+      "learning_rate": 0.0003645379157048816,
+      "loss": 0.4559,
+      "step": 19130
+    },
+    {
+      "epoch": 2.4544755065401387,
+      "grad_norm": 1.2544337511062622,
+      "learning_rate": 0.0003636829956399077,
+      "loss": 0.3877,
+      "step": 19140
+    },
+    {
+      "epoch": 2.4557578866375995,
+      "grad_norm": 0.7545201182365417,
+      "learning_rate": 0.0003628280755749337,
+      "loss": 0.3751,
+      "step": 19150
+    },
+    {
+      "epoch": 2.4570402667350604,
+      "grad_norm": 1.476630449295044,
+      "learning_rate": 0.00036197315550995986,
+      "loss": 0.5416,
+      "step": 19160
+    },
+    {
+      "epoch": 2.458322646832521,
+      "grad_norm": 0.867030143737793,
+      "learning_rate": 0.0003611182354449859,
+      "loss": 0.3719,
+      "step": 19170
+    },
+    {
+      "epoch": 2.459605026929982,
+      "grad_norm": 0.511754035949707,
+      "learning_rate": 0.000360263315380012,
+      "loss": 0.41,
+      "step": 19180
+    },
+    {
+      "epoch": 2.460887407027443,
+      "grad_norm": 1.1626338958740234,
+      "learning_rate": 0.00035940839531503805,
+      "loss": 0.4129,
+      "step": 19190
+    },
+    {
+      "epoch": 2.4621697871249038,
+      "grad_norm": 0.35824307799339294,
+      "learning_rate": 0.00035855347525006414,
+      "loss": 0.319,
+      "step": 19200
+    },
+    {
+      "epoch": 2.4634521672223646,
+      "grad_norm": 1.2998716831207275,
+      "learning_rate": 0.0003576985551850902,
+      "loss": 0.3915,
+      "step": 19210
+    },
+    {
+      "epoch": 2.4647345473198254,
+      "grad_norm": 0.6478980183601379,
+      "learning_rate": 0.00035684363512011625,
+      "loss": 0.3292,
+      "step": 19220
+    },
+    {
+      "epoch": 2.4660169274172867,
+      "grad_norm": 1.1961947679519653,
+      "learning_rate": 0.00035598871505514233,
+      "loss": 0.4412,
+      "step": 19230
+    },
+    {
+      "epoch": 2.4672993075147476,
+      "grad_norm": 0.7244174480438232,
+      "learning_rate": 0.0003551337949901684,
+      "loss": 0.3171,
+      "step": 19240
+    },
+    {
+      "epoch": 2.4685816876122084,
+      "grad_norm": 0.6592457294464111,
+      "learning_rate": 0.0003542788749251945,
+      "loss": 0.3354,
+      "step": 19250
+    },
+    {
+      "epoch": 2.4698640677096693,
+      "grad_norm": 0.946502685546875,
+      "learning_rate": 0.0003534239548602206,
+      "loss": 0.3961,
+      "step": 19260
+    },
+    {
+      "epoch": 2.47114644780713,
+      "grad_norm": 0.8770771026611328,
+      "learning_rate": 0.00035256903479524666,
+      "loss": 0.4879,
+      "step": 19270
+    },
+    {
+      "epoch": 2.472428827904591,
+      "grad_norm": 0.7424082159996033,
+      "learning_rate": 0.0003517141147302727,
+      "loss": 0.3845,
+      "step": 19280
+    },
+    {
+      "epoch": 2.473711208002052,
+      "grad_norm": 0.8747217655181885,
+      "learning_rate": 0.0003508591946652988,
+      "loss": 0.3393,
+      "step": 19290
+    },
+    {
+      "epoch": 2.4749935880995126,
+      "grad_norm": 1.3483731746673584,
+      "learning_rate": 0.00035000427460032485,
+      "loss": 0.3517,
+      "step": 19300
+    },
+    {
+      "epoch": 2.4762759681969735,
+      "grad_norm": 0.5340741276741028,
+      "learning_rate": 0.00034914935453535093,
+      "loss": 0.4246,
+      "step": 19310
+    },
+    {
+      "epoch": 2.4775583482944343,
+      "grad_norm": 1.0605217218399048,
+      "learning_rate": 0.000348294434470377,
+      "loss": 0.5212,
+      "step": 19320
+    },
+    {
+      "epoch": 2.478840728391895,
+      "grad_norm": 1.1678279638290405,
+      "learning_rate": 0.0003474395144054031,
+      "loss": 0.3448,
+      "step": 19330
+    },
+    {
+      "epoch": 2.480123108489356,
+      "grad_norm": 1.3842048645019531,
+      "learning_rate": 0.0003465845943404292,
+      "loss": 0.4713,
+      "step": 19340
+    },
+    {
+      "epoch": 2.4814054885868173,
+      "grad_norm": 0.9531245231628418,
+      "learning_rate": 0.00034572967427545526,
+      "loss": 0.4228,
+      "step": 19350
+    },
+    {
+      "epoch": 2.482687868684278,
+      "grad_norm": 1.5676864385604858,
+      "learning_rate": 0.0003448747542104813,
+      "loss": 0.397,
+      "step": 19360
+    },
+    {
+      "epoch": 2.483970248781739,
+      "grad_norm": 0.8071860671043396,
+      "learning_rate": 0.00034401983414550743,
+      "loss": 0.3884,
+      "step": 19370
+    },
+    {
+      "epoch": 2.4852526288792,
+      "grad_norm": 1.1921252012252808,
+      "learning_rate": 0.00034316491408053346,
+      "loss": 0.3698,
+      "step": 19380
+    },
+    {
+      "epoch": 2.4865350089766607,
+      "grad_norm": 0.7575945854187012,
+      "learning_rate": 0.0003423099940155596,
+      "loss": 0.4962,
+      "step": 19390
+    },
+    {
+      "epoch": 2.4878173890741215,
+      "grad_norm": 0.9211723804473877,
+      "learning_rate": 0.0003414550739505856,
+      "loss": 0.4712,
+      "step": 19400
+    },
+    {
+      "epoch": 2.4890997691715824,
+      "grad_norm": 1.3572173118591309,
+      "learning_rate": 0.0003406001538856117,
+      "loss": 0.3369,
+      "step": 19410
+    },
+    {
+      "epoch": 2.490382149269043,
+      "grad_norm": 0.8064128160476685,
+      "learning_rate": 0.0003397452338206378,
+      "loss": 0.3977,
+      "step": 19420
+    },
+    {
+      "epoch": 2.491664529366504,
+      "grad_norm": 0.708720326423645,
+      "learning_rate": 0.00033889031375566387,
+      "loss": 0.4332,
+      "step": 19430
+    },
+    {
+      "epoch": 2.4929469094639654,
+      "grad_norm": 0.34566161036491394,
+      "learning_rate": 0.0003380353936906899,
+      "loss": 0.366,
+      "step": 19440
+    },
+    {
+      "epoch": 2.494229289561426,
+      "grad_norm": 0.815828263759613,
+      "learning_rate": 0.00033718047362571603,
+      "loss": 0.4021,
+      "step": 19450
+    },
+    {
+      "epoch": 2.495511669658887,
+      "grad_norm": 0.8650433421134949,
+      "learning_rate": 0.00033632555356074206,
+      "loss": 0.3124,
+      "step": 19460
+    },
+    {
+      "epoch": 2.496794049756348,
+      "grad_norm": 1.2092469930648804,
+      "learning_rate": 0.0003354706334957682,
+      "loss": 0.4188,
+      "step": 19470
+    },
+    {
+      "epoch": 2.4980764298538087,
+      "grad_norm": 0.8805145025253296,
+      "learning_rate": 0.0003346157134307942,
+      "loss": 0.3866,
+      "step": 19480
+    },
+    {
+      "epoch": 2.4993588099512696,
+      "grad_norm": 0.9097617864608765,
+      "learning_rate": 0.0003337607933658203,
+      "loss": 0.3674,
+      "step": 19490
+    },
+    {
+      "epoch": 2.5006411900487304,
+      "grad_norm": 0.8548180460929871,
+      "learning_rate": 0.0003329058733008464,
+      "loss": 0.4066,
+      "step": 19500
+    },
+    {
+      "epoch": 2.5019235701461913,
+      "grad_norm": 0.5404782295227051,
+      "learning_rate": 0.00033205095323587247,
+      "loss": 0.3742,
+      "step": 19510
+    },
+    {
+      "epoch": 2.503205950243652,
+      "grad_norm": 0.4802301526069641,
+      "learning_rate": 0.0003311960331708985,
+      "loss": 0.3414,
+      "step": 19520
+    },
+    {
+      "epoch": 2.504488330341113,
+      "grad_norm": 0.5459701418876648,
+      "learning_rate": 0.00033034111310592464,
+      "loss": 0.3094,
+      "step": 19530
+    },
+    {
+      "epoch": 2.505770710438574,
+      "grad_norm": 1.0268832445144653,
+      "learning_rate": 0.00032948619304095067,
+      "loss": 0.3653,
+      "step": 19540
+    },
+    {
+      "epoch": 2.5070530905360346,
+      "grad_norm": 1.0585857629776,
+      "learning_rate": 0.0003286312729759768,
+      "loss": 0.4984,
+      "step": 19550
+    },
+    {
+      "epoch": 2.508335470633496,
+      "grad_norm": 0.943658709526062,
+      "learning_rate": 0.00032777635291100283,
+      "loss": 0.3321,
+      "step": 19560
+    },
+    {
+      "epoch": 2.509617850730957,
+      "grad_norm": 1.1988105773925781,
+      "learning_rate": 0.00032692143284602886,
+      "loss": 0.3189,
+      "step": 19570
+    },
+    {
+      "epoch": 2.5109002308284176,
+      "grad_norm": 1.466678261756897,
+      "learning_rate": 0.000326066512781055,
+      "loss": 0.4272,
+      "step": 19580
+    },
+    {
+      "epoch": 2.5121826109258785,
+      "grad_norm": 0.9461327791213989,
+      "learning_rate": 0.000325211592716081,
+      "loss": 0.5022,
+      "step": 19590
+    },
+    {
+      "epoch": 2.5134649910233393,
+      "grad_norm": 0.9493967294692993,
+      "learning_rate": 0.0003243566726511071,
+      "loss": 0.2942,
+      "step": 19600
+    },
+    {
+      "epoch": 2.5147473711208,
+      "grad_norm": 0.6060981154441833,
+      "learning_rate": 0.0003235017525861332,
+      "loss": 0.3608,
+      "step": 19610
+    },
+    {
+      "epoch": 2.516029751218261,
+      "grad_norm": 1.081632137298584,
+      "learning_rate": 0.00032264683252115927,
+      "loss": 0.3932,
+      "step": 19620
+    },
+    {
+      "epoch": 2.517312131315722,
+      "grad_norm": 0.272013396024704,
+      "learning_rate": 0.00032179191245618535,
+      "loss": 0.3491,
+      "step": 19630
+    },
+    {
+      "epoch": 2.518594511413183,
+      "grad_norm": 0.7338408827781677,
+      "learning_rate": 0.00032093699239121144,
+      "loss": 0.4811,
+      "step": 19640
+    },
+    {
+      "epoch": 2.519876891510644,
+      "grad_norm": 0.6062107086181641,
+      "learning_rate": 0.00032008207232623746,
+      "loss": 0.3817,
+      "step": 19650
+    },
+    {
+      "epoch": 2.521159271608105,
+      "grad_norm": 1.2783069610595703,
+      "learning_rate": 0.0003192271522612636,
+      "loss": 0.3512,
+      "step": 19660
+    },
+    {
+      "epoch": 2.5224416517055657,
+      "grad_norm": 1.2621718645095825,
+      "learning_rate": 0.00031837223219628963,
+      "loss": 0.416,
+      "step": 19670
+    },
+    {
+      "epoch": 2.5237240318030265,
+      "grad_norm": 0.6203981637954712,
+      "learning_rate": 0.00031751731213131576,
+      "loss": 0.307,
+      "step": 19680
+    },
+    {
+      "epoch": 2.5250064119004874,
+      "grad_norm": 0.8723649978637695,
+      "learning_rate": 0.0003166623920663418,
+      "loss": 0.3647,
+      "step": 19690
+    },
+    {
+      "epoch": 2.526288791997948,
+      "grad_norm": 0.887333333492279,
+      "learning_rate": 0.0003158074720013679,
+      "loss": 0.5004,
+      "step": 19700
+    },
+    {
+      "epoch": 2.527571172095409,
+      "grad_norm": 0.40670618414878845,
+      "learning_rate": 0.00031495255193639396,
+      "loss": 0.3769,
+      "step": 19710
+    },
+    {
+      "epoch": 2.52885355219287,
+      "grad_norm": 0.5381103157997131,
+      "learning_rate": 0.00031409763187142004,
+      "loss": 0.259,
+      "step": 19720
+    },
+    {
+      "epoch": 2.5301359322903307,
+      "grad_norm": 0.7360714673995972,
+      "learning_rate": 0.00031324271180644607,
+      "loss": 0.4182,
+      "step": 19730
+    },
+    {
+      "epoch": 2.5314183123877916,
+      "grad_norm": 0.8640091419219971,
+      "learning_rate": 0.0003123877917414722,
+      "loss": 0.3245,
+      "step": 19740
+    },
+    {
+      "epoch": 2.5327006924852524,
+      "grad_norm": 0.5540979504585266,
+      "learning_rate": 0.00031153287167649823,
+      "loss": 0.3386,
+      "step": 19750
+    },
+    {
+      "epoch": 2.5339830725827133,
+      "grad_norm": 0.7436388731002808,
+      "learning_rate": 0.00031067795161152437,
+      "loss": 0.3423,
+      "step": 19760
+    },
+    {
+      "epoch": 2.5352654526801746,
+      "grad_norm": 0.657111644744873,
+      "learning_rate": 0.0003098230315465504,
+      "loss": 0.5047,
+      "step": 19770
+    },
+    {
+      "epoch": 2.5365478327776354,
+      "grad_norm": 0.8611753582954407,
+      "learning_rate": 0.0003089681114815765,
+      "loss": 0.4797,
+      "step": 19780
+    },
+    {
+      "epoch": 2.5378302128750962,
+      "grad_norm": 0.834993302822113,
+      "learning_rate": 0.00030811319141660256,
+      "loss": 0.3757,
+      "step": 19790
+    },
+    {
+      "epoch": 2.539112592972557,
+      "grad_norm": 1.335398554801941,
+      "learning_rate": 0.00030725827135162864,
+      "loss": 0.4135,
+      "step": 19800
+    },
+    {
+      "epoch": 2.540394973070018,
+      "grad_norm": 0.5498932600021362,
+      "learning_rate": 0.00030640335128665467,
+      "loss": 0.4223,
+      "step": 19810
+    },
+    {
+      "epoch": 2.541677353167479,
+      "grad_norm": 0.754500150680542,
+      "learning_rate": 0.0003055484312216808,
+      "loss": 0.3339,
+      "step": 19820
+    },
+    {
+      "epoch": 2.5429597332649396,
+      "grad_norm": 1.278773307800293,
+      "learning_rate": 0.00030469351115670684,
+      "loss": 0.5497,
+      "step": 19830
+    },
+    {
+      "epoch": 2.5442421133624005,
+      "grad_norm": 0.549717903137207,
+      "learning_rate": 0.000303838591091733,
+      "loss": 0.4507,
+      "step": 19840
+    },
+    {
+      "epoch": 2.5455244934598618,
+      "grad_norm": 0.7708590626716614,
+      "learning_rate": 0.000302983671026759,
+      "loss": 0.363,
+      "step": 19850
+    },
+    {
+      "epoch": 2.5468068735573226,
+      "grad_norm": 0.4803219437599182,
+      "learning_rate": 0.0003021287509617851,
+      "loss": 0.2542,
+      "step": 19860
+    },
+    {
+      "epoch": 2.5480892536547834,
+      "grad_norm": 0.9697148203849792,
+      "learning_rate": 0.00030127383089681117,
+      "loss": 0.3473,
+      "step": 19870
+    },
+    {
+      "epoch": 2.5493716337522443,
+      "grad_norm": 1.0347312688827515,
+      "learning_rate": 0.00030041891083183725,
+      "loss": 0.3943,
+      "step": 19880
+    },
+    {
+      "epoch": 2.550654013849705,
+      "grad_norm": 0.8918094635009766,
+      "learning_rate": 0.0002995639907668633,
+      "loss": 0.295,
+      "step": 19890
+    },
+    {
+      "epoch": 2.551936393947166,
+      "grad_norm": 0.8626148700714111,
+      "learning_rate": 0.0002987090707018894,
+      "loss": 0.3919,
+      "step": 19900
+    },
+    {
+      "epoch": 2.553218774044627,
+      "grad_norm": 1.0296040773391724,
+      "learning_rate": 0.00029785415063691544,
+      "loss": 0.3154,
+      "step": 19910
+    },
+    {
+      "epoch": 2.5545011541420877,
+      "grad_norm": 0.8652689456939697,
+      "learning_rate": 0.0002969992305719415,
+      "loss": 0.4078,
+      "step": 19920
+    },
+    {
+      "epoch": 2.5557835342395485,
+      "grad_norm": 0.6881958246231079,
+      "learning_rate": 0.0002961443105069676,
+      "loss": 0.3029,
+      "step": 19930
+    },
+    {
+      "epoch": 2.5570659143370094,
+      "grad_norm": 0.627172589302063,
+      "learning_rate": 0.00029528939044199363,
+      "loss": 0.3965,
+      "step": 19940
+    },
+    {
+      "epoch": 2.55834829443447,
+      "grad_norm": 0.9632807970046997,
+      "learning_rate": 0.00029443447037701977,
+      "loss": 0.3817,
+      "step": 19950
+    },
+    {
+      "epoch": 2.559630674531931,
+      "grad_norm": 0.7820205688476562,
+      "learning_rate": 0.0002935795503120458,
+      "loss": 0.2984,
+      "step": 19960
+    },
+    {
+      "epoch": 2.560913054629392,
+      "grad_norm": 0.7165734767913818,
+      "learning_rate": 0.00029272463024707194,
+      "loss": 0.3589,
+      "step": 19970
+    },
+    {
+      "epoch": 2.562195434726853,
+      "grad_norm": 0.43464410305023193,
+      "learning_rate": 0.00029186971018209796,
+      "loss": 0.2894,
+      "step": 19980
+    },
+    {
+      "epoch": 2.563477814824314,
+      "grad_norm": 0.7545332312583923,
+      "learning_rate": 0.00029101479011712405,
+      "loss": 0.3861,
+      "step": 19990
+    },
+    {
+      "epoch": 2.564760194921775,
+      "grad_norm": 0.6315227746963501,
+      "learning_rate": 0.00029015987005215013,
+      "loss": 0.3933,
+      "step": 20000
+    },
+    {
+      "epoch": 2.5660425750192357,
+      "grad_norm": 0.8390231132507324,
+      "learning_rate": 0.0002893049499871762,
+      "loss": 0.4576,
+      "step": 20010
+    },
+    {
+      "epoch": 2.5673249551166966,
+      "grad_norm": 1.075249433517456,
+      "learning_rate": 0.00028845002992220224,
+      "loss": 0.394,
+      "step": 20020
+    },
+    {
+      "epoch": 2.5686073352141574,
+      "grad_norm": 0.9567833542823792,
+      "learning_rate": 0.0002875951098572284,
+      "loss": 0.3557,
+      "step": 20030
+    },
+    {
+      "epoch": 2.5698897153116183,
+      "grad_norm": 1.3885024785995483,
+      "learning_rate": 0.0002867401897922544,
+      "loss": 0.367,
+      "step": 20040
+    },
+    {
+      "epoch": 2.571172095409079,
+      "grad_norm": 0.8868879079818726,
+      "learning_rate": 0.00028588526972728054,
+      "loss": 0.3346,
+      "step": 20050
+    },
+    {
+      "epoch": 2.5724544755065404,
+      "grad_norm": 0.37503331899642944,
+      "learning_rate": 0.00028503034966230657,
+      "loss": 0.3142,
+      "step": 20060
+    },
+    {
+      "epoch": 2.5737368556040012,
+      "grad_norm": 1.0467469692230225,
+      "learning_rate": 0.00028417542959733265,
+      "loss": 0.379,
+      "step": 20070
+    },
+    {
+      "epoch": 2.575019235701462,
+      "grad_norm": 1.1559125185012817,
+      "learning_rate": 0.00028332050953235873,
+      "loss": 0.3753,
+      "step": 20080
+    },
+    {
+      "epoch": 2.576301615798923,
+      "grad_norm": 0.7905515432357788,
+      "learning_rate": 0.0002824655894673848,
+      "loss": 0.3534,
+      "step": 20090
+    },
+    {
+      "epoch": 2.5775839958963838,
+      "grad_norm": 0.44288355112075806,
+      "learning_rate": 0.00028161066940241084,
+      "loss": 0.3416,
+      "step": 20100
+    },
+    {
+      "epoch": 2.5788663759938446,
+      "grad_norm": 0.7493765950202942,
+      "learning_rate": 0.000280755749337437,
+      "loss": 0.4072,
+      "step": 20110
+    },
+    {
+      "epoch": 2.5801487560913055,
+      "grad_norm": 0.42998605966567993,
+      "learning_rate": 0.000279900829272463,
+      "loss": 0.317,
+      "step": 20120
+    },
+    {
+      "epoch": 2.5814311361887663,
+      "grad_norm": 1.049352765083313,
+      "learning_rate": 0.00027904590920748915,
+      "loss": 0.4015,
+      "step": 20130
+    },
+    {
+      "epoch": 2.582713516286227,
+      "grad_norm": 0.5475008487701416,
+      "learning_rate": 0.0002781909891425152,
+      "loss": 0.3598,
+      "step": 20140
+    },
+    {
+      "epoch": 2.583995896383688,
+      "grad_norm": 0.8483502864837646,
+      "learning_rate": 0.00027733606907754126,
+      "loss": 0.4564,
+      "step": 20150
+    },
+    {
+      "epoch": 2.585278276481149,
+      "grad_norm": 1.3677246570587158,
+      "learning_rate": 0.00027648114901256734,
+      "loss": 0.4583,
+      "step": 20160
+    },
+    {
+      "epoch": 2.5865606565786097,
+      "grad_norm": 1.5475443601608276,
+      "learning_rate": 0.0002756262289475934,
+      "loss": 0.4106,
+      "step": 20170
+    },
+    {
+      "epoch": 2.5878430366760705,
+      "grad_norm": 0.5748480558395386,
+      "learning_rate": 0.00027477130888261945,
+      "loss": 0.416,
+      "step": 20180
+    },
+    {
+      "epoch": 2.589125416773532,
+      "grad_norm": 0.9539164304733276,
+      "learning_rate": 0.0002739163888176456,
+      "loss": 0.4095,
+      "step": 20190
+    },
+    {
+      "epoch": 2.5904077968709927,
+      "grad_norm": 0.8380826115608215,
+      "learning_rate": 0.0002730614687526716,
+      "loss": 0.404,
+      "step": 20200
+    },
+    {
+      "epoch": 2.5916901769684535,
+      "grad_norm": 1.1457738876342773,
+      "learning_rate": 0.00027220654868769775,
+      "loss": 0.478,
+      "step": 20210
+    },
+    {
+      "epoch": 2.5929725570659143,
+      "grad_norm": 0.5963801741600037,
+      "learning_rate": 0.0002713516286227238,
+      "loss": 0.3425,
+      "step": 20220
+    },
+    {
+      "epoch": 2.594254937163375,
+      "grad_norm": 1.2528159618377686,
+      "learning_rate": 0.00027049670855774986,
+      "loss": 0.4059,
+      "step": 20230
+    },
+    {
+      "epoch": 2.595537317260836,
+      "grad_norm": 1.1081477403640747,
+      "learning_rate": 0.00026964178849277594,
+      "loss": 0.2923,
+      "step": 20240
+    },
+    {
+      "epoch": 2.596819697358297,
+      "grad_norm": 1.046190857887268,
+      "learning_rate": 0.000268786868427802,
+      "loss": 0.3084,
+      "step": 20250
+    },
+    {
+      "epoch": 2.598102077455758,
+      "grad_norm": 0.7045506238937378,
+      "learning_rate": 0.0002679319483628281,
+      "loss": 0.3575,
+      "step": 20260
+    },
+    {
+      "epoch": 2.599384457553219,
+      "grad_norm": 0.8695869445800781,
+      "learning_rate": 0.00026707702829785414,
+      "loss": 0.4175,
+      "step": 20270
+    },
+    {
+      "epoch": 2.60066683765068,
+      "grad_norm": 0.9905348420143127,
+      "learning_rate": 0.0002662221082328802,
+      "loss": 0.4376,
+      "step": 20280
+    },
+    {
+      "epoch": 2.6019492177481407,
+      "grad_norm": 1.3747539520263672,
+      "learning_rate": 0.0002653671881679063,
+      "loss": 0.4679,
+      "step": 20290
+    },
+    {
+      "epoch": 2.6032315978456015,
+      "grad_norm": 1.023525595664978,
+      "learning_rate": 0.0002645122681029324,
+      "loss": 0.4441,
+      "step": 20300
+    },
+    {
+      "epoch": 2.6045139779430624,
+      "grad_norm": 0.8504759669303894,
+      "learning_rate": 0.0002636573480379584,
+      "loss": 0.539,
+      "step": 20310
+    },
+    {
+      "epoch": 2.6057963580405232,
+      "grad_norm": 0.48631325364112854,
+      "learning_rate": 0.00026280242797298455,
+      "loss": 0.5464,
+      "step": 20320
+    },
+    {
+      "epoch": 2.607078738137984,
+      "grad_norm": 0.42857420444488525,
+      "learning_rate": 0.0002619475079080106,
+      "loss": 0.3781,
+      "step": 20330
+    },
+    {
+      "epoch": 2.608361118235445,
+      "grad_norm": 0.6672760844230652,
+      "learning_rate": 0.0002610925878430367,
+      "loss": 0.4347,
+      "step": 20340
+    },
+    {
+      "epoch": 2.6096434983329058,
+      "grad_norm": 0.5698977112770081,
+      "learning_rate": 0.00026023766777806274,
+      "loss": 0.4583,
+      "step": 20350
+    },
+    {
+      "epoch": 2.6109258784303666,
+      "grad_norm": 1.0976148843765259,
+      "learning_rate": 0.0002593827477130888,
+      "loss": 0.3995,
+      "step": 20360
+    },
+    {
+      "epoch": 2.6122082585278275,
+      "grad_norm": 1.1578220129013062,
+      "learning_rate": 0.0002585278276481149,
+      "loss": 0.3532,
+      "step": 20370
+    },
+    {
+      "epoch": 2.6134906386252883,
+      "grad_norm": 1.0207488536834717,
+      "learning_rate": 0.000257672907583141,
+      "loss": 0.4577,
+      "step": 20380
+    },
+    {
+      "epoch": 2.6147730187227496,
+      "grad_norm": 1.2871861457824707,
+      "learning_rate": 0.000256817987518167,
+      "loss": 0.336,
+      "step": 20390
+    },
+    {
+      "epoch": 2.6160553988202104,
+      "grad_norm": 0.5854607224464417,
+      "learning_rate": 0.00025596306745319315,
+      "loss": 0.3488,
+      "step": 20400
+    },
+    {
+      "epoch": 2.6173377789176713,
+      "grad_norm": 1.5783365964889526,
+      "learning_rate": 0.0002551081473882192,
+      "loss": 0.4988,
+      "step": 20410
+    },
+    {
+      "epoch": 2.618620159015132,
+      "grad_norm": 1.0990679264068604,
+      "learning_rate": 0.0002542532273232453,
+      "loss": 0.4439,
+      "step": 20420
+    },
+    {
+      "epoch": 2.619902539112593,
+      "grad_norm": 0.5611817836761475,
+      "learning_rate": 0.00025339830725827135,
+      "loss": 0.2935,
+      "step": 20430
+    },
+    {
+      "epoch": 2.621184919210054,
+      "grad_norm": 1.3916196823120117,
+      "learning_rate": 0.00025254338719329743,
+      "loss": 0.3949,
+      "step": 20440
+    },
+    {
+      "epoch": 2.6224672993075147,
+      "grad_norm": 0.7436792254447937,
+      "learning_rate": 0.0002516884671283235,
+      "loss": 0.444,
+      "step": 20450
+    },
+    {
+      "epoch": 2.6237496794049755,
+      "grad_norm": 1.4927194118499756,
+      "learning_rate": 0.0002508335470633496,
+      "loss": 0.384,
+      "step": 20460
+    },
+    {
+      "epoch": 2.625032059502437,
+      "grad_norm": 1.047260046005249,
+      "learning_rate": 0.0002499786269983756,
+      "loss": 0.3375,
+      "step": 20470
+    },
+    {
+      "epoch": 2.6263144395998976,
+      "grad_norm": 0.9535210728645325,
+      "learning_rate": 0.0002491237069334017,
+      "loss": 0.3326,
+      "step": 20480
+    },
+    {
+      "epoch": 2.6275968196973585,
+      "grad_norm": 1.1021919250488281,
+      "learning_rate": 0.0002482687868684278,
+      "loss": 0.4592,
+      "step": 20490
+    },
+    {
+      "epoch": 2.6288791997948193,
+      "grad_norm": 0.6787020564079285,
+      "learning_rate": 0.00024741386680345387,
+      "loss": 0.3567,
+      "step": 20500
+    },
+    {
+      "epoch": 2.63016157989228,
+      "grad_norm": 0.5073117017745972,
+      "learning_rate": 0.00024655894673847995,
+      "loss": 0.2802,
+      "step": 20510
+    },
+    {
+      "epoch": 2.631443959989741,
+      "grad_norm": 0.7730292677879333,
+      "learning_rate": 0.00024570402667350603,
+      "loss": 0.3604,
+      "step": 20520
+    },
+    {
+      "epoch": 2.632726340087202,
+      "grad_norm": 1.1327155828475952,
+      "learning_rate": 0.0002448491066085321,
+      "loss": 0.4408,
+      "step": 20530
+    },
+    {
+      "epoch": 2.6340087201846627,
+      "grad_norm": 0.8838372826576233,
+      "learning_rate": 0.00024399418654355817,
+      "loss": 0.5678,
+      "step": 20540
+    },
+    {
+      "epoch": 2.6352911002821235,
+      "grad_norm": 0.5180802345275879,
+      "learning_rate": 0.00024313926647858425,
+      "loss": 0.3285,
+      "step": 20550
+    },
+    {
+      "epoch": 2.6365734803795844,
+      "grad_norm": 0.879054605960846,
+      "learning_rate": 0.00024228434641361033,
+      "loss": 0.4184,
+      "step": 20560
+    },
+    {
+      "epoch": 2.6378558604770452,
+      "grad_norm": 0.9276881814002991,
+      "learning_rate": 0.00024142942634863642,
+      "loss": 0.3412,
+      "step": 20570
+    },
+    {
+      "epoch": 2.639138240574506,
+      "grad_norm": 1.4996106624603271,
+      "learning_rate": 0.00024057450628366247,
+      "loss": 0.3899,
+      "step": 20580
+    },
+    {
+      "epoch": 2.640420620671967,
+      "grad_norm": 1.0205820798873901,
+      "learning_rate": 0.00023971958621868855,
+      "loss": 0.3253,
+      "step": 20590
+    },
+    {
+      "epoch": 2.641703000769428,
+      "grad_norm": 1.2541202306747437,
+      "learning_rate": 0.00023886466615371464,
+      "loss": 0.579,
+      "step": 20600
+    },
+    {
+      "epoch": 2.642985380866889,
+      "grad_norm": 1.1668142080307007,
+      "learning_rate": 0.00023800974608874072,
+      "loss": 0.4951,
+      "step": 20610
+    },
+    {
+      "epoch": 2.64426776096435,
+      "grad_norm": 0.9040181636810303,
+      "learning_rate": 0.00023715482602376677,
+      "loss": 0.3967,
+      "step": 20620
+    },
+    {
+      "epoch": 2.6455501410618107,
+      "grad_norm": 1.3997057676315308,
+      "learning_rate": 0.00023629990595879286,
+      "loss": 0.414,
+      "step": 20630
+    },
+    {
+      "epoch": 2.6468325211592716,
+      "grad_norm": 0.3811419904232025,
+      "learning_rate": 0.00023544498589381894,
+      "loss": 0.4728,
+      "step": 20640
+    },
+    {
+      "epoch": 2.6481149012567324,
+      "grad_norm": 0.7340693473815918,
+      "learning_rate": 0.00023459006582884502,
+      "loss": 0.4724,
+      "step": 20650
+    },
+    {
+      "epoch": 2.6493972813541933,
+      "grad_norm": 0.602635383605957,
+      "learning_rate": 0.00023373514576387108,
+      "loss": 0.305,
+      "step": 20660
+    },
+    {
+      "epoch": 2.650679661451654,
+      "grad_norm": 1.357358694076538,
+      "learning_rate": 0.00023288022569889716,
+      "loss": 0.3615,
+      "step": 20670
+    },
+    {
+      "epoch": 2.6519620415491154,
+      "grad_norm": 1.631966233253479,
+      "learning_rate": 0.00023202530563392324,
+      "loss": 0.3992,
+      "step": 20680
+    },
+    {
+      "epoch": 2.6532444216465763,
+      "grad_norm": 1.3770554065704346,
+      "learning_rate": 0.00023117038556894932,
+      "loss": 0.4558,
+      "step": 20690
+    },
+    {
+      "epoch": 2.654526801744037,
+      "grad_norm": 1.2345914840698242,
+      "learning_rate": 0.00023031546550397538,
+      "loss": 0.53,
+      "step": 20700
+    },
+    {
+      "epoch": 2.655809181841498,
+      "grad_norm": 0.8587237000465393,
+      "learning_rate": 0.00022946054543900146,
+      "loss": 0.326,
+      "step": 20710
+    },
+    {
+      "epoch": 2.657091561938959,
+      "grad_norm": 0.5960670709609985,
+      "learning_rate": 0.00022860562537402754,
+      "loss": 0.321,
+      "step": 20720
+    },
+    {
+      "epoch": 2.6583739420364196,
+      "grad_norm": 0.8732848763465881,
+      "learning_rate": 0.00022775070530905363,
+      "loss": 0.3383,
+      "step": 20730
+    },
+    {
+      "epoch": 2.6596563221338805,
+      "grad_norm": 1.1003626585006714,
+      "learning_rate": 0.00022689578524407968,
+      "loss": 0.2627,
+      "step": 20740
+    },
+    {
+      "epoch": 2.6609387022313413,
+      "grad_norm": 0.8450182676315308,
+      "learning_rate": 0.00022604086517910576,
+      "loss": 0.3813,
+      "step": 20750
+    },
+    {
+      "epoch": 2.662221082328802,
+      "grad_norm": 1.0902912616729736,
+      "learning_rate": 0.00022518594511413185,
+      "loss": 0.4519,
+      "step": 20760
+    },
+    {
+      "epoch": 2.663503462426263,
+      "grad_norm": 0.6427618861198425,
+      "learning_rate": 0.00022433102504915793,
+      "loss": 0.355,
+      "step": 20770
+    },
+    {
+      "epoch": 2.664785842523724,
+      "grad_norm": 1.0089173316955566,
+      "learning_rate": 0.00022347610498418396,
+      "loss": 0.316,
+      "step": 20780
+    },
+    {
+      "epoch": 2.6660682226211847,
+      "grad_norm": 0.8819127082824707,
+      "learning_rate": 0.00022262118491921004,
+      "loss": 0.3964,
+      "step": 20790
+    },
+    {
+      "epoch": 2.6673506027186455,
+      "grad_norm": 1.0088744163513184,
+      "learning_rate": 0.00022176626485423612,
+      "loss": 0.4064,
+      "step": 20800
+    },
+    {
+      "epoch": 2.668632982816107,
+      "grad_norm": 0.45587706565856934,
+      "learning_rate": 0.0002209113447892622,
+      "loss": 0.4366,
+      "step": 20810
+    },
+    {
+      "epoch": 2.6699153629135677,
+      "grad_norm": 0.8636410236358643,
+      "learning_rate": 0.0002200564247242883,
+      "loss": 0.2694,
+      "step": 20820
+    },
+    {
+      "epoch": 2.6711977430110285,
+      "grad_norm": 0.5451250672340393,
+      "learning_rate": 0.00021920150465931434,
+      "loss": 0.3359,
+      "step": 20830
+    },
+    {
+      "epoch": 2.6724801231084894,
+      "grad_norm": 1.1861648559570312,
+      "learning_rate": 0.00021834658459434042,
+      "loss": 0.4072,
+      "step": 20840
+    },
+    {
+      "epoch": 2.67376250320595,
+      "grad_norm": 1.4000024795532227,
+      "learning_rate": 0.0002174916645293665,
+      "loss": 0.4205,
+      "step": 20850
+    },
+    {
+      "epoch": 2.675044883303411,
+      "grad_norm": 0.5738406181335449,
+      "learning_rate": 0.0002166367444643926,
+      "loss": 0.5497,
+      "step": 20860
+    },
+    {
+      "epoch": 2.676327263400872,
+      "grad_norm": 0.49758780002593994,
+      "learning_rate": 0.00021578182439941864,
+      "loss": 0.2646,
+      "step": 20870
+    },
+    {
+      "epoch": 2.6776096434983327,
+      "grad_norm": 0.9785353541374207,
+      "learning_rate": 0.00021492690433444473,
+      "loss": 0.4507,
+      "step": 20880
+    },
+    {
+      "epoch": 2.678892023595794,
+      "grad_norm": 0.9146700501441956,
+      "learning_rate": 0.0002140719842694708,
+      "loss": 0.3821,
+      "step": 20890
+    },
+    {
+      "epoch": 2.680174403693255,
+      "grad_norm": 0.8285348415374756,
+      "learning_rate": 0.0002132170642044969,
+      "loss": 0.2993,
+      "step": 20900
+    },
+    {
+      "epoch": 2.6814567837907157,
+      "grad_norm": 0.9271901845932007,
+      "learning_rate": 0.00021236214413952295,
+      "loss": 0.3949,
+      "step": 20910
+    },
+    {
+      "epoch": 2.6827391638881766,
+      "grad_norm": 0.5074835419654846,
+      "learning_rate": 0.00021150722407454903,
+      "loss": 0.2717,
+      "step": 20920
+    },
+    {
+      "epoch": 2.6840215439856374,
+      "grad_norm": 0.8156689405441284,
+      "learning_rate": 0.0002106523040095751,
+      "loss": 0.3361,
+      "step": 20930
+    },
+    {
+      "epoch": 2.6853039240830983,
+      "grad_norm": 0.5775778293609619,
+      "learning_rate": 0.0002097973839446012,
+      "loss": 0.3324,
+      "step": 20940
+    },
+    {
+      "epoch": 2.686586304180559,
+      "grad_norm": 0.9868631958961487,
+      "learning_rate": 0.00020894246387962725,
+      "loss": 0.2793,
+      "step": 20950
+    },
+    {
+      "epoch": 2.68786868427802,
+      "grad_norm": 0.631433367729187,
+      "learning_rate": 0.00020808754381465333,
+      "loss": 0.4849,
+      "step": 20960
+    },
+    {
+      "epoch": 2.689151064375481,
+      "grad_norm": 0.5689303874969482,
+      "learning_rate": 0.0002072326237496794,
+      "loss": 0.3136,
+      "step": 20970
+    },
+    {
+      "epoch": 2.6904334444729416,
+      "grad_norm": 1.3480650186538696,
+      "learning_rate": 0.0002063777036847055,
+      "loss": 0.3781,
+      "step": 20980
+    },
+    {
+      "epoch": 2.6917158245704025,
+      "grad_norm": 0.9957194328308105,
+      "learning_rate": 0.00020552278361973155,
+      "loss": 0.3369,
+      "step": 20990
+    },
+    {
+      "epoch": 2.6929982046678633,
+      "grad_norm": 1.6954656839370728,
+      "learning_rate": 0.00020466786355475763,
+      "loss": 0.4473,
+      "step": 21000
+    },
+    {
+      "epoch": 2.694280584765324,
+      "grad_norm": 0.7176766991615295,
+      "learning_rate": 0.00020381294348978372,
+      "loss": 0.2455,
+      "step": 21010
+    },
+    {
+      "epoch": 2.6955629648627855,
+      "grad_norm": 1.3188832998275757,
+      "learning_rate": 0.0002029580234248098,
+      "loss": 0.4423,
+      "step": 21020
+    },
+    {
+      "epoch": 2.6968453449602463,
+      "grad_norm": 1.5381674766540527,
+      "learning_rate": 0.00020210310335983585,
+      "loss": 0.375,
+      "step": 21030
+    },
+    {
+      "epoch": 2.698127725057707,
+      "grad_norm": 0.8166912794113159,
+      "learning_rate": 0.00020124818329486194,
+      "loss": 0.4334,
+      "step": 21040
+    },
+    {
+      "epoch": 2.699410105155168,
+      "grad_norm": 1.0441280603408813,
+      "learning_rate": 0.00020039326322988802,
+      "loss": 0.3905,
+      "step": 21050
+    },
+    {
+      "epoch": 2.700692485252629,
+      "grad_norm": 0.7561319470405579,
+      "learning_rate": 0.0001995383431649141,
+      "loss": 0.316,
+      "step": 21060
+    },
+    {
+      "epoch": 2.7019748653500897,
+      "grad_norm": 0.8861315250396729,
+      "learning_rate": 0.00019868342309994016,
+      "loss": 0.3531,
+      "step": 21070
+    },
+    {
+      "epoch": 2.7032572454475505,
+      "grad_norm": 0.887611448764801,
+      "learning_rate": 0.00019782850303496624,
+      "loss": 0.4692,
+      "step": 21080
+    },
+    {
+      "epoch": 2.704539625545012,
+      "grad_norm": 0.985729455947876,
+      "learning_rate": 0.00019697358296999232,
+      "loss": 0.4464,
+      "step": 21090
+    },
+    {
+      "epoch": 2.7058220056424727,
+      "grad_norm": 0.8642510175704956,
+      "learning_rate": 0.0001961186629050184,
+      "loss": 0.3553,
+      "step": 21100
+    },
+    {
+      "epoch": 2.7071043857399335,
+      "grad_norm": 0.7116155028343201,
+      "learning_rate": 0.00019526374284004449,
+      "loss": 0.4635,
+      "step": 21110
+    },
+    {
+      "epoch": 2.7083867658373943,
+      "grad_norm": 0.9211375713348389,
+      "learning_rate": 0.00019440882277507054,
+      "loss": 0.356,
+      "step": 21120
+    },
+    {
+      "epoch": 2.709669145934855,
+      "grad_norm": 0.6295248866081238,
+      "learning_rate": 0.0001935539027100966,
+      "loss": 0.3683,
+      "step": 21130
+    },
+    {
+      "epoch": 2.710951526032316,
+      "grad_norm": 0.596107006072998,
+      "learning_rate": 0.00019269898264512268,
+      "loss": 0.274,
+      "step": 21140
+    },
+    {
+      "epoch": 2.712233906129777,
+      "grad_norm": 1.4957377910614014,
+      "learning_rate": 0.00019184406258014876,
+      "loss": 0.3562,
+      "step": 21150
+    },
+    {
+      "epoch": 2.7135162862272377,
+      "grad_norm": 1.4567288160324097,
+      "learning_rate": 0.00019098914251517482,
+      "loss": 0.3553,
+      "step": 21160
+    },
+    {
+      "epoch": 2.7147986663246986,
+      "grad_norm": 0.44168442487716675,
+      "learning_rate": 0.0001901342224502009,
+      "loss": 0.434,
+      "step": 21170
+    },
+    {
+      "epoch": 2.7160810464221594,
+      "grad_norm": 1.3469419479370117,
+      "learning_rate": 0.00018927930238522698,
+      "loss": 0.5778,
+      "step": 21180
+    },
+    {
+      "epoch": 2.7173634265196203,
+      "grad_norm": 0.3783499300479889,
+      "learning_rate": 0.00018842438232025306,
+      "loss": 0.2967,
+      "step": 21190
+    },
+    {
+      "epoch": 2.718645806617081,
+      "grad_norm": 0.9081128239631653,
+      "learning_rate": 0.00018756946225527912,
+      "loss": 0.4134,
+      "step": 21200
+    },
+    {
+      "epoch": 2.719928186714542,
+      "grad_norm": 1.2152372598648071,
+      "learning_rate": 0.0001867145421903052,
+      "loss": 0.4139,
+      "step": 21210
+    },
+    {
+      "epoch": 2.7212105668120032,
+      "grad_norm": 0.8168225288391113,
+      "learning_rate": 0.00018585962212533128,
+      "loss": 0.3853,
+      "step": 21220
+    },
+    {
+      "epoch": 2.722492946909464,
+      "grad_norm": 0.8900707364082336,
+      "learning_rate": 0.00018500470206035737,
+      "loss": 0.3369,
+      "step": 21230
+    },
+    {
+      "epoch": 2.723775327006925,
+      "grad_norm": 0.8105087280273438,
+      "learning_rate": 0.00018414978199538342,
+      "loss": 0.3421,
+      "step": 21240
+    },
+    {
+      "epoch": 2.7250577071043858,
+      "grad_norm": 1.3624615669250488,
+      "learning_rate": 0.0001832948619304095,
+      "loss": 0.4038,
+      "step": 21250
+    },
+    {
+      "epoch": 2.7263400872018466,
+      "grad_norm": 0.7589739561080933,
+      "learning_rate": 0.00018243994186543559,
+      "loss": 0.2604,
+      "step": 21260
+    },
+    {
+      "epoch": 2.7276224672993075,
+      "grad_norm": 0.9599238038063049,
+      "learning_rate": 0.00018158502180046167,
+      "loss": 0.4537,
+      "step": 21270
+    },
+    {
+      "epoch": 2.7289048473967683,
+      "grad_norm": 0.5657153725624084,
+      "learning_rate": 0.00018073010173548772,
+      "loss": 0.4679,
+      "step": 21280
+    },
+    {
+      "epoch": 2.730187227494229,
+      "grad_norm": 1.2358009815216064,
+      "learning_rate": 0.0001798751816705138,
+      "loss": 0.3009,
+      "step": 21290
+    },
+    {
+      "epoch": 2.7314696075916904,
+      "grad_norm": 0.7661507725715637,
+      "learning_rate": 0.0001790202616055399,
+      "loss": 0.3509,
+      "step": 21300
+    },
+    {
+      "epoch": 2.7327519876891513,
+      "grad_norm": 2.557483673095703,
+      "learning_rate": 0.00017816534154056597,
+      "loss": 0.4234,
+      "step": 21310
+    },
+    {
+      "epoch": 2.734034367786612,
+      "grad_norm": 0.7089506387710571,
+      "learning_rate": 0.00017731042147559203,
+      "loss": 0.3774,
+      "step": 21320
+    },
+    {
+      "epoch": 2.735316747884073,
+      "grad_norm": 1.5683780908584595,
+      "learning_rate": 0.0001764555014106181,
+      "loss": 0.3991,
+      "step": 21330
+    },
+    {
+      "epoch": 2.736599127981534,
+      "grad_norm": 0.6015053987503052,
+      "learning_rate": 0.0001756005813456442,
+      "loss": 0.332,
+      "step": 21340
+    },
+    {
+      "epoch": 2.7378815080789947,
+      "grad_norm": 0.6616173386573792,
+      "learning_rate": 0.00017474566128067027,
+      "loss": 0.3991,
+      "step": 21350
+    },
+    {
+      "epoch": 2.7391638881764555,
+      "grad_norm": 1.1823351383209229,
+      "learning_rate": 0.00017389074121569633,
+      "loss": 0.4555,
+      "step": 21360
+    },
+    {
+      "epoch": 2.7404462682739164,
+      "grad_norm": 0.7915502190589905,
+      "learning_rate": 0.0001730358211507224,
+      "loss": 0.3425,
+      "step": 21370
+    },
+    {
+      "epoch": 2.741728648371377,
+      "grad_norm": 1.186974287033081,
+      "learning_rate": 0.0001721809010857485,
+      "loss": 0.5156,
+      "step": 21380
+    },
+    {
+      "epoch": 2.743011028468838,
+      "grad_norm": 0.8260472416877747,
+      "learning_rate": 0.00017132598102077457,
+      "loss": 0.4004,
+      "step": 21390
+    },
+    {
+      "epoch": 2.744293408566299,
+      "grad_norm": 1.5226585865020752,
+      "learning_rate": 0.00017047106095580066,
+      "loss": 0.3825,
+      "step": 21400
+    },
+    {
+      "epoch": 2.7455757886637597,
+      "grad_norm": 0.7888827919960022,
+      "learning_rate": 0.0001696161408908267,
+      "loss": 0.3379,
+      "step": 21410
+    },
+    {
+      "epoch": 2.7468581687612206,
+      "grad_norm": 1.188528060913086,
+      "learning_rate": 0.0001687612208258528,
+      "loss": 0.2501,
+      "step": 21420
+    },
+    {
+      "epoch": 2.748140548858682,
+      "grad_norm": 1.040313720703125,
+      "learning_rate": 0.00016790630076087888,
+      "loss": 0.5706,
+      "step": 21430
+    },
+    {
+      "epoch": 2.7494229289561427,
+      "grad_norm": 1.1419790983200073,
+      "learning_rate": 0.00016705138069590496,
+      "loss": 0.345,
+      "step": 21440
+    },
+    {
+      "epoch": 2.7507053090536036,
+      "grad_norm": 1.0169458389282227,
+      "learning_rate": 0.00016619646063093101,
+      "loss": 0.4079,
+      "step": 21450
+    },
+    {
+      "epoch": 2.7519876891510644,
+      "grad_norm": 1.201564073562622,
+      "learning_rate": 0.0001653415405659571,
+      "loss": 0.2988,
+      "step": 21460
+    },
+    {
+      "epoch": 2.7532700692485252,
+      "grad_norm": 0.5512075424194336,
+      "learning_rate": 0.00016448662050098318,
+      "loss": 0.3651,
+      "step": 21470
+    },
+    {
+      "epoch": 2.754552449345986,
+      "grad_norm": 1.1715940237045288,
+      "learning_rate": 0.00016363170043600923,
+      "loss": 0.3561,
+      "step": 21480
+    },
+    {
+      "epoch": 2.755834829443447,
+      "grad_norm": 1.5060564279556274,
+      "learning_rate": 0.0001627767803710353,
+      "loss": 0.4964,
+      "step": 21490
+    },
+    {
+      "epoch": 2.7571172095409078,
+      "grad_norm": 1.0363975763320923,
+      "learning_rate": 0.00016192186030606137,
+      "loss": 0.4628,
+      "step": 21500
+    },
+    {
+      "epoch": 2.758399589638369,
+      "grad_norm": 0.6451253890991211,
+      "learning_rate": 0.00016106694024108745,
+      "loss": 0.5482,
+      "step": 21510
+    },
+    {
+      "epoch": 2.75968196973583,
+      "grad_norm": 0.8802538514137268,
+      "learning_rate": 0.00016021202017611354,
+      "loss": 0.4662,
+      "step": 21520
+    },
+    {
+      "epoch": 2.7609643498332908,
+      "grad_norm": 0.6708236336708069,
+      "learning_rate": 0.0001593571001111396,
+      "loss": 0.2757,
+      "step": 21530
+    },
+    {
+      "epoch": 2.7622467299307516,
+      "grad_norm": 0.5467422604560852,
+      "learning_rate": 0.00015850218004616567,
+      "loss": 0.4559,
+      "step": 21540
+    },
+    {
+      "epoch": 2.7635291100282124,
+      "grad_norm": 0.9822036623954773,
+      "learning_rate": 0.00015764725998119176,
+      "loss": 0.3517,
+      "step": 21550
+    },
+    {
+      "epoch": 2.7648114901256733,
+      "grad_norm": 0.6225240230560303,
+      "learning_rate": 0.00015679233991621784,
+      "loss": 0.3221,
+      "step": 21560
+    },
+    {
+      "epoch": 2.766093870223134,
+      "grad_norm": 0.5968758463859558,
+      "learning_rate": 0.0001559374198512439,
+      "loss": 0.4548,
+      "step": 21570
+    },
+    {
+      "epoch": 2.767376250320595,
+      "grad_norm": 0.8913034200668335,
+      "learning_rate": 0.00015508249978626998,
+      "loss": 0.4053,
+      "step": 21580
+    },
+    {
+      "epoch": 2.768658630418056,
+      "grad_norm": 1.6031399965286255,
+      "learning_rate": 0.00015422757972129606,
+      "loss": 0.3838,
+      "step": 21590
+    },
+    {
+      "epoch": 2.7699410105155167,
+      "grad_norm": 0.9392004609107971,
+      "learning_rate": 0.00015337265965632214,
+      "loss": 0.3233,
+      "step": 21600
+    },
+    {
+      "epoch": 2.7712233906129775,
+      "grad_norm": 0.7516948580741882,
+      "learning_rate": 0.0001525177395913482,
+      "loss": 0.25,
+      "step": 21610
+    },
+    {
+      "epoch": 2.7725057707104384,
+      "grad_norm": 0.7983139157295227,
+      "learning_rate": 0.00015166281952637428,
+      "loss": 0.3126,
+      "step": 21620
+    },
+    {
+      "epoch": 2.773788150807899,
+      "grad_norm": 0.7680755853652954,
+      "learning_rate": 0.00015080789946140036,
+      "loss": 0.3528,
+      "step": 21630
+    },
+    {
+      "epoch": 2.7750705309053605,
+      "grad_norm": 0.7174438834190369,
+      "learning_rate": 0.00014995297939642644,
+      "loss": 0.4091,
+      "step": 21640
+    },
+    {
+      "epoch": 2.7763529110028213,
+      "grad_norm": 0.8676108717918396,
+      "learning_rate": 0.0001490980593314525,
+      "loss": 0.5095,
+      "step": 21650
+    },
+    {
+      "epoch": 2.777635291100282,
+      "grad_norm": 0.7086964249610901,
+      "learning_rate": 0.00014824313926647858,
+      "loss": 0.2819,
+      "step": 21660
+    },
+    {
+      "epoch": 2.778917671197743,
+      "grad_norm": 1.6894848346710205,
+      "learning_rate": 0.00014738821920150466,
+      "loss": 0.3624,
+      "step": 21670
+    },
+    {
+      "epoch": 2.780200051295204,
+      "grad_norm": 0.7783902287483215,
+      "learning_rate": 0.00014653329913653075,
+      "loss": 0.4304,
+      "step": 21680
+    },
+    {
+      "epoch": 2.7814824313926647,
+      "grad_norm": 0.7895000576972961,
+      "learning_rate": 0.00014567837907155683,
+      "loss": 0.3576,
+      "step": 21690
+    },
+    {
+      "epoch": 2.7827648114901256,
+      "grad_norm": 0.5636423826217651,
+      "learning_rate": 0.00014482345900658288,
+      "loss": 0.3083,
+      "step": 21700
+    },
+    {
+      "epoch": 2.7840471915875864,
+      "grad_norm": 1.1489410400390625,
+      "learning_rate": 0.00014396853894160897,
+      "loss": 0.3091,
+      "step": 21710
+    },
+    {
+      "epoch": 2.7853295716850477,
+      "grad_norm": 0.59771728515625,
+      "learning_rate": 0.00014311361887663505,
+      "loss": 0.4021,
+      "step": 21720
+    },
+    {
+      "epoch": 2.7866119517825085,
+      "grad_norm": 0.722762405872345,
+      "learning_rate": 0.00014225869881166113,
+      "loss": 0.3188,
+      "step": 21730
+    },
+    {
+      "epoch": 2.7878943318799694,
+      "grad_norm": 0.6990886330604553,
+      "learning_rate": 0.00014140377874668719,
+      "loss": 0.3733,
+      "step": 21740
+    },
+    {
+      "epoch": 2.7891767119774302,
+      "grad_norm": 0.8142735362052917,
+      "learning_rate": 0.00014054885868171327,
+      "loss": 0.3454,
+      "step": 21750
+    },
+    {
+      "epoch": 2.790459092074891,
+      "grad_norm": 1.8750430345535278,
+      "learning_rate": 0.00013969393861673935,
+      "loss": 0.501,
+      "step": 21760
+    },
+    {
+      "epoch": 2.791741472172352,
+      "grad_norm": 0.7295469641685486,
+      "learning_rate": 0.00013883901855176543,
+      "loss": 0.3375,
+      "step": 21770
+    },
+    {
+      "epoch": 2.7930238522698128,
+      "grad_norm": 0.9579476118087769,
+      "learning_rate": 0.0001379840984867915,
+      "loss": 0.4572,
+      "step": 21780
+    },
+    {
+      "epoch": 2.7943062323672736,
+      "grad_norm": 0.9507008790969849,
+      "learning_rate": 0.00013712917842181757,
+      "loss": 0.3134,
+      "step": 21790
+    },
+    {
+      "epoch": 2.7955886124647344,
+      "grad_norm": 1.0686496496200562,
+      "learning_rate": 0.00013627425835684365,
+      "loss": 0.365,
+      "step": 21800
+    },
+    {
+      "epoch": 2.7968709925621953,
+      "grad_norm": 0.6618695855140686,
+      "learning_rate": 0.00013541933829186974,
+      "loss": 0.5158,
+      "step": 21810
+    },
+    {
+      "epoch": 2.798153372659656,
+      "grad_norm": 0.7745763659477234,
+      "learning_rate": 0.0001345644182268958,
+      "loss": 0.4521,
+      "step": 21820
+    },
+    {
+      "epoch": 2.799435752757117,
+      "grad_norm": 0.9630032777786255,
+      "learning_rate": 0.00013370949816192185,
+      "loss": 0.383,
+      "step": 21830
+    },
+    {
+      "epoch": 2.800718132854578,
+      "grad_norm": 0.9685844779014587,
+      "learning_rate": 0.00013285457809694793,
+      "loss": 0.351,
+      "step": 21840
+    },
+    {
+      "epoch": 2.802000512952039,
+      "grad_norm": 1.8922075033187866,
+      "learning_rate": 0.000131999658031974,
+      "loss": 0.3359,
+      "step": 21850
+    },
+    {
+      "epoch": 2.8032828930495,
+      "grad_norm": 1.1599595546722412,
+      "learning_rate": 0.00013114473796700007,
+      "loss": 0.5045,
+      "step": 21860
+    },
+    {
+      "epoch": 2.804565273146961,
+      "grad_norm": 0.761369526386261,
+      "learning_rate": 0.00013028981790202615,
+      "loss": 0.3795,
+      "step": 21870
+    },
+    {
+      "epoch": 2.8058476532444216,
+      "grad_norm": 0.4400559663772583,
+      "learning_rate": 0.00012943489783705223,
+      "loss": 0.4146,
+      "step": 21880
+    },
+    {
+      "epoch": 2.8071300333418825,
+      "grad_norm": 0.6165184378623962,
+      "learning_rate": 0.0001285799777720783,
+      "loss": 0.3696,
+      "step": 21890
+    },
+    {
+      "epoch": 2.8084124134393433,
+      "grad_norm": 1.1559704542160034,
+      "learning_rate": 0.00012772505770710437,
+      "loss": 0.3715,
+      "step": 21900
+    },
+    {
+      "epoch": 2.809694793536804,
+      "grad_norm": 0.7321136593818665,
+      "learning_rate": 0.00012687013764213045,
+      "loss": 0.3048,
+      "step": 21910
+    },
+    {
+      "epoch": 2.8109771736342655,
+      "grad_norm": 0.5283898711204529,
+      "learning_rate": 0.00012601521757715653,
+      "loss": 0.2636,
+      "step": 21920
+    },
+    {
+      "epoch": 2.8122595537317263,
+      "grad_norm": 0.8270158171653748,
+      "learning_rate": 0.00012516029751218262,
+      "loss": 0.4423,
+      "step": 21930
+    },
+    {
+      "epoch": 2.813541933829187,
+      "grad_norm": 0.872068464756012,
+      "learning_rate": 0.00012430537744720867,
+      "loss": 0.2926,
+      "step": 21940
+    },
+    {
+      "epoch": 2.814824313926648,
+      "grad_norm": 1.1108500957489014,
+      "learning_rate": 0.00012345045738223475,
+      "loss": 0.349,
+      "step": 21950
+    },
+    {
+      "epoch": 2.816106694024109,
+      "grad_norm": 1.0009726285934448,
+      "learning_rate": 0.00012259553731726084,
+      "loss": 0.3206,
+      "step": 21960
+    },
+    {
+      "epoch": 2.8173890741215697,
+      "grad_norm": 0.44574859738349915,
+      "learning_rate": 0.00012174061725228692,
+      "loss": 0.2925,
+      "step": 21970
+    },
+    {
+      "epoch": 2.8186714542190305,
+      "grad_norm": 0.8400396704673767,
+      "learning_rate": 0.00012088569718731299,
+      "loss": 0.3977,
+      "step": 21980
+    },
+    {
+      "epoch": 2.8199538343164914,
+      "grad_norm": 0.853813111782074,
+      "learning_rate": 0.00012003077712233907,
+      "loss": 0.4258,
+      "step": 21990
+    },
+    {
+      "epoch": 2.8212362144139522,
+      "grad_norm": 0.6891235709190369,
+      "learning_rate": 0.00011917585705736514,
+      "loss": 0.2868,
+      "step": 22000
+    },
+    {
+      "epoch": 2.822518594511413,
+      "grad_norm": 0.9624373316764832,
+      "learning_rate": 0.00011832093699239122,
+      "loss": 0.4167,
+      "step": 22010
+    },
+    {
+      "epoch": 2.823800974608874,
+      "grad_norm": 0.8667474389076233,
+      "learning_rate": 0.00011746601692741729,
+      "loss": 0.3906,
+      "step": 22020
+    },
+    {
+      "epoch": 2.8250833547063348,
+      "grad_norm": 0.9315304756164551,
+      "learning_rate": 0.00011661109686244337,
+      "loss": 0.2763,
+      "step": 22030
+    },
+    {
+      "epoch": 2.8263657348037956,
+      "grad_norm": 0.48842424154281616,
+      "learning_rate": 0.00011575617679746944,
+      "loss": 0.3595,
+      "step": 22040
+    },
+    {
+      "epoch": 2.827648114901257,
+      "grad_norm": 2.02878737449646,
+      "learning_rate": 0.00011490125673249552,
+      "loss": 0.432,
+      "step": 22050
+    },
+    {
+      "epoch": 2.8289304949987177,
+      "grad_norm": 1.5318242311477661,
+      "learning_rate": 0.00011404633666752159,
+      "loss": 0.4374,
+      "step": 22060
+    },
+    {
+      "epoch": 2.8302128750961786,
+      "grad_norm": 1.2656123638153076,
+      "learning_rate": 0.00011319141660254767,
+      "loss": 0.3225,
+      "step": 22070
+    },
+    {
+      "epoch": 2.8314952551936394,
+      "grad_norm": 1.2422733306884766,
+      "learning_rate": 0.00011233649653757374,
+      "loss": 0.4328,
+      "step": 22080
+    },
+    {
+      "epoch": 2.8327776352911003,
+      "grad_norm": 0.769603967666626,
+      "learning_rate": 0.00011148157647259981,
+      "loss": 0.301,
+      "step": 22090
+    },
+    {
+      "epoch": 2.834060015388561,
+      "grad_norm": 1.1890935897827148,
+      "learning_rate": 0.00011062665640762588,
+      "loss": 0.4696,
+      "step": 22100
+    },
+    {
+      "epoch": 2.835342395486022,
+      "grad_norm": 0.8918318748474121,
+      "learning_rate": 0.00010977173634265196,
+      "loss": 0.3997,
+      "step": 22110
+    },
+    {
+      "epoch": 2.836624775583483,
+      "grad_norm": 0.7001236081123352,
+      "learning_rate": 0.00010891681627767803,
+      "loss": 0.4216,
+      "step": 22120
+    },
+    {
+      "epoch": 2.837907155680944,
+      "grad_norm": 0.84539794921875,
+      "learning_rate": 0.00010806189621270411,
+      "loss": 0.3192,
+      "step": 22130
+    },
+    {
+      "epoch": 2.839189535778405,
+      "grad_norm": 0.9644067287445068,
+      "learning_rate": 0.00010720697614773018,
+      "loss": 0.474,
+      "step": 22140
+    },
+    {
+      "epoch": 2.840471915875866,
+      "grad_norm": 0.9339047074317932,
+      "learning_rate": 0.00010635205608275626,
+      "loss": 0.3042,
+      "step": 22150
+    },
+    {
+      "epoch": 2.8417542959733266,
+      "grad_norm": 0.7227121591567993,
+      "learning_rate": 0.00010549713601778233,
+      "loss": 0.2391,
+      "step": 22160
+    },
+    {
+      "epoch": 2.8430366760707875,
+      "grad_norm": 0.7822548747062683,
+      "learning_rate": 0.00010464221595280842,
+      "loss": 0.4902,
+      "step": 22170
+    },
+    {
+      "epoch": 2.8443190561682483,
+      "grad_norm": 0.9597374200820923,
+      "learning_rate": 0.00010378729588783448,
+      "loss": 0.3201,
+      "step": 22180
+    },
+    {
+      "epoch": 2.845601436265709,
+      "grad_norm": 1.0328844785690308,
+      "learning_rate": 0.00010293237582286057,
+      "loss": 0.3916,
+      "step": 22190
+    },
+    {
+      "epoch": 2.84688381636317,
+      "grad_norm": 0.6888856291770935,
+      "learning_rate": 0.00010207745575788664,
+      "loss": 0.415,
+      "step": 22200
+    },
+    {
+      "epoch": 2.848166196460631,
+      "grad_norm": 1.4465842247009277,
+      "learning_rate": 0.00010122253569291272,
+      "loss": 0.3799,
+      "step": 22210
+    },
+    {
+      "epoch": 2.8494485765580917,
+      "grad_norm": 1.1186655759811401,
+      "learning_rate": 0.00010036761562793879,
+      "loss": 0.3732,
+      "step": 22220
+    },
+    {
+      "epoch": 2.8507309566555525,
+      "grad_norm": 0.5343247056007385,
+      "learning_rate": 9.951269556296487e-05,
+      "loss": 0.3953,
+      "step": 22230
+    },
+    {
+      "epoch": 2.8520133367530134,
+      "grad_norm": 0.5710815191268921,
+      "learning_rate": 9.865777549799095e-05,
+      "loss": 0.4029,
+      "step": 22240
+    },
+    {
+      "epoch": 2.8532957168504742,
+      "grad_norm": 1.0526983737945557,
+      "learning_rate": 9.780285543301702e-05,
+      "loss": 0.4013,
+      "step": 22250
+    },
+    {
+      "epoch": 2.8545780969479355,
+      "grad_norm": 0.9180122017860413,
+      "learning_rate": 9.694793536804309e-05,
+      "loss": 0.3656,
+      "step": 22260
+    },
+    {
+      "epoch": 2.8558604770453964,
+      "grad_norm": 0.5228607654571533,
+      "learning_rate": 9.609301530306916e-05,
+      "loss": 0.388,
+      "step": 22270
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.7112893462181091,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.2866,
+      "step": 22280
+    },
+    {
+      "epoch": 2.858425237240318,
+      "grad_norm": 1.2582242488861084,
+      "learning_rate": 9.438317517312131e-05,
+      "loss": 0.3768,
+      "step": 22290
+    },
+    {
+      "epoch": 2.859707617337779,
+      "grad_norm": 0.9449999332427979,
+      "learning_rate": 9.352825510814739e-05,
+      "loss": 0.4034,
+      "step": 22300
+    },
+    {
+      "epoch": 2.8609899974352397,
+      "grad_norm": 0.7868074774742126,
+      "learning_rate": 9.267333504317346e-05,
+      "loss": 0.4197,
+      "step": 22310
+    },
+    {
+      "epoch": 2.8622723775327006,
+      "grad_norm": 0.5401546359062195,
+      "learning_rate": 9.181841497819954e-05,
+      "loss": 0.3198,
+      "step": 22320
+    },
+    {
+      "epoch": 2.8635547576301614,
+      "grad_norm": 1.1672154664993286,
+      "learning_rate": 9.096349491322561e-05,
+      "loss": 0.3383,
+      "step": 22330
+    },
+    {
+      "epoch": 2.8648371377276227,
+      "grad_norm": 0.43170639872550964,
+      "learning_rate": 9.01085748482517e-05,
+      "loss": 0.3007,
+      "step": 22340
+    },
+    {
+      "epoch": 2.8661195178250836,
+      "grad_norm": 1.1403450965881348,
+      "learning_rate": 8.925365478327776e-05,
+      "loss": 0.3279,
+      "step": 22350
+    },
+    {
+      "epoch": 2.8674018979225444,
+      "grad_norm": 1.2685964107513428,
+      "learning_rate": 8.839873471830385e-05,
+      "loss": 0.5152,
+      "step": 22360
+    },
+    {
+      "epoch": 2.8686842780200053,
+      "grad_norm": 0.43280231952667236,
+      "learning_rate": 8.754381465332991e-05,
+      "loss": 0.3793,
+      "step": 22370
+    },
+    {
+      "epoch": 2.869966658117466,
+      "grad_norm": 0.7950090169906616,
+      "learning_rate": 8.6688894588356e-05,
+      "loss": 0.4339,
+      "step": 22380
+    },
+    {
+      "epoch": 2.871249038214927,
+      "grad_norm": 0.9394015669822693,
+      "learning_rate": 8.583397452338207e-05,
+      "loss": 0.4288,
+      "step": 22390
+    },
+    {
+      "epoch": 2.872531418312388,
+      "grad_norm": 1.5615211725234985,
+      "learning_rate": 8.497905445840815e-05,
+      "loss": 0.5027,
+      "step": 22400
+    },
+    {
+      "epoch": 2.8738137984098486,
+      "grad_norm": 0.9067406058311462,
+      "learning_rate": 8.412413439343422e-05,
+      "loss": 0.4342,
+      "step": 22410
+    },
+    {
+      "epoch": 2.8750961785073095,
+      "grad_norm": 1.3683377504348755,
+      "learning_rate": 8.32692143284603e-05,
+      "loss": 0.3952,
+      "step": 22420
+    },
+    {
+      "epoch": 2.8763785586047703,
+      "grad_norm": 0.5947908163070679,
+      "learning_rate": 8.241429426348637e-05,
+      "loss": 0.3272,
+      "step": 22430
+    },
+    {
+      "epoch": 2.877660938702231,
+      "grad_norm": 0.5604143142700195,
+      "learning_rate": 8.155937419851244e-05,
+      "loss": 0.3151,
+      "step": 22440
+    },
+    {
+      "epoch": 2.878943318799692,
+      "grad_norm": 0.4945407509803772,
+      "learning_rate": 8.07044541335385e-05,
+      "loss": 0.3728,
+      "step": 22450
+    },
+    {
+      "epoch": 2.880225698897153,
+      "grad_norm": 1.287941336631775,
+      "learning_rate": 7.984953406856459e-05,
+      "loss": 0.2752,
+      "step": 22460
+    },
+    {
+      "epoch": 2.881508078994614,
+      "grad_norm": 0.7874084115028381,
+      "learning_rate": 7.899461400359066e-05,
+      "loss": 0.4383,
+      "step": 22470
+    },
+    {
+      "epoch": 2.882790459092075,
+      "grad_norm": 0.8812971115112305,
+      "learning_rate": 7.813969393861674e-05,
+      "loss": 0.4036,
+      "step": 22480
+    },
+    {
+      "epoch": 2.884072839189536,
+      "grad_norm": 0.5514728426933289,
+      "learning_rate": 7.728477387364281e-05,
+      "loss": 0.2882,
+      "step": 22490
+    },
+    {
+      "epoch": 2.8853552192869967,
+      "grad_norm": 0.7565945386886597,
+      "learning_rate": 7.642985380866889e-05,
+      "loss": 0.3155,
+      "step": 22500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 23394,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1628484544661760.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}