diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,56774 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 209.0,
+  "eval_steps": 500,
+  "global_step": 78375,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 9.25,
+      "learning_rate": 5.399999999999999e-06,
+      "loss": 1.0637,
+      "step": 10
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 8.25,
+      "learning_rate": 1.14e-05,
+      "loss": 1.0494,
+      "step": 20
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 4.84375,
+      "learning_rate": 1.74e-05,
+      "loss": 1.0261,
+      "step": 30
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.34e-05,
+      "loss": 0.9926,
+      "step": 40
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.99609375,
+      "learning_rate": 2.94e-05,
+      "loss": 0.9774,
+      "step": 50
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.875,
+      "learning_rate": 3.539999999999999e-05,
+      "loss": 0.9531,
+      "step": 60
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.71484375,
+      "learning_rate": 4.14e-05,
+      "loss": 0.9159,
+      "step": 70
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.5234375,
+      "learning_rate": 4.7399999999999993e-05,
+      "loss": 0.8883,
+      "step": 80
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.54296875,
+      "learning_rate": 5.339999999999999e-05,
+      "loss": 0.8364,
+      "step": 90
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.46875,
+      "learning_rate": 5.94e-05,
+      "loss": 0.7873,
+      "step": 100
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 6.539999999999999e-05,
+      "loss": 0.7557,
+      "step": 110
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.21484375,
+      "learning_rate": 7.139999999999999e-05,
+      "loss": 0.7321,
+      "step": 120
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 7.74e-05,
+      "loss": 0.7241,
+      "step": 130
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.265625,
+      "learning_rate": 8.34e-05,
+      "loss": 0.7289,
+      "step": 140
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 8.939999999999999e-05,
+      "loss": 0.7268,
+      "step": 150
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 9.539999999999999e-05,
+      "loss": 0.7145,
+      "step": 160
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001014,
+      "loss": 0.7186,
+      "step": 170
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.00010739999999999998,
+      "loss": 0.7153,
+      "step": 180
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00011339999999999999,
+      "loss": 0.7115,
+      "step": 190
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001194,
+      "loss": 0.6966,
+      "step": 200
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.00012539999999999999,
+      "loss": 0.705,
+      "step": 210
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001314,
+      "loss": 0.7021,
+      "step": 220
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001374,
+      "loss": 0.7011,
+      "step": 230
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001434,
+      "loss": 0.6889,
+      "step": 240
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001494,
+      "loss": 0.6945,
+      "step": 250
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.00015539999999999998,
+      "loss": 0.6997,
+      "step": 260
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001614,
+      "loss": 0.6889,
+      "step": 270
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001674,
+      "loss": 0.6743,
+      "step": 280
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00017339999999999996,
+      "loss": 0.6687,
+      "step": 290
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00017939999999999997,
+      "loss": 0.6447,
+      "step": 300
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.00018539999999999998,
+      "loss": 0.6309,
+      "step": 310
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001914,
+      "loss": 0.6308,
+      "step": 320
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001974,
+      "loss": 0.616,
+      "step": 330
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00020339999999999998,
+      "loss": 0.6208,
+      "step": 340
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.00020939999999999997,
+      "loss": 0.6243,
+      "step": 350
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.00021539999999999998,
+      "loss": 0.6198,
+      "step": 360
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0002214,
+      "loss": 0.6235,
+      "step": 370
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.6113528609275818,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0988,
+      "eval_samples_per_second": 1.584,
+      "eval_steps_per_second": 0.099,
+      "step": 375
+    },
+    {
+      "epoch": 1.0133333333333334,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.00022739999999999997,
+      "loss": 0.613,
+      "step": 380
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00023339999999999998,
+      "loss": 0.6216,
+      "step": 390
+    },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002394,
+      "loss": 0.6229,
+      "step": 400
+    },
+    {
+      "epoch": 1.0933333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00024539999999999995,
+      "loss": 0.613,
+      "step": 410
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0002514,
+      "loss": 0.6115,
+      "step": 420
+    },
+    {
+      "epoch": 1.1466666666666667,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.00025739999999999997,
+      "loss": 0.6322,
+      "step": 430
+    },
+    {
+      "epoch": 1.1733333333333333,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.00026339999999999995,
+      "loss": 0.61,
+      "step": 440
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002694,
+      "loss": 0.6103,
+      "step": 450
+    },
+    {
+      "epoch": 1.2266666666666666,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.00027539999999999997,
+      "loss": 0.612,
+      "step": 460
+    },
+    {
+      "epoch": 1.2533333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00028139999999999996,
+      "loss": 0.6018,
+      "step": 470
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00028739999999999994,
+      "loss": 0.6077,
+      "step": 480
+    },
+    {
+      "epoch": 1.3066666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002934,
+      "loss": 0.5906,
+      "step": 490
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029939999999999996,
+      "loss": 0.6005,
+      "step": 500
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029999999989326596,
+      "loss": 0.6028,
+      "step": 510
+    },
+    {
+      "epoch": 1.3866666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002999999995243089,
+      "loss": 0.5956,
+      "step": 520
+    },
+    {
+      "epoch": 1.4133333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029999999889181124,
+      "loss": 0.5882,
+      "step": 530
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0002999999979957728,
+      "loss": 0.5923,
+      "step": 540
+    },
+    {
+      "epoch": 1.4666666666666668,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0002999999968361936,
+      "loss": 0.5985,
+      "step": 550
+    },
+    {
+      "epoch": 1.4933333333333334,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029999999541307376,
+      "loss": 0.5878,
+      "step": 560
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002999999937264132,
+      "loss": 0.587,
+      "step": 570
+    },
+    {
+      "epoch": 1.5466666666666666,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.00029999999177621184,
+      "loss": 0.5771,
+      "step": 580
+    },
+    {
+      "epoch": 1.5733333333333333,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.00029999998956246985,
+      "loss": 0.5678,
+      "step": 590
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029999998708518715,
+      "loss": 0.5659,
+      "step": 600
+    },
+    {
+      "epoch": 1.6266666666666667,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.00029999998434436374,
+      "loss": 0.5446,
+      "step": 610
+    },
+    {
+      "epoch": 1.6533333333333333,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0002999999813399996,
+      "loss": 0.5666,
+      "step": 620
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0002999999780720948,
+      "loss": 0.5615,
+      "step": 630
+    },
+    {
+      "epoch": 1.7066666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029999997454064936,
+      "loss": 0.5576,
+      "step": 640
+    },
+    {
+      "epoch": 1.7333333333333334,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0002999999707456632,
+      "loss": 0.5802,
+      "step": 650
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00029999996668713633,
+      "loss": 0.5764,
+      "step": 660
+    },
+    {
+      "epoch": 1.7866666666666666,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002999999623650689,
+      "loss": 0.5754,
+      "step": 670
+    },
+    {
+      "epoch": 1.8133333333333335,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029999995777946073,
+      "loss": 0.56,
+      "step": 680
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029999995293031193,
+      "loss": 0.5677,
+      "step": 690
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029999994781762247,
+      "loss": 0.5564,
+      "step": 700
+    },
+    {
+      "epoch": 1.8933333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002999999424413924,
+      "loss": 0.5461,
+      "step": 710
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002999999368016218,
+      "loss": 0.5634,
+      "step": 720
+    },
+    {
+      "epoch": 1.9466666666666668,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0002999999308983105,
+      "loss": 0.5511,
+      "step": 730
+    },
+    {
+      "epoch": 1.9733333333333334,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0002999999247314586,
+      "loss": 0.5605,
+      "step": 740
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0002999999183010661,
+      "loss": 0.5486,
+      "step": 750
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.5647158026695251,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7822,
+      "eval_samples_per_second": 1.636,
+      "eval_steps_per_second": 0.102,
+      "step": 750
+    },
+    {
+      "epoch": 2.026666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029999991160713305,
+      "loss": 0.5642,
+      "step": 760
+    },
+    {
+      "epoch": 2.0533333333333332,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029999990464965945,
+      "loss": 0.5692,
+      "step": 770
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.00029999989742864524,
+      "loss": 0.5634,
+      "step": 780
+    },
+    {
+      "epoch": 2.1066666666666665,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.00029999988994409055,
+      "loss": 0.5492,
+      "step": 790
+    },
+    {
+      "epoch": 2.1333333333333333,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0002999998821959953,
+      "loss": 0.5613,
+      "step": 800
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029999987418435957,
+      "loss": 0.5597,
+      "step": 810
+    },
+    {
+      "epoch": 2.1866666666666665,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029999986590918334,
+      "loss": 0.5467,
+      "step": 820
+    },
+    {
+      "epoch": 2.2133333333333334,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029999985737046657,
+      "loss": 0.5505,
+      "step": 830
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002999998485682093,
+      "loss": 0.5472,
+      "step": 840
+    },
+    {
+      "epoch": 2.2666666666666666,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029999983950241166,
+      "loss": 0.5492,
+      "step": 850
+    },
+    {
+      "epoch": 2.2933333333333334,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0002999998301730736,
+      "loss": 0.541,
+      "step": 860
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.000299999820580195,
+      "loss": 0.5435,
+      "step": 870
+    },
+    {
+      "epoch": 2.3466666666666667,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0002999998107237761,
+      "loss": 0.5546,
+      "step": 880
+    },
+    {
+      "epoch": 2.3733333333333335,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002999998006038168,
+      "loss": 0.5503,
+      "step": 890
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002999997902203171,
+      "loss": 0.5423,
+      "step": 900
+    },
+    {
+      "epoch": 2.4266666666666667,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.00029999977957327704,
+      "loss": 0.5461,
+      "step": 910
+    },
+    {
+      "epoch": 2.453333333333333,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.00029999976866269666,
+      "loss": 0.5513,
+      "step": 920
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.00029999975748857595,
+      "loss": 0.5532,
+      "step": 930
+    },
+    {
+      "epoch": 2.506666666666667,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.00029999974605091496,
+      "loss": 0.547,
+      "step": 940
+    },
+    {
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029999973434971375,
+      "loss": 0.5383,
+      "step": 950
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002999997223849722,
+      "loss": 0.5375,
+      "step": 960
+    },
+    {
+      "epoch": 2.586666666666667,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002999997101566904,
+      "loss": 0.5307,
+      "step": 970
+    },
+    {
+      "epoch": 2.6133333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002999996976648685,
+      "loss": 0.5155,
+      "step": 980
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.00029999968490950635,
+      "loss": 0.518,
+      "step": 990
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000299999671890604,
+      "loss": 0.539,
+      "step": 1000
+    },
+    {
+      "epoch": 2.6933333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002999996586081615,
+      "loss": 0.5208,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002999996450621789,
+      "loss": 0.5389,
+      "step": 1020
+    },
+    {
+      "epoch": 2.7466666666666666,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0002999996312526562,
+      "loss": 0.5422,
+      "step": 1030
+    },
+    {
+      "epoch": 2.7733333333333334,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0002999996171795935,
+      "loss": 0.5498,
+      "step": 1040
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002999996028429907,
+      "loss": 0.5356,
+      "step": 1050
+    },
+    {
+      "epoch": 2.8266666666666667,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0002999995882428479,
+      "loss": 0.5313,
+      "step": 1060
+    },
+    {
+      "epoch": 2.8533333333333335,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029999957337916507,
+      "loss": 0.5375,
+      "step": 1070
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029999955825194224,
+      "loss": 0.5209,
+      "step": 1080
+    },
+    {
+      "epoch": 2.9066666666666667,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.00029999954286117947,
+      "loss": 0.5268,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9333333333333336,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002999995272068768,
+      "loss": 0.5294,
+      "step": 1100
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0002999995112890343,
+      "loss": 0.5262,
+      "step": 1110
+    },
+    {
+      "epoch": 2.986666666666667,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.00029999949510765183,
+      "loss": 0.5403,
+      "step": 1120
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.5400508046150208,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0742,
+      "eval_samples_per_second": 1.588,
+      "eval_steps_per_second": 0.099,
+      "step": 1125
+    },
+    {
+      "epoch": 3.013333333333333,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002999994786627296,
+      "loss": 0.5269,
+      "step": 1130
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.00029999946195426755,
+      "loss": 0.5475,
+      "step": 1140
+    },
+    {
+      "epoch": 3.066666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029999944498226573,
+      "loss": 0.54,
+      "step": 1150
+    },
+    {
+      "epoch": 3.0933333333333333,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0002999994277467241,
+      "loss": 0.5373,
+      "step": 1160
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029999941024764284,
+      "loss": 0.5319,
+      "step": 1170
+    },
+    {
+      "epoch": 3.1466666666666665,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029999939248502187,
+      "loss": 0.5417,
+      "step": 1180
+    },
+    {
+      "epoch": 3.1733333333333333,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.00029999937445886123,
+      "loss": 0.528,
+      "step": 1190
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.00029999935616916096,
+      "loss": 0.5285,
+      "step": 1200
+    },
+    {
+      "epoch": 3.2266666666666666,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002999993376159211,
+      "loss": 0.528,
+      "step": 1210
+    },
+    {
+      "epoch": 3.2533333333333334,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0002999993187991417,
+      "loss": 0.5237,
+      "step": 1220
+    },
+    {
+      "epoch": 3.2800000000000002,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0002999992997188228,
+      "loss": 0.535,
+      "step": 1230
+    },
+    {
+      "epoch": 3.3066666666666666,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029999928037496436,
+      "loss": 0.5154,
+      "step": 1240
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002999992607675665,
+      "loss": 0.532,
+      "step": 1250
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0002999992408966292,
+      "loss": 0.5363,
+      "step": 1260
+    },
+    {
+      "epoch": 3.3866666666666667,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029999922076215255,
+      "loss": 0.5266,
+      "step": 1270
+    },
+    {
+      "epoch": 3.413333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999992003641365,
+      "loss": 0.526,
+      "step": 1280
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0002999991797025811,
+      "loss": 0.532,
+      "step": 1290
+    },
+    {
+      "epoch": 3.466666666666667,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0002999991587774865,
+      "loss": 0.5403,
+      "step": 1300
+    },
+    {
+      "epoch": 3.493333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002999991375888526,
+      "loss": 0.5313,
+      "step": 1310
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029999911613667955,
+      "loss": 0.5282,
+      "step": 1320
+    },
+    {
+      "epoch": 3.546666666666667,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029999909442096726,
+      "loss": 0.5223,
+      "step": 1330
+    },
+    {
+      "epoch": 3.5733333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029999907244171594,
+      "loss": 0.5182,
+      "step": 1340
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029999905019892543,
+      "loss": 0.5138,
+      "step": 1350
+    },
+    {
+      "epoch": 3.626666666666667,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0002999990276925959,
+      "loss": 0.4966,
+      "step": 1360
+    },
+    {
+      "epoch": 3.6533333333333333,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002999990049227274,
+      "loss": 0.5187,
+      "step": 1370
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002999989818893199,
+      "loss": 0.5107,
+      "step": 1380
+    },
+    {
+      "epoch": 3.7066666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029999895859237344,
+      "loss": 0.5109,
+      "step": 1390
+    },
+    {
+      "epoch": 3.7333333333333334,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029999893503188807,
+      "loss": 0.5325,
+      "step": 1400
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029999891120786393,
+      "loss": 0.5327,
+      "step": 1410
+    },
+    {
+      "epoch": 3.7866666666666666,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.00029999888712030093,
+      "loss": 0.5306,
+      "step": 1420
+    },
+    {
+      "epoch": 3.8133333333333335,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0002999988627691992,
+      "loss": 0.5156,
+      "step": 1430
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029999883815455867,
+      "loss": 0.5235,
+      "step": 1440
+    },
+    {
+      "epoch": 3.8666666666666667,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.00029999881327637956,
+      "loss": 0.5176,
+      "step": 1450
+    },
+    {
+      "epoch": 3.8933333333333335,
+      "grad_norm": 0.58984375,
+      "learning_rate": 0.00029999878813466175,
+      "loss": 0.5076,
+      "step": 1460
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029999876272940535,
+      "loss": 0.5211,
+      "step": 1470
+    },
+    {
+      "epoch": 3.9466666666666668,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0002999987370606104,
+      "loss": 0.5123,
+      "step": 1480
+    },
+    {
+      "epoch": 3.9733333333333336,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.000299998711128277,
+      "loss": 0.5238,
+      "step": 1490
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.00029999868493240506,
+      "loss": 0.5122,
+      "step": 1500
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.5266960859298706,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9024,
+      "eval_samples_per_second": 1.616,
+      "eval_steps_per_second": 0.101,
+      "step": 1500
+    },
+    {
+      "epoch": 4.026666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029999865847299477,
+      "loss": 0.5296,
+      "step": 1510
+    },
+    {
+      "epoch": 4.053333333333334,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002999986317500461,
+      "loss": 0.5363,
+      "step": 1520
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002999986047635591,
+      "loss": 0.5265,
+      "step": 1530
+    },
+    {
+      "epoch": 4.1066666666666665,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029999857751353384,
+      "loss": 0.5187,
+      "step": 1540
+    },
+    {
+      "epoch": 4.133333333333334,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.00029999854999997037,
+      "loss": 0.5247,
+      "step": 1550
+    },
+    {
+      "epoch": 4.16,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.00029999852222286873,
+      "loss": 0.5269,
+      "step": 1560
+    },
+    {
+      "epoch": 4.1866666666666665,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.000299998494182229,
+      "loss": 0.5152,
+      "step": 1570
+    },
+    {
+      "epoch": 4.213333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002999984658780511,
+      "loss": 0.5159,
+      "step": 1580
+    },
+    {
+      "epoch": 4.24,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029999843731033524,
+      "loss": 0.515,
+      "step": 1590
+    },
+    {
+      "epoch": 4.266666666666667,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029999840847908134,
+      "loss": 0.5188,
+      "step": 1600
+    },
+    {
+      "epoch": 4.293333333333333,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00029999837938428955,
+      "loss": 0.5117,
+      "step": 1610
+    },
+    {
+      "epoch": 4.32,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.00029999835002595993,
+      "loss": 0.5159,
+      "step": 1620
+    },
+    {
+      "epoch": 4.346666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029999832040409247,
+      "loss": 0.5263,
+      "step": 1630
+    },
+    {
+      "epoch": 4.373333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002999982905186872,
+      "loss": 0.5212,
+      "step": 1640
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029999826036974426,
+      "loss": 0.5149,
+      "step": 1650
+    },
+    {
+      "epoch": 4.426666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029999822995726357,
+      "loss": 0.521,
+      "step": 1660
+    },
+    {
+      "epoch": 4.453333333333333,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002999981992812454,
+      "loss": 0.5257,
+      "step": 1670
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999981683416896,
+      "loss": 0.5305,
+      "step": 1680
+    },
+    {
+      "epoch": 4.506666666666667,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0002999981371385963,
+      "loss": 0.5198,
+      "step": 1690
+    },
+    {
+      "epoch": 4.533333333333333,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029999810567196553,
+      "loss": 0.5147,
+      "step": 1700
+    },
+    {
+      "epoch": 4.5600000000000005,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002999980739417974,
+      "loss": 0.512,
+      "step": 1710
+    },
+    {
+      "epoch": 4.586666666666667,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002999980419480919,
+      "loss": 0.5087,
+      "step": 1720
+    },
+    {
+      "epoch": 4.613333333333333,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.00029999800969084914,
+      "loss": 0.4936,
+      "step": 1730
+    },
+    {
+      "epoch": 4.64,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0002999979771700692,
+      "loss": 0.4966,
+      "step": 1740
+    },
+    {
+      "epoch": 4.666666666666667,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.00029999794438575203,
+      "loss": 0.5153,
+      "step": 1750
+    },
+    {
+      "epoch": 4.693333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029999791133789776,
+      "loss": 0.4968,
+      "step": 1760
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029999787802650646,
+      "loss": 0.5156,
+      "step": 1770
+    },
+    {
+      "epoch": 4.746666666666667,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.00029999784445157814,
+      "loss": 0.5193,
+      "step": 1780
+    },
+    {
+      "epoch": 4.773333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002999978106131129,
+      "loss": 0.5272,
+      "step": 1790
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0002999977765111108,
+      "loss": 0.5128,
+      "step": 1800
+    },
+    {
+      "epoch": 4.826666666666666,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.00029999774214557187,
+      "loss": 0.5092,
+      "step": 1810
+    },
+    {
+      "epoch": 4.8533333333333335,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0002999977075164962,
+      "loss": 0.5169,
+      "step": 1820
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0002999976726238838,
+      "loss": 0.502,
+      "step": 1830
+    },
+    {
+      "epoch": 4.906666666666666,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029999763746773483,
+      "loss": 0.5062,
+      "step": 1840
+    },
+    {
+      "epoch": 4.933333333333334,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00029999760204804925,
+      "loss": 0.5079,
+      "step": 1850
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029999756636482714,
+      "loss": 0.5061,
+      "step": 1860
+    },
+    {
+      "epoch": 4.986666666666666,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0002999975304180686,
+      "loss": 0.5212,
+      "step": 1870
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.5196089148521423,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1189,
+      "eval_samples_per_second": 1.581,
+      "eval_steps_per_second": 0.099,
+      "step": 1875
+    },
+    {
+      "epoch": 5.013333333333334,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002999974942077737,
+      "loss": 0.5085,
+      "step": 1880
+    },
+    {
+      "epoch": 5.04,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029999745773394247,
+      "loss": 0.5315,
+      "step": 1890
+    },
+    {
+      "epoch": 5.066666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029999742099657493,
+      "loss": 0.521,
+      "step": 1900
+    },
+    {
+      "epoch": 5.093333333333334,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0002999973839956713,
+      "loss": 0.5159,
+      "step": 1910
+    },
+    {
+      "epoch": 5.12,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002999973467312315,
+      "loss": 0.5129,
+      "step": 1920
+    },
+    {
+      "epoch": 5.1466666666666665,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.00029999730920325565,
+      "loss": 0.5224,
+      "step": 1930
+    },
+    {
+      "epoch": 5.173333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029999727141174375,
+      "loss": 0.5098,
+      "step": 1940
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.000299997233356696,
+      "loss": 0.5091,
+      "step": 1950
+    },
+    {
+      "epoch": 5.226666666666667,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0002999971950381123,
+      "loss": 0.5076,
+      "step": 1960
+    },
+    {
+      "epoch": 5.253333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002999971564559929,
+      "loss": 0.506,
+      "step": 1970
+    },
+    {
+      "epoch": 5.28,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00029999711761033774,
+      "loss": 0.518,
+      "step": 1980
+    },
+    {
+      "epoch": 5.306666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002999970785011469,
+      "loss": 0.4997,
+      "step": 1990
+    },
+    {
+      "epoch": 5.333333333333333,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00029999703912842054,
+      "loss": 0.5168,
+      "step": 2000
+    },
+    {
+      "epoch": 5.36,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0002999969994921586,
+      "loss": 0.5201,
+      "step": 2010
+    },
+    {
+      "epoch": 5.386666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029999695959236127,
+      "loss": 0.5091,
+      "step": 2020
+    },
+    {
+      "epoch": 5.413333333333333,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002999969194290285,
+      "loss": 0.5114,
+      "step": 2030
+    },
+    {
+      "epoch": 5.44,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002999968790021604,
+      "loss": 0.5168,
+      "step": 2040
+    },
+    {
+      "epoch": 5.466666666666667,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002999968383117571,
+      "loss": 0.5251,
+      "step": 2050
+    },
+    {
+      "epoch": 5.493333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002999967973578186,
+      "loss": 0.516,
+      "step": 2060
+    },
+    {
+      "epoch": 5.52,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029999675614034507,
+      "loss": 0.511,
+      "step": 2070
+    },
+    {
+      "epoch": 5.546666666666667,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0002999967146593365,
+      "loss": 0.5065,
+      "step": 2080
+    },
+    {
+      "epoch": 5.573333333333333,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.00029999667291479297,
+      "loss": 0.5039,
+      "step": 2090
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002999966309067146,
+      "loss": 0.5002,
+      "step": 2100
+    },
+    {
+      "epoch": 5.626666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002999965886351014,
+      "loss": 0.4836,
+      "step": 2110
+    },
+    {
+      "epoch": 5.653333333333333,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0002999965460999534,
+      "loss": 0.5049,
+      "step": 2120
+    },
+    {
+      "epoch": 5.68,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029999650330127085,
+      "loss": 0.4963,
+      "step": 2130
+    },
+    {
+      "epoch": 5.706666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999964602390537,
+      "loss": 0.497,
+      "step": 2140
+    },
+    {
+      "epoch": 5.733333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.000299996416913302,
+      "loss": 0.5173,
+      "step": 2150
+    },
+    {
+      "epoch": 5.76,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002999963733240159,
+      "loss": 0.5179,
+      "step": 2160
+    },
+    {
+      "epoch": 5.786666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029999632947119546,
+      "loss": 0.5171,
+      "step": 2170
+    },
+    {
+      "epoch": 5.8133333333333335,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999962853548408,
+      "loss": 0.5014,
+      "step": 2180
+    },
+    {
+      "epoch": 5.84,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002999962409749518,
+      "loss": 0.509,
+      "step": 2190
+    },
+    {
+      "epoch": 5.866666666666667,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002999961963315288,
+      "loss": 0.5061,
+      "step": 2200
+    },
+    {
+      "epoch": 5.8933333333333335,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00029999615142457174,
+      "loss": 0.4957,
+      "step": 2210
+    },
+    {
+      "epoch": 5.92,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0002999961062540807,
+      "loss": 0.5071,
+      "step": 2220
+    },
+    {
+      "epoch": 5.946666666666666,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0002999960608200558,
+      "loss": 0.4997,
+      "step": 2230
+    },
+    {
+      "epoch": 5.973333333333334,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0002999960151224971,
+      "loss": 0.512,
+      "step": 2240
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029999596916140467,
+      "loss": 0.4999,
+      "step": 2250
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.5139528512954712,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7016,
+      "eval_samples_per_second": 1.649,
+      "eval_steps_per_second": 0.103,
+      "step": 2250
+    },
+    {
+      "epoch": 6.026666666666666,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002999959229367786,
+      "loss": 0.518,
+      "step": 2260
+    },
+    {
+      "epoch": 6.053333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.000299995876448619,
+      "loss": 0.5242,
+      "step": 2270
+    },
+    {
+      "epoch": 6.08,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0002999958296969259,
+      "loss": 0.5135,
+      "step": 2280
+    },
+    {
+      "epoch": 6.1066666666666665,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002999957826816994,
+      "loss": 0.5064,
+      "step": 2290
+    },
+    {
+      "epoch": 6.133333333333334,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002999957354029396,
+      "loss": 0.5111,
+      "step": 2300
+    },
+    {
+      "epoch": 6.16,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.00029999568786064654,
+      "loss": 0.5151,
+      "step": 2310
+    },
+    {
+      "epoch": 6.1866666666666665,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002999956400548204,
+      "loss": 0.5047,
+      "step": 2320
+    },
+    {
+      "epoch": 6.213333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029999559198546114,
+      "loss": 0.5028,
+      "step": 2330
+    },
+    {
+      "epoch": 6.24,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00029999554365256893,
+      "loss": 0.5031,
+      "step": 2340
+    },
+    {
+      "epoch": 6.266666666666667,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002999954950561438,
+      "loss": 0.5078,
+      "step": 2350
+    },
+    {
+      "epoch": 6.293333333333333,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.00029999544619618585,
+      "loss": 0.501,
+      "step": 2360
+    },
+    {
+      "epoch": 6.32,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.00029999539707269526,
+      "loss": 0.5044,
+      "step": 2370
+    },
+    {
+      "epoch": 6.346666666666667,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000299995347685672,
+      "loss": 0.5143,
+      "step": 2380
+    },
+    {
+      "epoch": 6.373333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002999952980351162,
+      "loss": 0.5106,
+      "step": 2390
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0002999952481210279,
+      "loss": 0.5038,
+      "step": 2400
+    },
+    {
+      "epoch": 6.426666666666667,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0002999951979434073,
+      "loss": 0.5108,
+      "step": 2410
+    },
+    {
+      "epoch": 6.453333333333333,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.00029999514750225434,
+      "loss": 0.5141,
+      "step": 2420
+    },
+    {
+      "epoch": 6.48,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.00029999509679756915,
+      "loss": 0.5192,
+      "step": 2430
+    },
+    {
+      "epoch": 6.506666666666667,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.000299995045829352,
+      "loss": 0.5093,
+      "step": 2440
+    },
+    {
+      "epoch": 6.533333333333333,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.00029999499459760267,
+      "loss": 0.5037,
+      "step": 2450
+    },
+    {
+      "epoch": 6.5600000000000005,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002999949431023215,
+      "loss": 0.5031,
+      "step": 2460
+    },
+    {
+      "epoch": 6.586666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002999948913435085,
+      "loss": 0.4985,
+      "step": 2470
+    },
+    {
+      "epoch": 6.613333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002999948393211637,
+      "loss": 0.484,
+      "step": 2480
+    },
+    {
+      "epoch": 6.64,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029999478703528735,
+      "loss": 0.4873,
+      "step": 2490
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029999473448587933,
+      "loss": 0.5056,
+      "step": 2500
+    },
+    {
+      "epoch": 6.693333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029999468167293987,
+      "loss": 0.4872,
+      "step": 2510
+    },
+    {
+      "epoch": 6.72,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002999946285964691,
+      "loss": 0.5052,
+      "step": 2520
+    },
+    {
+      "epoch": 6.746666666666667,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.000299994575256467,
+      "loss": 0.51,
+      "step": 2530
+    },
+    {
+      "epoch": 6.773333333333333,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0002999945216529337,
+      "loss": 0.5176,
+      "step": 2540
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.00029999446778586927,
+      "loss": 0.5028,
+      "step": 2550
+    },
+    {
+      "epoch": 6.826666666666666,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0002999944136552739,
+      "loss": 0.4989,
+      "step": 2560
+    },
+    {
+      "epoch": 6.8533333333333335,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029999435926114757,
+      "loss": 0.5076,
+      "step": 2570
+    },
+    {
+      "epoch": 6.88,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002999943046034905,
+      "loss": 0.4933,
+      "step": 2580
+    },
+    {
+      "epoch": 6.906666666666666,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029999424968230265,
+      "loss": 0.4958,
+      "step": 2590
+    },
+    {
+      "epoch": 6.933333333333334,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.00029999419449758415,
+      "loss": 0.4988,
+      "step": 2600
+    },
+    {
+      "epoch": 6.96,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.00029999413904933524,
+      "loss": 0.4965,
+      "step": 2610
+    },
+    {
+      "epoch": 6.986666666666666,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0002999940833375558,
+      "loss": 0.5124,
+      "step": 2620
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.5085073113441467,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8821,
+      "eval_samples_per_second": 1.619,
+      "eval_steps_per_second": 0.101,
+      "step": 2625
+    },
+    {
+      "epoch": 7.013333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029999402736224614,
+      "loss": 0.4992,
+      "step": 2630
+    },
+    {
+      "epoch": 7.04,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029999397112340616,
+      "loss": 0.5232,
+      "step": 2640
+    },
+    {
+      "epoch": 7.066666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002999939146210361,
+      "loss": 0.5122,
+      "step": 2650
+    },
+    {
+      "epoch": 7.093333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029999385785513597,
+      "loss": 0.5061,
+      "step": 2660
+    },
+    {
+      "epoch": 7.12,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029999380082570596,
+      "loss": 0.5025,
+      "step": 2670
+    },
+    {
+      "epoch": 7.1466666666666665,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002999937435327461,
+      "loss": 0.5125,
+      "step": 2680
+    },
+    {
+      "epoch": 7.173333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002999936859762565,
+      "loss": 0.5008,
+      "step": 2690
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002999936281562373,
+      "loss": 0.5003,
+      "step": 2700
+    },
+    {
+      "epoch": 7.226666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002999935700726885,
+      "loss": 0.4984,
+      "step": 2710
+    },
+    {
+      "epoch": 7.253333333333333,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00029999351172561037,
+      "loss": 0.4968,
+      "step": 2720
+    },
+    {
+      "epoch": 7.28,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0002999934531150029,
+      "loss": 0.5096,
+      "step": 2730
+    },
+    {
+      "epoch": 7.306666666666667,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029999339424086616,
+      "loss": 0.491,
+      "step": 2740
+    },
+    {
+      "epoch": 7.333333333333333,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0002999933351032003,
+      "loss": 0.508,
+      "step": 2750
+    },
+    {
+      "epoch": 7.36,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029999327570200547,
+      "loss": 0.5109,
+      "step": 2760
+    },
+    {
+      "epoch": 7.386666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029999321603728173,
+      "loss": 0.5009,
+      "step": 2770
+    },
+    {
+      "epoch": 7.413333333333333,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002999931561090292,
+      "loss": 0.5036,
+      "step": 2780
+    },
+    {
+      "epoch": 7.44,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029999309591724797,
+      "loss": 0.5086,
+      "step": 2790
+    },
+    {
+      "epoch": 7.466666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002999930354619381,
+      "loss": 0.5176,
+      "step": 2800
+    },
+    {
+      "epoch": 7.493333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029999297474309983,
+      "loss": 0.5078,
+      "step": 2810
+    },
+    {
+      "epoch": 7.52,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029999291376073316,
+      "loss": 0.5031,
+      "step": 2820
+    },
+    {
+      "epoch": 7.546666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029999285251483814,
+      "loss": 0.4992,
+      "step": 2830
+    },
+    {
+      "epoch": 7.573333333333333,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.00029999279100541504,
+      "loss": 0.4969,
+      "step": 2840
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002999927292324638,
+      "loss": 0.4917,
+      "step": 2850
+    },
+    {
+      "epoch": 7.626666666666667,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002999926671959847,
+      "loss": 0.4774,
+      "step": 2860
+    },
+    {
+      "epoch": 7.653333333333333,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0002999926048959777,
+      "loss": 0.4978,
+      "step": 2870
+    },
+    {
+      "epoch": 7.68,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.000299992542332443,
+      "loss": 0.4893,
+      "step": 2880
+    },
+    {
+      "epoch": 7.706666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002999924795053807,
+      "loss": 0.4894,
+      "step": 2890
+    },
+    {
+      "epoch": 7.733333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002999924164147909,
+      "loss": 0.5103,
+      "step": 2900
+    },
+    {
+      "epoch": 7.76,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.00029999235306067364,
+      "loss": 0.5113,
+      "step": 2910
+    },
+    {
+      "epoch": 7.786666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002999922894430291,
+      "loss": 0.5091,
+      "step": 2920
+    },
+    {
+      "epoch": 7.8133333333333335,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029999222556185745,
+      "loss": 0.494,
+      "step": 2930
+    },
+    {
+      "epoch": 7.84,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002999921614171587,
+      "loss": 0.5013,
+      "step": 2940
+    },
+    {
+      "epoch": 7.866666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.000299992097008933,
+      "loss": 0.4989,
+      "step": 2950
+    },
+    {
+      "epoch": 7.8933333333333335,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029999203233718047,
+      "loss": 0.4895,
+      "step": 2960
+    },
+    {
+      "epoch": 7.92,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029999196740190126,
+      "loss": 0.4999,
+      "step": 2970
+    },
+    {
+      "epoch": 7.946666666666666,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002999919022030954,
+      "loss": 0.4928,
+      "step": 2980
+    },
+    {
+      "epoch": 7.973333333333334,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.000299991836740763,
+      "loss": 0.5055,
+      "step": 2990
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002999917710149043,
+      "loss": 0.4937,
+      "step": 3000
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.5058842301368713,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4441,
+      "eval_samples_per_second": 1.532,
+      "eval_steps_per_second": 0.096,
+      "step": 3000
+    },
+    {
+      "epoch": 8.026666666666667,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0002999917050255193,
+      "loss": 0.5115,
+      "step": 3010
+    },
+    {
+      "epoch": 8.053333333333333,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0002999916387726081,
+      "loss": 0.5175,
+      "step": 3020
+    },
+    {
+      "epoch": 8.08,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.00029999157225617094,
+      "loss": 0.5061,
+      "step": 3030
+    },
+    {
+      "epoch": 8.106666666666667,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.00029999150547620787,
+      "loss": 0.5004,
+      "step": 3040
+    },
+    {
+      "epoch": 8.133333333333333,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.000299991438432719,
+      "loss": 0.5034,
+      "step": 3050
+    },
+    {
+      "epoch": 8.16,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029999137112570444,
+      "loss": 0.5069,
+      "step": 3060
+    },
+    {
+      "epoch": 8.186666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002999913035551643,
+      "loss": 0.4977,
+      "step": 3070
+    },
+    {
+      "epoch": 8.213333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002999912357210987,
+      "loss": 0.494,
+      "step": 3080
+    },
+    {
+      "epoch": 8.24,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029999116762350784,
+      "loss": 0.4971,
+      "step": 3090
+    },
+    {
+      "epoch": 8.266666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029999109926239176,
+      "loss": 0.5005,
+      "step": 3100
+    },
+    {
+      "epoch": 8.293333333333333,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002999910306377506,
+      "loss": 0.4945,
+      "step": 3110
+    },
+    {
+      "epoch": 8.32,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002999909617495844,
+      "loss": 0.4983,
+      "step": 3120
+    },
+    {
+      "epoch": 8.346666666666668,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002999908925978934,
+      "loss": 0.5074,
+      "step": 3130
+    },
+    {
+      "epoch": 8.373333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002999908231826777,
+      "loss": 0.504,
+      "step": 3140
+    },
+    {
+      "epoch": 8.4,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.00029999075350393745,
+      "loss": 0.4984,
+      "step": 3150
+    },
+    {
+      "epoch": 8.426666666666666,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.00029999068356167263,
+      "loss": 0.5046,
+      "step": 3160
+    },
+    {
+      "epoch": 8.453333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029999061335588353,
+      "loss": 0.5083,
+      "step": 3170
+    },
+    {
+      "epoch": 8.48,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029999054288657015,
+      "loss": 0.513,
+      "step": 3180
+    },
+    {
+      "epoch": 8.506666666666666,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002999904721537327,
+      "loss": 0.5038,
+      "step": 3190
+    },
+    {
+      "epoch": 8.533333333333333,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029999040115737124,
+      "loss": 0.4976,
+      "step": 3200
+    },
+    {
+      "epoch": 8.56,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.000299990329897486,
+      "loss": 0.4967,
+      "step": 3210
+    },
+    {
+      "epoch": 8.586666666666666,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002999902583740769,
+      "loss": 0.4925,
+      "step": 3220
+    },
+    {
+      "epoch": 8.613333333333333,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029999018658714426,
+      "loss": 0.4784,
+      "step": 3230
+    },
+    {
+      "epoch": 8.64,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.00029999011453668816,
+      "loss": 0.4819,
+      "step": 3240
+    },
+    {
+      "epoch": 8.666666666666666,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002999900422227087,
+      "loss": 0.4993,
+      "step": 3250
+    },
+    {
+      "epoch": 8.693333333333333,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.00029998996964520594,
+      "loss": 0.4815,
+      "step": 3260
+    },
+    {
+      "epoch": 8.72,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0002999898968041802,
+      "loss": 0.4992,
+      "step": 3270
+    },
+    {
+      "epoch": 8.746666666666666,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002999898236996314,
+      "loss": 0.5047,
+      "step": 3280
+    },
+    {
+      "epoch": 8.773333333333333,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0002999897503315598,
+      "loss": 0.5119,
+      "step": 3290
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0002999896766999655,
+      "loss": 0.498,
+      "step": 3300
+    },
+    {
+      "epoch": 8.826666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002999896028048486,
+      "loss": 0.4932,
+      "step": 3310
+    },
+    {
+      "epoch": 8.853333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029998952864620924,
+      "loss": 0.5022,
+      "step": 3320
+    },
+    {
+      "epoch": 8.88,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029998945422404755,
+      "loss": 0.489,
+      "step": 3330
+    },
+    {
+      "epoch": 8.906666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029998937953836365,
+      "loss": 0.4925,
+      "step": 3340
+    },
+    {
+      "epoch": 8.933333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029998930458915773,
+      "loss": 0.4931,
+      "step": 3350
+    },
+    {
+      "epoch": 8.96,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.00029998922937642987,
+      "loss": 0.4912,
+      "step": 3360
+    },
+    {
+      "epoch": 8.986666666666666,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0002999891539001802,
+      "loss": 0.5072,
+      "step": 3370
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 0.5036345720291138,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3651,
+      "eval_samples_per_second": 1.544,
+      "eval_steps_per_second": 0.096,
+      "step": 3375
+    },
+    {
+      "epoch": 9.013333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002999890781604088,
+      "loss": 0.4946,
+      "step": 3380
+    },
+    {
+      "epoch": 9.04,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.000299989002157116,
+      "loss": 0.5175,
+      "step": 3390
+    },
+    {
+      "epoch": 9.066666666666666,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002999889258903017,
+      "loss": 0.5059,
+      "step": 3400
+    },
+    {
+      "epoch": 9.093333333333334,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002999888493599662,
+      "loss": 0.501,
+      "step": 3410
+    },
+    {
+      "epoch": 9.12,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029998877256610956,
+      "loss": 0.4982,
+      "step": 3420
+    },
+    {
+      "epoch": 9.146666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002999886955087319,
+      "loss": 0.5069,
+      "step": 3430
+    },
+    {
+      "epoch": 9.173333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029998861818783344,
+      "loss": 0.4962,
+      "step": 3440
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029998854060341414,
+      "loss": 0.4945,
+      "step": 3450
+    },
+    {
+      "epoch": 9.226666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029998846275547435,
+      "loss": 0.4934,
+      "step": 3460
+    },
+    {
+      "epoch": 9.253333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029998838464401407,
+      "loss": 0.4909,
+      "step": 3470
+    },
+    {
+      "epoch": 9.28,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0002999883062690335,
+      "loss": 0.5054,
+      "step": 3480
+    },
+    {
+      "epoch": 9.306666666666667,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0002999882276305327,
+      "loss": 0.4859,
+      "step": 3490
+    },
+    {
+      "epoch": 9.333333333333334,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.00029998814872851193,
+      "loss": 0.502,
+      "step": 3500
+    },
+    {
+      "epoch": 9.36,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0002999880695629712,
+      "loss": 0.506,
+      "step": 3510
+    },
+    {
+      "epoch": 9.386666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002999879901339108,
+      "loss": 0.4957,
+      "step": 3520
+    },
+    {
+      "epoch": 9.413333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029998791044133066,
+      "loss": 0.4999,
+      "step": 3530
+    },
+    {
+      "epoch": 9.44,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002999878304852311,
+      "loss": 0.5033,
+      "step": 3540
+    },
+    {
+      "epoch": 9.466666666666667,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0002999877502656122,
+      "loss": 0.5134,
+      "step": 3550
+    },
+    {
+      "epoch": 9.493333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002999876697824741,
+      "loss": 0.5036,
+      "step": 3560
+    },
+    {
+      "epoch": 9.52,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029998758903581696,
+      "loss": 0.498,
+      "step": 3570
+    },
+    {
+      "epoch": 9.546666666666667,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.00029998750802564086,
+      "loss": 0.4943,
+      "step": 3580
+    },
+    {
+      "epoch": 9.573333333333334,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.000299987426751946,
+      "loss": 0.4922,
+      "step": 3590
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029998734521473255,
+      "loss": 0.4867,
+      "step": 3600
+    },
+    {
+      "epoch": 9.626666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029998726341400056,
+      "loss": 0.4718,
+      "step": 3610
+    },
+    {
+      "epoch": 9.653333333333332,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029998718134975025,
+      "loss": 0.493,
+      "step": 3620
+    },
+    {
+      "epoch": 9.68,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0002999870990219818,
+      "loss": 0.4839,
+      "step": 3630
+    },
+    {
+      "epoch": 9.706666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029998701643069517,
+      "loss": 0.4852,
+      "step": 3640
+    },
+    {
+      "epoch": 9.733333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002999869335758907,
+      "loss": 0.5052,
+      "step": 3650
+    },
+    {
+      "epoch": 9.76,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002999868504575685,
+      "loss": 0.5065,
+      "step": 3660
+    },
+    {
+      "epoch": 9.786666666666667,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.00029998676707572864,
+      "loss": 0.5046,
+      "step": 3670
+    },
+    {
+      "epoch": 9.813333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002999866834303713,
+      "loss": 0.4886,
+      "step": 3680
+    },
+    {
+      "epoch": 9.84,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029998659952149665,
+      "loss": 0.4966,
+      "step": 3690
+    },
+    {
+      "epoch": 9.866666666666667,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.00029998651534910483,
+      "loss": 0.494,
+      "step": 3700
+    },
+    {
+      "epoch": 9.893333333333333,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00029998643091319594,
+      "loss": 0.4846,
+      "step": 3710
+    },
+    {
+      "epoch": 9.92,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002999863462137702,
+      "loss": 0.4953,
+      "step": 3720
+    },
+    {
+      "epoch": 9.946666666666667,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00029998626125082774,
+      "loss": 0.4876,
+      "step": 3730
+    },
+    {
+      "epoch": 9.973333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002999861760243687,
+      "loss": 0.5001,
+      "step": 3740
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0002999860905343932,
+      "loss": 0.488,
+      "step": 3750
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 0.5015295147895813,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1184,
+      "eval_samples_per_second": 1.581,
+      "eval_steps_per_second": 0.099,
+      "step": 3750
+    },
+    {
+      "epoch": 10.026666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029998600478090145,
+      "loss": 0.5073,
+      "step": 3760
+    },
+    {
+      "epoch": 10.053333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029998591876389353,
+      "loss": 0.5143,
+      "step": 3770
+    },
+    {
+      "epoch": 10.08,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029998583248336963,
+      "loss": 0.5016,
+      "step": 3780
+    },
+    {
+      "epoch": 10.106666666666667,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0002999857459393299,
+      "loss": 0.4949,
+      "step": 3790
+    },
+    {
+      "epoch": 10.133333333333333,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002999856591317745,
+      "loss": 0.4988,
+      "step": 3800
+    },
+    {
+      "epoch": 10.16,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.00029998557206070356,
+      "loss": 0.5023,
+      "step": 3810
+    },
+    {
+      "epoch": 10.186666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002999854847261173,
+      "loss": 0.4933,
+      "step": 3820
+    },
+    {
+      "epoch": 10.213333333333333,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00029998539712801576,
+      "loss": 0.4891,
+      "step": 3830
+    },
+    {
+      "epoch": 10.24,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029998530926639923,
+      "loss": 0.4924,
+      "step": 3840
+    },
+    {
+      "epoch": 10.266666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002999852211412677,
+      "loss": 0.4957,
+      "step": 3850
+    },
+    {
+      "epoch": 10.293333333333333,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002999851327526215,
+      "loss": 0.49,
+      "step": 3860
+    },
+    {
+      "epoch": 10.32,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029998504410046065,
+      "loss": 0.4942,
+      "step": 3870
+    },
+    {
+      "epoch": 10.346666666666668,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0002999849551847853,
+      "loss": 0.5034,
+      "step": 3880
+    },
+    {
+      "epoch": 10.373333333333333,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002999848660055958,
+      "loss": 0.4996,
+      "step": 3890
+    },
+    {
+      "epoch": 10.4,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029998477656289205,
+      "loss": 0.4943,
+      "step": 3900
+    },
+    {
+      "epoch": 10.426666666666666,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0002999846868566744,
+      "loss": 0.5012,
+      "step": 3910
+    },
+    {
+      "epoch": 10.453333333333333,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.0002999845968869429,
+      "loss": 0.5038,
+      "step": 3920
+    },
+    {
+      "epoch": 10.48,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002999845066536977,
+      "loss": 0.5097,
+      "step": 3930
+    },
+    {
+      "epoch": 10.506666666666666,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002999844161569391,
+      "loss": 0.4986,
+      "step": 3940
+    },
+    {
+      "epoch": 10.533333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029998432539666707,
+      "loss": 0.492,
+      "step": 3950
+    },
+    {
+      "epoch": 10.56,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0002999842343728819,
+      "loss": 0.493,
+      "step": 3960
+    },
+    {
+      "epoch": 10.586666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029998414308558374,
+      "loss": 0.4886,
+      "step": 3970
+    },
+    {
+      "epoch": 10.613333333333333,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.00029998405153477264,
+      "loss": 0.4745,
+      "step": 3980
+    },
+    {
+      "epoch": 10.64,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029998395972044887,
+      "loss": 0.4784,
+      "step": 3990
+    },
+    {
+      "epoch": 10.666666666666666,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0002999838676426126,
+      "loss": 0.4955,
+      "step": 4000
+    },
+    {
+      "epoch": 10.693333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002999837753012639,
+      "loss": 0.4781,
+      "step": 4010
+    },
+    {
+      "epoch": 10.72,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.000299983682696403,
+      "loss": 0.4956,
+      "step": 4020
+    },
+    {
+      "epoch": 10.746666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029998358982803004,
+      "loss": 0.4996,
+      "step": 4030
+    },
+    {
+      "epoch": 10.773333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002999834966961452,
+      "loss": 0.5088,
+      "step": 4040
+    },
+    {
+      "epoch": 10.8,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0002999834033007487,
+      "loss": 0.4941,
+      "step": 4050
+    },
+    {
+      "epoch": 10.826666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002999833096418405,
+      "loss": 0.4892,
+      "step": 4060
+    },
+    {
+      "epoch": 10.853333333333333,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.000299983215719421,
+      "loss": 0.4981,
+      "step": 4070
+    },
+    {
+      "epoch": 10.88,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002999831215334902,
+      "loss": 0.4848,
+      "step": 4080
+    },
+    {
+      "epoch": 10.906666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999830270840484,
+      "loss": 0.4879,
+      "step": 4090
+    },
+    {
+      "epoch": 10.933333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002999829323710957,
+      "loss": 0.4898,
+      "step": 4100
+    },
+    {
+      "epoch": 10.96,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002999828373946322,
+      "loss": 0.4875,
+      "step": 4110
+    },
+    {
+      "epoch": 10.986666666666666,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.0002999827421546582,
+      "loss": 0.5029,
+      "step": 4120
+    },
+    {
+      "epoch": 11.0,
+      "eval_loss": 0.4987983703613281,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4885,
+      "eval_samples_per_second": 1.525,
+      "eval_steps_per_second": 0.095,
+      "step": 4125
+    },
+    {
+      "epoch": 11.013333333333334,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0002999826466511738,
+      "loss": 0.4904,
+      "step": 4130
+    },
+    {
+      "epoch": 11.04,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0002999825508841791,
+      "loss": 0.5141,
+      "step": 4140
+    },
+    {
+      "epoch": 11.066666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029998245485367443,
+      "loss": 0.5024,
+      "step": 4150
+    },
+    {
+      "epoch": 11.093333333333334,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002999823585596598,
+      "loss": 0.4968,
+      "step": 4160
+    },
+    {
+      "epoch": 11.12,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029998226200213544,
+      "loss": 0.4929,
+      "step": 4170
+    },
+    {
+      "epoch": 11.146666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029998216518110156,
+      "loss": 0.5028,
+      "step": 4180
+    },
+    {
+      "epoch": 11.173333333333334,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029998206809655825,
+      "loss": 0.4931,
+      "step": 4190
+    },
+    {
+      "epoch": 11.2,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0002999819707485058,
+      "loss": 0.4912,
+      "step": 4200
+    },
+    {
+      "epoch": 11.226666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002999818731369442,
+      "loss": 0.4901,
+      "step": 4210
+    },
+    {
+      "epoch": 11.253333333333334,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0002999817752618738,
+      "loss": 0.4877,
+      "step": 4220
+    },
+    {
+      "epoch": 11.28,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029998167712329466,
+      "loss": 0.5003,
+      "step": 4230
+    },
+    {
+      "epoch": 11.306666666666667,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.000299981578721207,
+      "loss": 0.4824,
+      "step": 4240
+    },
+    {
+      "epoch": 11.333333333333334,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.000299981480055611,
+      "loss": 0.4988,
+      "step": 4250
+    },
+    {
+      "epoch": 11.36,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002999813811265068,
+      "loss": 0.5023,
+      "step": 4260
+    },
+    {
+      "epoch": 11.386666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029998128193389463,
+      "loss": 0.4926,
+      "step": 4270
+    },
+    {
+      "epoch": 11.413333333333334,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0002999811824777746,
+      "loss": 0.4959,
+      "step": 4280
+    },
+    {
+      "epoch": 11.44,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999810827581469,
+      "loss": 0.5006,
+      "step": 4290
+    },
+    {
+      "epoch": 11.466666666666667,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002999809827750117,
+      "loss": 0.5089,
+      "step": 4300
+    },
+    {
+      "epoch": 11.493333333333334,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002999808825283692,
+      "loss": 0.5001,
+      "step": 4310
+    },
+    {
+      "epoch": 11.52,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029998078201821964,
+      "loss": 0.4944,
+      "step": 4320
+    },
+    {
+      "epoch": 11.546666666666667,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.000299980681244563,
+      "loss": 0.4906,
+      "step": 4330
+    },
+    {
+      "epoch": 11.573333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002999805802073997,
+      "loss": 0.489,
+      "step": 4340
+    },
+    {
+      "epoch": 11.6,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.00029998047890672974,
+      "loss": 0.4838,
+      "step": 4350
+    },
+    {
+      "epoch": 11.626666666666667,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00029998037734255335,
+      "loss": 0.4682,
+      "step": 4360
+    },
+    {
+      "epoch": 11.653333333333332,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029998027551487067,
+      "loss": 0.4906,
+      "step": 4370
+    },
+    {
+      "epoch": 11.68,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.00029998017342368203,
+      "loss": 0.4805,
+      "step": 4380
+    },
+    {
+      "epoch": 11.706666666666667,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029998007106898743,
+      "loss": 0.4818,
+      "step": 4390
+    },
+    {
+      "epoch": 11.733333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029997996845078714,
+      "loss": 0.5014,
+      "step": 4400
+    },
+    {
+      "epoch": 11.76,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029997986556908133,
+      "loss": 0.5031,
+      "step": 4410
+    },
+    {
+      "epoch": 11.786666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029997976242387015,
+      "loss": 0.5007,
+      "step": 4420
+    },
+    {
+      "epoch": 11.813333333333333,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00029997965901515383,
+      "loss": 0.4852,
+      "step": 4430
+    },
+    {
+      "epoch": 11.84,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0002999795553429325,
+      "loss": 0.4927,
+      "step": 4440
+    },
+    {
+      "epoch": 11.866666666666667,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00029997945140720645,
+      "loss": 0.4916,
+      "step": 4450
+    },
+    {
+      "epoch": 11.893333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999793472079757,
+      "loss": 0.4817,
+      "step": 4460
+    },
+    {
+      "epoch": 11.92,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029997924274524054,
+      "loss": 0.4917,
+      "step": 4470
+    },
+    {
+      "epoch": 11.946666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029997913801900114,
+      "loss": 0.4849,
+      "step": 4480
+    },
+    {
+      "epoch": 11.973333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029997903302925763,
+      "loss": 0.4966,
+      "step": 4490
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029997892777601027,
+      "loss": 0.4862,
+      "step": 4500
+    },
+    {
+      "epoch": 12.0,
+      "eval_loss": 0.49906519055366516,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1272,
+      "eval_samples_per_second": 1.58,
+      "eval_steps_per_second": 0.099,
+      "step": 4500
+    },
+    {
+      "epoch": 12.026666666666667,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0002999788222592592,
+      "loss": 0.5031,
+      "step": 4510
+    },
+    {
+      "epoch": 12.053333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002999787164790046,
+      "loss": 0.5101,
+      "step": 4520
+    },
+    {
+      "epoch": 12.08,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.00029997861043524674,
+      "loss": 0.4975,
+      "step": 4530
+    },
+    {
+      "epoch": 12.106666666666667,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002999785041279857,
+      "loss": 0.4926,
+      "step": 4540
+    },
+    {
+      "epoch": 12.133333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002999783975572217,
+      "loss": 0.4959,
+      "step": 4550
+    },
+    {
+      "epoch": 12.16,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0002999782907229549,
+      "loss": 0.4991,
+      "step": 4560
+    },
+    {
+      "epoch": 12.186666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029997818362518554,
+      "loss": 0.49,
+      "step": 4570
+    },
+    {
+      "epoch": 12.213333333333333,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002999780762639138,
+      "loss": 0.4853,
+      "step": 4580
+    },
+    {
+      "epoch": 12.24,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029997796863913986,
+      "loss": 0.4893,
+      "step": 4590
+    },
+    {
+      "epoch": 12.266666666666667,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0002999778607508639,
+      "loss": 0.4931,
+      "step": 4600
+    },
+    {
+      "epoch": 12.293333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002999777525990861,
+      "loss": 0.4867,
+      "step": 4610
+    },
+    {
+      "epoch": 12.32,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002999776441838067,
+      "loss": 0.4908,
+      "step": 4620
+    },
+    {
+      "epoch": 12.346666666666668,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002999775355050259,
+      "loss": 0.5002,
+      "step": 4630
+    },
+    {
+      "epoch": 12.373333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002999774265627438,
+      "loss": 0.4969,
+      "step": 4640
+    },
+    {
+      "epoch": 12.4,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002999773173569606,
+      "loss": 0.4909,
+      "step": 4650
+    },
+    {
+      "epoch": 12.426666666666666,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002999772078876766,
+      "loss": 0.4982,
+      "step": 4660
+    },
+    {
+      "epoch": 12.453333333333333,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002999770981548918,
+      "loss": 0.5013,
+      "step": 4670
+    },
+    {
+      "epoch": 12.48,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002999769881586066,
+      "loss": 0.5066,
+      "step": 4680
+    },
+    {
+      "epoch": 12.506666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002999768778988211,
+      "loss": 0.4964,
+      "step": 4690
+    },
+    {
+      "epoch": 12.533333333333333,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.00029997676737553554,
+      "loss": 0.4901,
+      "step": 4700
+    },
+    {
+      "epoch": 12.56,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029997665658875003,
+      "loss": 0.4899,
+      "step": 4710
+    },
+    {
+      "epoch": 12.586666666666666,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002999765455384649,
+      "loss": 0.4855,
+      "step": 4720
+    },
+    {
+      "epoch": 12.613333333333333,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.00029997643422468017,
+      "loss": 0.4714,
+      "step": 4730
+    },
+    {
+      "epoch": 12.64,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029997632264739613,
+      "loss": 0.4759,
+      "step": 4740
+    },
+    {
+      "epoch": 12.666666666666666,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00029997621080661297,
+      "loss": 0.4927,
+      "step": 4750
+    },
+    {
+      "epoch": 12.693333333333333,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0002999760987023309,
+      "loss": 0.4743,
+      "step": 4760
+    },
+    {
+      "epoch": 12.72,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029997598633455013,
+      "loss": 0.4926,
+      "step": 4770
+    },
+    {
+      "epoch": 12.746666666666666,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0002999758737032707,
+      "loss": 0.4975,
+      "step": 4780
+    },
+    {
+      "epoch": 12.773333333333333,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0002999757608084931,
+      "loss": 0.5057,
+      "step": 4790
+    },
+    {
+      "epoch": 12.8,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029997564765021727,
+      "loss": 0.4902,
+      "step": 4800
+    },
+    {
+      "epoch": 12.826666666666666,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.00029997553422844356,
+      "loss": 0.4851,
+      "step": 4810
+    },
+    {
+      "epoch": 12.853333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002999754205431721,
+      "loss": 0.495,
+      "step": 4820
+    },
+    {
+      "epoch": 12.88,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0002999753065944031,
+      "loss": 0.4817,
+      "step": 4830
+    },
+    {
+      "epoch": 12.906666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002999751923821367,
+      "loss": 0.4842,
+      "step": 4840
+    },
+    {
+      "epoch": 12.933333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002999750779063732,
+      "loss": 0.4862,
+      "step": 4850
+    },
+    {
+      "epoch": 12.96,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999749631671128,
+      "loss": 0.4846,
+      "step": 4860
+    },
+    {
+      "epoch": 12.986666666666666,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002999748481643557,
+      "loss": 0.4995,
+      "step": 4870
+    },
+    {
+      "epoch": 13.0,
+      "eval_loss": 0.49617522954940796,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9648,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 0.1,
+      "step": 4875
+    },
+    {
+      "epoch": 13.013333333333334,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.000299974732898102,
+      "loss": 0.4874,
+      "step": 4880
+    },
+    {
+      "epoch": 13.04,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029997461736835197,
+      "loss": 0.5113,
+      "step": 4890
+    },
+    {
+      "epoch": 13.066666666666666,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.00029997450157510583,
+      "loss": 0.499,
+      "step": 4900
+    },
+    {
+      "epoch": 13.093333333333334,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.00029997438551836376,
+      "loss": 0.4937,
+      "step": 4910
+    },
+    {
+      "epoch": 13.12,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.000299974269198126,
+      "loss": 0.4907,
+      "step": 4920
+    },
+    {
+      "epoch": 13.146666666666667,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.00029997415261439275,
+      "loss": 0.5002,
+      "step": 4930
+    },
+    {
+      "epoch": 13.173333333333334,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029997403576716413,
+      "loss": 0.4902,
+      "step": 4940
+    },
+    {
+      "epoch": 13.2,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029997391865644045,
+      "loss": 0.4879,
+      "step": 4950
+    },
+    {
+      "epoch": 13.226666666666667,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0002999738012822218,
+      "loss": 0.487,
+      "step": 4960
+    },
+    {
+      "epoch": 13.253333333333334,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029997368364450855,
+      "loss": 0.4855,
+      "step": 4970
+    },
+    {
+      "epoch": 13.28,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002999735657433007,
+      "loss": 0.4987,
+      "step": 4980
+    },
+    {
+      "epoch": 13.306666666666667,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029997344757859873,
+      "loss": 0.4802,
+      "step": 4990
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999733291504026,
+      "loss": 0.496,
+      "step": 5000
+    },
+    {
+      "epoch": 13.36,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0002999732104587126,
+      "loss": 0.4988,
+      "step": 5010
+    },
+    {
+      "epoch": 13.386666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029997309150352897,
+      "loss": 0.4898,
+      "step": 5020
+    },
+    {
+      "epoch": 13.413333333333334,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002999729722848519,
+      "loss": 0.4938,
+      "step": 5030
+    },
+    {
+      "epoch": 13.44,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0002999728528026816,
+      "loss": 0.4966,
+      "step": 5040
+    },
+    {
+      "epoch": 13.466666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002999727330570182,
+      "loss": 0.5066,
+      "step": 5050
+    },
+    {
+      "epoch": 13.493333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029997261304786206,
+      "loss": 0.4984,
+      "step": 5060
+    },
+    {
+      "epoch": 13.52,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0002999724927752133,
+      "loss": 0.4918,
+      "step": 5070
+    },
+    {
+      "epoch": 13.546666666666667,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029997237223907216,
+      "loss": 0.4883,
+      "step": 5080
+    },
+    {
+      "epoch": 13.573333333333334,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0002999722514394388,
+      "loss": 0.4858,
+      "step": 5090
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002999721303763135,
+      "loss": 0.4816,
+      "step": 5100
+    },
+    {
+      "epoch": 13.626666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029997200904969645,
+      "loss": 0.4665,
+      "step": 5110
+    },
+    {
+      "epoch": 13.653333333333332,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0002999718874595878,
+      "loss": 0.4872,
+      "step": 5120
+    },
+    {
+      "epoch": 13.68,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999717656059879,
+      "loss": 0.4782,
+      "step": 5130
+    },
+    {
+      "epoch": 13.706666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029997164348889683,
+      "loss": 0.4788,
+      "step": 5140
+    },
+    {
+      "epoch": 13.733333333333333,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.00029997152110831487,
+      "loss": 0.4993,
+      "step": 5150
+    },
+    {
+      "epoch": 13.76,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.00029997139846424224,
+      "loss": 0.5006,
+      "step": 5160
+    },
+    {
+      "epoch": 13.786666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029997127555667915,
+      "loss": 0.4985,
+      "step": 5170
+    },
+    {
+      "epoch": 13.813333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002999711523856258,
+      "loss": 0.4831,
+      "step": 5180
+    },
+    {
+      "epoch": 13.84,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002999710289510824,
+      "loss": 0.49,
+      "step": 5190
+    },
+    {
+      "epoch": 13.866666666666667,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002999709052530492,
+      "loss": 0.4887,
+      "step": 5200
+    },
+    {
+      "epoch": 13.893333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029997078129152636,
+      "loss": 0.4787,
+      "step": 5210
+    },
+    {
+      "epoch": 13.92,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002999706570665142,
+      "loss": 0.4892,
+      "step": 5220
+    },
+    {
+      "epoch": 13.946666666666667,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0002999705325780128,
+      "loss": 0.4829,
+      "step": 5230
+    },
+    {
+      "epoch": 13.973333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029997040782602245,
+      "loss": 0.4936,
+      "step": 5240
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002999702828105434,
+      "loss": 0.4825,
+      "step": 5250
+    },
+    {
+      "epoch": 14.0,
+      "eval_loss": 0.49614495038986206,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9533,
+      "eval_samples_per_second": 1.608,
+      "eval_steps_per_second": 0.1,
+      "step": 5250
+    },
+    {
+      "epoch": 14.026666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002999701575315759,
+      "loss": 0.5015,
+      "step": 5260
+    },
+    {
+      "epoch": 14.053333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029997003198912,
+      "loss": 0.5078,
+      "step": 5270
+    },
+    {
+      "epoch": 14.08,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029996990618317605,
+      "loss": 0.4954,
+      "step": 5280
+    },
+    {
+      "epoch": 14.106666666666667,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0002999697801137443,
+      "loss": 0.4893,
+      "step": 5290
+    },
+    {
+      "epoch": 14.133333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999696537808249,
+      "loss": 0.4927,
+      "step": 5300
+    },
+    {
+      "epoch": 14.16,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002999695271844181,
+      "loss": 0.4963,
+      "step": 5310
+    },
+    {
+      "epoch": 14.186666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029996940032452415,
+      "loss": 0.4881,
+      "step": 5320
+    },
+    {
+      "epoch": 14.213333333333333,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002999692732011432,
+      "loss": 0.4833,
+      "step": 5330
+    },
+    {
+      "epoch": 14.24,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999691458142755,
+      "loss": 0.4867,
+      "step": 5340
+    },
+    {
+      "epoch": 14.266666666666667,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.00029996901816392135,
+      "loss": 0.4908,
+      "step": 5350
+    },
+    {
+      "epoch": 14.293333333333333,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.00029996889025008086,
+      "loss": 0.4845,
+      "step": 5360
+    },
+    {
+      "epoch": 14.32,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0002999687620727543,
+      "loss": 0.4887,
+      "step": 5370
+    },
+    {
+      "epoch": 14.346666666666668,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029996863363194196,
+      "loss": 0.497,
+      "step": 5380
+    },
+    {
+      "epoch": 14.373333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002999685049276439,
+      "loss": 0.4939,
+      "step": 5390
+    },
+    {
+      "epoch": 14.4,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00029996837595986053,
+      "loss": 0.4885,
+      "step": 5400
+    },
+    {
+      "epoch": 14.426666666666666,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.000299968246728592,
+      "loss": 0.4952,
+      "step": 5410
+    },
+    {
+      "epoch": 14.453333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029996811723383853,
+      "loss": 0.4975,
+      "step": 5420
+    },
+    {
+      "epoch": 14.48,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029996798747560036,
+      "loss": 0.5044,
+      "step": 5430
+    },
+    {
+      "epoch": 14.506666666666666,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002999678574538777,
+      "loss": 0.4943,
+      "step": 5440
+    },
+    {
+      "epoch": 14.533333333333333,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0002999677271686708,
+      "loss": 0.4877,
+      "step": 5450
+    },
+    {
+      "epoch": 14.56,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002999675966199799,
+      "loss": 0.4876,
+      "step": 5460
+    },
+    {
+      "epoch": 14.586666666666666,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0002999674658078052,
+      "loss": 0.4833,
+      "step": 5470
+    },
+    {
+      "epoch": 14.613333333333333,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.00029996733473214694,
+      "loss": 0.4686,
+      "step": 5480
+    },
+    {
+      "epoch": 14.64,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.00029996720339300534,
+      "loss": 0.4727,
+      "step": 5490
+    },
+    {
+      "epoch": 14.666666666666666,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002999670717903806,
+      "loss": 0.4904,
+      "step": 5500
+    },
+    {
+      "epoch": 14.693333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029996693992427305,
+      "loss": 0.4725,
+      "step": 5510
+    },
+    {
+      "epoch": 14.72,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.00029996680779468285,
+      "loss": 0.4896,
+      "step": 5520
+    },
+    {
+      "epoch": 14.746666666666666,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002999666754016102,
+      "loss": 0.4947,
+      "step": 5530
+    },
+    {
+      "epoch": 14.773333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002999665427450555,
+      "loss": 0.5023,
+      "step": 5540
+    },
+    {
+      "epoch": 14.8,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029996640982501874,
+      "loss": 0.4885,
+      "step": 5550
+    },
+    {
+      "epoch": 14.826666666666666,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029996627664150027,
+      "loss": 0.4836,
+      "step": 5560
+    },
+    {
+      "epoch": 14.853333333333333,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002999661431945004,
+      "loss": 0.4926,
+      "step": 5570
+    },
+    {
+      "epoch": 14.88,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029996600948401924,
+      "loss": 0.4801,
+      "step": 5580
+    },
+    {
+      "epoch": 14.906666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999658755100571,
+      "loss": 0.4824,
+      "step": 5590
+    },
+    {
+      "epoch": 14.933333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002999657412726142,
+      "loss": 0.4838,
+      "step": 5600
+    },
+    {
+      "epoch": 14.96,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029996560677169075,
+      "loss": 0.4822,
+      "step": 5610
+    },
+    {
+      "epoch": 14.986666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029996547200728706,
+      "loss": 0.4971,
+      "step": 5620
+    },
+    {
+      "epoch": 15.0,
+      "eval_loss": 0.4952986538410187,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7083,
+      "eval_samples_per_second": 1.648,
+      "eval_steps_per_second": 0.103,
+      "step": 5625
+    },
+    {
+      "epoch": 15.013333333333334,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0002999653369794033,
+      "loss": 0.4854,
+      "step": 5630
+    },
+    {
+      "epoch": 15.04,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029996520168803966,
+      "loss": 0.5091,
+      "step": 5640
+    },
+    {
+      "epoch": 15.066666666666666,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999650661331965,
+      "loss": 0.4976,
+      "step": 5650
+    },
+    {
+      "epoch": 15.093333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029996493031487397,
+      "loss": 0.4916,
+      "step": 5660
+    },
+    {
+      "epoch": 15.12,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0002999647942330723,
+      "loss": 0.4879,
+      "step": 5670
+    },
+    {
+      "epoch": 15.146666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002999646578877918,
+      "loss": 0.4976,
+      "step": 5680
+    },
+    {
+      "epoch": 15.173333333333334,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029996452127903264,
+      "loss": 0.4874,
+      "step": 5690
+    },
+    {
+      "epoch": 15.2,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029996438440679514,
+      "loss": 0.486,
+      "step": 5700
+    },
+    {
+      "epoch": 15.226666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002999642472710795,
+      "loss": 0.4837,
+      "step": 5710
+    },
+    {
+      "epoch": 15.253333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002999641098718859,
+      "loss": 0.4825,
+      "step": 5720
+    },
+    {
+      "epoch": 15.28,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.00029996397220921465,
+      "loss": 0.4955,
+      "step": 5730
+    },
+    {
+      "epoch": 15.306666666666667,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.000299963834283066,
+      "loss": 0.4771,
+      "step": 5740
+    },
+    {
+      "epoch": 15.333333333333334,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029996369609344015,
+      "loss": 0.4939,
+      "step": 5750
+    },
+    {
+      "epoch": 15.36,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002999635576403374,
+      "loss": 0.4971,
+      "step": 5760
+    },
+    {
+      "epoch": 15.386666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029996341892375787,
+      "loss": 0.4869,
+      "step": 5770
+    },
+    {
+      "epoch": 15.413333333333334,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029996327994370194,
+      "loss": 0.4904,
+      "step": 5780
+    },
+    {
+      "epoch": 15.44,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029996314070016987,
+      "loss": 0.4949,
+      "step": 5790
+    },
+    {
+      "epoch": 15.466666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029996300119316177,
+      "loss": 0.5034,
+      "step": 5800
+    },
+    {
+      "epoch": 15.493333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029996286142267795,
+      "loss": 0.4953,
+      "step": 5810
+    },
+    {
+      "epoch": 15.52,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002999627213887187,
+      "loss": 0.4899,
+      "step": 5820
+    },
+    {
+      "epoch": 15.546666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029996258109128417,
+      "loss": 0.4865,
+      "step": 5830
+    },
+    {
+      "epoch": 15.573333333333334,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002999624405303747,
+      "loss": 0.4846,
+      "step": 5840
+    },
+    {
+      "epoch": 15.6,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029996229970599047,
+      "loss": 0.4785,
+      "step": 5850
+    },
+    {
+      "epoch": 15.626666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029996215861813173,
+      "loss": 0.4644,
+      "step": 5860
+    },
+    {
+      "epoch": 15.653333333333332,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00029996201726679886,
+      "loss": 0.4848,
+      "step": 5870
+    },
+    {
+      "epoch": 15.68,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002999618756519919,
+      "loss": 0.4757,
+      "step": 5880
+    },
+    {
+      "epoch": 15.706666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029996173377371124,
+      "loss": 0.477,
+      "step": 5890
+    },
+    {
+      "epoch": 15.733333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002999615916319571,
+      "loss": 0.4964,
+      "step": 5900
+    },
+    {
+      "epoch": 15.76,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999614492267297,
+      "loss": 0.4983,
+      "step": 5910
+    },
+    {
+      "epoch": 15.786666666666667,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029996130655802935,
+      "loss": 0.4963,
+      "step": 5920
+    },
+    {
+      "epoch": 15.813333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002999611636258562,
+      "loss": 0.4803,
+      "step": 5930
+    },
+    {
+      "epoch": 15.84,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002999610204302106,
+      "loss": 0.4871,
+      "step": 5940
+    },
+    {
+      "epoch": 15.866666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029996087697109274,
+      "loss": 0.4864,
+      "step": 5950
+    },
+    {
+      "epoch": 15.893333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002999607332485029,
+      "loss": 0.477,
+      "step": 5960
+    },
+    {
+      "epoch": 15.92,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029996058926244135,
+      "loss": 0.4876,
+      "step": 5970
+    },
+    {
+      "epoch": 15.946666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029996044501290835,
+      "loss": 0.4804,
+      "step": 5980
+    },
+    {
+      "epoch": 15.973333333333333,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0002999603004999041,
+      "loss": 0.4919,
+      "step": 5990
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029996015572342883,
+      "loss": 0.4805,
+      "step": 6000
+    },
+    {
+      "epoch": 16.0,
+      "eval_loss": 0.4939233958721161,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1895,
+      "eval_samples_per_second": 1.57,
+      "eval_steps_per_second": 0.098,
+      "step": 6000
+    },
+    {
+      "epoch": 16.026666666666667,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002999600106834829,
+      "loss": 0.5004,
+      "step": 6010
+    },
+    {
+      "epoch": 16.053333333333335,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029995986538006647,
+      "loss": 0.5067,
+      "step": 6020
+    },
+    {
+      "epoch": 16.08,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029995971981317985,
+      "loss": 0.493,
+      "step": 6030
+    },
+    {
+      "epoch": 16.106666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002999595739828232,
+      "loss": 0.4878,
+      "step": 6040
+    },
+    {
+      "epoch": 16.133333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.000299959427888997,
+      "loss": 0.4903,
+      "step": 6050
+    },
+    {
+      "epoch": 16.16,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029995928153170125,
+      "loss": 0.4938,
+      "step": 6060
+    },
+    {
+      "epoch": 16.186666666666667,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0002999591349109364,
+      "loss": 0.4863,
+      "step": 6070
+    },
+    {
+      "epoch": 16.213333333333335,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029995898802670257,
+      "loss": 0.482,
+      "step": 6080
+    },
+    {
+      "epoch": 16.24,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002999588408790001,
+      "loss": 0.4857,
+      "step": 6090
+    },
+    {
+      "epoch": 16.266666666666666,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0002999586934678292,
+      "loss": 0.4879,
+      "step": 6100
+    },
+    {
+      "epoch": 16.293333333333333,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002999585457931902,
+      "loss": 0.4824,
+      "step": 6110
+    },
+    {
+      "epoch": 16.32,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029995839785508326,
+      "loss": 0.4872,
+      "step": 6120
+    },
+    {
+      "epoch": 16.346666666666668,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002999582496535087,
+      "loss": 0.4958,
+      "step": 6130
+    },
+    {
+      "epoch": 16.373333333333335,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029995810118846675,
+      "loss": 0.4914,
+      "step": 6140
+    },
+    {
+      "epoch": 16.4,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029995795245995775,
+      "loss": 0.4861,
+      "step": 6150
+    },
+    {
+      "epoch": 16.426666666666666,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0002999578034679819,
+      "loss": 0.4931,
+      "step": 6160
+    },
+    {
+      "epoch": 16.453333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029995765421253945,
+      "loss": 0.496,
+      "step": 6170
+    },
+    {
+      "epoch": 16.48,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029995750469363064,
+      "loss": 0.5016,
+      "step": 6180
+    },
+    {
+      "epoch": 16.506666666666668,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0002999573549112558,
+      "loss": 0.4907,
+      "step": 6190
+    },
+    {
+      "epoch": 16.533333333333335,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0002999572048654152,
+      "loss": 0.4849,
+      "step": 6200
+    },
+    {
+      "epoch": 16.56,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.000299957054556109,
+      "loss": 0.4857,
+      "step": 6210
+    },
+    {
+      "epoch": 16.586666666666666,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00029995690398333755,
+      "loss": 0.4808,
+      "step": 6220
+    },
+    {
+      "epoch": 16.613333333333333,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0002999567531471011,
+      "loss": 0.467,
+      "step": 6230
+    },
+    {
+      "epoch": 16.64,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.00029995660204739993,
+      "loss": 0.4714,
+      "step": 6240
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0002999564506842343,
+      "loss": 0.4881,
+      "step": 6250
+    },
+    {
+      "epoch": 16.693333333333335,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029995629905760446,
+      "loss": 0.47,
+      "step": 6260
+    },
+    {
+      "epoch": 16.72,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0002999561471675106,
+      "loss": 0.4876,
+      "step": 6270
+    },
+    {
+      "epoch": 16.746666666666666,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029995599501395315,
+      "loss": 0.4933,
+      "step": 6280
+    },
+    {
+      "epoch": 16.773333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029995584259693225,
+      "loss": 0.5008,
+      "step": 6290
+    },
+    {
+      "epoch": 16.8,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.00029995568991644827,
+      "loss": 0.4866,
+      "step": 6300
+    },
+    {
+      "epoch": 16.826666666666668,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002999555369725013,
+      "loss": 0.4811,
+      "step": 6310
+    },
+    {
+      "epoch": 16.85333333333333,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029995538376509183,
+      "loss": 0.4908,
+      "step": 6320
+    },
+    {
+      "epoch": 16.88,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.00029995523029422,
+      "loss": 0.478,
+      "step": 6330
+    },
+    {
+      "epoch": 16.906666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002999550765598861,
+      "loss": 0.4809,
+      "step": 6340
+    },
+    {
+      "epoch": 16.933333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999549225620904,
+      "loss": 0.4826,
+      "step": 6350
+    },
+    {
+      "epoch": 16.96,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002999547683008332,
+      "loss": 0.4808,
+      "step": 6360
+    },
+    {
+      "epoch": 16.986666666666668,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.00029995461377611474,
+      "loss": 0.495,
+      "step": 6370
+    },
+    {
+      "epoch": 17.0,
+      "eval_loss": 0.4929427206516266,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0071,
+      "eval_samples_per_second": 1.599,
+      "eval_steps_per_second": 0.1,
+      "step": 6375
+    },
+    {
+      "epoch": 17.013333333333332,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002999544589879353,
+      "loss": 0.4829,
+      "step": 6380
+    },
+    {
+      "epoch": 17.04,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.00029995430393629514,
+      "loss": 0.5072,
+      "step": 6390
+    },
+    {
+      "epoch": 17.066666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029995414862119457,
+      "loss": 0.495,
+      "step": 6400
+    },
+    {
+      "epoch": 17.093333333333334,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.00029995399304263385,
+      "loss": 0.4899,
+      "step": 6410
+    },
+    {
+      "epoch": 17.12,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0002999538372006132,
+      "loss": 0.4868,
+      "step": 6420
+    },
+    {
+      "epoch": 17.14666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002999536810951329,
+      "loss": 0.4959,
+      "step": 6430
+    },
+    {
+      "epoch": 17.173333333333332,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002999535247261933,
+      "loss": 0.4858,
+      "step": 6440
+    },
+    {
+      "epoch": 17.2,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029995336809379464,
+      "loss": 0.4845,
+      "step": 6450
+    },
+    {
+      "epoch": 17.226666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002999532111979372,
+      "loss": 0.482,
+      "step": 6460
+    },
+    {
+      "epoch": 17.253333333333334,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0002999530540386212,
+      "loss": 0.4812,
+      "step": 6470
+    },
+    {
+      "epoch": 17.28,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029995289661584703,
+      "loss": 0.4944,
+      "step": 6480
+    },
+    {
+      "epoch": 17.306666666666665,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029995273892961485,
+      "loss": 0.4761,
+      "step": 6490
+    },
+    {
+      "epoch": 17.333333333333332,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029995258097992495,
+      "loss": 0.4915,
+      "step": 6500
+    },
+    {
+      "epoch": 17.36,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0002999524227667777,
+      "loss": 0.4946,
+      "step": 6510
+    },
+    {
+      "epoch": 17.386666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029995226429017333,
+      "loss": 0.4857,
+      "step": 6520
+    },
+    {
+      "epoch": 17.413333333333334,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029995210555011203,
+      "loss": 0.489,
+      "step": 6530
+    },
+    {
+      "epoch": 17.44,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999519465465942,
+      "loss": 0.4927,
+      "step": 6540
+    },
+    {
+      "epoch": 17.466666666666665,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999517872796201,
+      "loss": 0.5021,
+      "step": 6550
+    },
+    {
+      "epoch": 17.493333333333332,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029995162774918994,
+      "loss": 0.4933,
+      "step": 6560
+    },
+    {
+      "epoch": 17.52,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0002999514679553041,
+      "loss": 0.4871,
+      "step": 6570
+    },
+    {
+      "epoch": 17.546666666666667,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002999513078979628,
+      "loss": 0.4846,
+      "step": 6580
+    },
+    {
+      "epoch": 17.573333333333334,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00029995114757716627,
+      "loss": 0.4837,
+      "step": 6590
+    },
+    {
+      "epoch": 17.6,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.00029995098699291486,
+      "loss": 0.4777,
+      "step": 6600
+    },
+    {
+      "epoch": 17.626666666666665,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002999508261452089,
+      "loss": 0.4628,
+      "step": 6610
+    },
+    {
+      "epoch": 17.653333333333332,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029995066503404855,
+      "loss": 0.4827,
+      "step": 6620
+    },
+    {
+      "epoch": 17.68,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0002999505036594342,
+      "loss": 0.4752,
+      "step": 6630
+    },
+    {
+      "epoch": 17.706666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002999503420213661,
+      "loss": 0.4753,
+      "step": 6640
+    },
+    {
+      "epoch": 17.733333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002999501801198445,
+      "loss": 0.4949,
+      "step": 6650
+    },
+    {
+      "epoch": 17.76,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002999500179548697,
+      "loss": 0.4968,
+      "step": 6660
+    },
+    {
+      "epoch": 17.786666666666665,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.000299949855526442,
+      "loss": 0.4946,
+      "step": 6670
+    },
+    {
+      "epoch": 17.813333333333333,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002999496928345617,
+      "loss": 0.4783,
+      "step": 6680
+    },
+    {
+      "epoch": 17.84,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.000299949529879229,
+      "loss": 0.4851,
+      "step": 6690
+    },
+    {
+      "epoch": 17.866666666666667,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002999493666604443,
+      "loss": 0.4849,
+      "step": 6700
+    },
+    {
+      "epoch": 17.893333333333334,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00029994920317820786,
+      "loss": 0.4752,
+      "step": 6710
+    },
+    {
+      "epoch": 17.92,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002999490394325199,
+      "loss": 0.4861,
+      "step": 6720
+    },
+    {
+      "epoch": 17.946666666666665,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029994887542338084,
+      "loss": 0.4786,
+      "step": 6730
+    },
+    {
+      "epoch": 17.973333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002999487111507908,
+      "loss": 0.4904,
+      "step": 6740
+    },
+    {
+      "epoch": 18.0,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002999485466147502,
+      "loss": 0.4793,
+      "step": 6750
+    },
+    {
+      "epoch": 18.0,
+      "eval_loss": 0.49115684628486633,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9214,
+      "eval_samples_per_second": 1.613,
+      "eval_steps_per_second": 0.101,
+      "step": 6750
+    },
+    {
+      "epoch": 18.026666666666667,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029994838181525923,
+      "loss": 0.4974,
+      "step": 6760
+    },
+    {
+      "epoch": 18.053333333333335,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002999482167523182,
+      "loss": 0.5044,
+      "step": 6770
+    },
+    {
+      "epoch": 18.08,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002999480514259275,
+      "loss": 0.4917,
+      "step": 6780
+    },
+    {
+      "epoch": 18.106666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002999478858360873,
+      "loss": 0.4856,
+      "step": 6790
+    },
+    {
+      "epoch": 18.133333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000299947719982798,
+      "loss": 0.4884,
+      "step": 6800
+    },
+    {
+      "epoch": 18.16,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002999475538660598,
+      "loss": 0.493,
+      "step": 6810
+    },
+    {
+      "epoch": 18.186666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00029994738748587304,
+      "loss": 0.4845,
+      "step": 6820
+    },
+    {
+      "epoch": 18.213333333333335,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029994722084223795,
+      "loss": 0.4797,
+      "step": 6830
+    },
+    {
+      "epoch": 18.24,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002999470539351549,
+      "loss": 0.4836,
+      "step": 6840
+    },
+    {
+      "epoch": 18.266666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002999468867646241,
+      "loss": 0.4869,
+      "step": 6850
+    },
+    {
+      "epoch": 18.293333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.000299946719330646,
+      "loss": 0.4809,
+      "step": 6860
+    },
+    {
+      "epoch": 18.32,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999465516332207,
+      "loss": 0.485,
+      "step": 6870
+    },
+    {
+      "epoch": 18.346666666666668,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029994638367234866,
+      "loss": 0.4931,
+      "step": 6880
+    },
+    {
+      "epoch": 18.373333333333335,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029994621544803,
+      "loss": 0.4901,
+      "step": 6890
+    },
+    {
+      "epoch": 18.4,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002999460469602652,
+      "loss": 0.4846,
+      "step": 6900
+    },
+    {
+      "epoch": 18.426666666666666,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029994587820905444,
+      "loss": 0.4916,
+      "step": 6910
+    },
+    {
+      "epoch": 18.453333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029994570919439806,
+      "loss": 0.4941,
+      "step": 6920
+    },
+    {
+      "epoch": 18.48,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002999455399162963,
+      "loss": 0.4998,
+      "step": 6930
+    },
+    {
+      "epoch": 18.506666666666668,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029994537037474956,
+      "loss": 0.4897,
+      "step": 6940
+    },
+    {
+      "epoch": 18.533333333333335,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002999452005697581,
+      "loss": 0.4836,
+      "step": 6950
+    },
+    {
+      "epoch": 18.56,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029994503050132215,
+      "loss": 0.4835,
+      "step": 6960
+    },
+    {
+      "epoch": 18.586666666666666,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029994486016944204,
+      "loss": 0.48,
+      "step": 6970
+    },
+    {
+      "epoch": 18.613333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002999446895741181,
+      "loss": 0.4654,
+      "step": 6980
+    },
+    {
+      "epoch": 18.64,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029994451871535063,
+      "loss": 0.4698,
+      "step": 6990
+    },
+    {
+      "epoch": 18.666666666666668,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.00029994434759313994,
+      "loss": 0.4857,
+      "step": 7000
+    },
+    {
+      "epoch": 18.693333333333335,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0002999441762074863,
+      "loss": 0.4688,
+      "step": 7010
+    },
+    {
+      "epoch": 18.72,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002999440045583899,
+      "loss": 0.4867,
+      "step": 7020
+    },
+    {
+      "epoch": 18.746666666666666,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.00029994383264585135,
+      "loss": 0.4909,
+      "step": 7030
+    },
+    {
+      "epoch": 18.773333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029994366046987063,
+      "loss": 0.4979,
+      "step": 7040
+    },
+    {
+      "epoch": 18.8,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0002999434880304482,
+      "loss": 0.4848,
+      "step": 7050
+    },
+    {
+      "epoch": 18.826666666666668,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029994331532758435,
+      "loss": 0.4798,
+      "step": 7060
+    },
+    {
+      "epoch": 18.85333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029994314236127934,
+      "loss": 0.4898,
+      "step": 7070
+    },
+    {
+      "epoch": 18.88,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002999429691315335,
+      "loss": 0.4763,
+      "step": 7080
+    },
+    {
+      "epoch": 18.906666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029994279563834723,
+      "loss": 0.4788,
+      "step": 7090
+    },
+    {
+      "epoch": 18.933333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999426218817207,
+      "loss": 0.4809,
+      "step": 7100
+    },
+    {
+      "epoch": 18.96,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002999424478616542,
+      "loss": 0.4789,
+      "step": 7110
+    },
+    {
+      "epoch": 18.986666666666668,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002999422735781482,
+      "loss": 0.4933,
+      "step": 7120
+    },
+    {
+      "epoch": 19.0,
+      "eval_loss": 0.491379976272583,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9578,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 0.1,
+      "step": 7125
+    },
+    {
+      "epoch": 19.013333333333332,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0002999420990312028,
+      "loss": 0.4821,
+      "step": 7130
+    },
+    {
+      "epoch": 19.04,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029994192422081844,
+      "loss": 0.506,
+      "step": 7140
+    },
+    {
+      "epoch": 19.066666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002999417491469954,
+      "loss": 0.4937,
+      "step": 7150
+    },
+    {
+      "epoch": 19.093333333333334,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.00029994157380973396,
+      "loss": 0.4883,
+      "step": 7160
+    },
+    {
+      "epoch": 19.12,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029994139820903445,
+      "loss": 0.4846,
+      "step": 7170
+    },
+    {
+      "epoch": 19.14666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029994122234489723,
+      "loss": 0.4944,
+      "step": 7180
+    },
+    {
+      "epoch": 19.173333333333332,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002999410462173225,
+      "loss": 0.4843,
+      "step": 7190
+    },
+    {
+      "epoch": 19.2,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002999408698263107,
+      "loss": 0.4829,
+      "step": 7200
+    },
+    {
+      "epoch": 19.226666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.000299940693171862,
+      "loss": 0.4801,
+      "step": 7210
+    },
+    {
+      "epoch": 19.253333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002999405162539768,
+      "loss": 0.4794,
+      "step": 7220
+    },
+    {
+      "epoch": 19.28,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002999403390726554,
+      "loss": 0.493,
+      "step": 7230
+    },
+    {
+      "epoch": 19.306666666666665,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029994016162789803,
+      "loss": 0.4751,
+      "step": 7240
+    },
+    {
+      "epoch": 19.333333333333332,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029993998391970514,
+      "loss": 0.4898,
+      "step": 7250
+    },
+    {
+      "epoch": 19.36,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.000299939805948077,
+      "loss": 0.4928,
+      "step": 7260
+    },
+    {
+      "epoch": 19.386666666666667,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002999396277130138,
+      "loss": 0.4836,
+      "step": 7270
+    },
+    {
+      "epoch": 19.413333333333334,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029993944921451605,
+      "loss": 0.4873,
+      "step": 7280
+    },
+    {
+      "epoch": 19.44,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029993927045258393,
+      "loss": 0.4913,
+      "step": 7290
+    },
+    {
+      "epoch": 19.466666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002999390914272178,
+      "loss": 0.5015,
+      "step": 7300
+    },
+    {
+      "epoch": 19.493333333333332,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002999389121384179,
+      "loss": 0.4915,
+      "step": 7310
+    },
+    {
+      "epoch": 19.52,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002999387325861846,
+      "loss": 0.4858,
+      "step": 7320
+    },
+    {
+      "epoch": 19.546666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999385527705183,
+      "loss": 0.4827,
+      "step": 7330
+    },
+    {
+      "epoch": 19.573333333333334,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0002999383726914192,
+      "loss": 0.4814,
+      "step": 7340
+    },
+    {
+      "epoch": 19.6,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029993819234888766,
+      "loss": 0.4755,
+      "step": 7350
+    },
+    {
+      "epoch": 19.626666666666665,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.000299938011742924,
+      "loss": 0.4616,
+      "step": 7360
+    },
+    {
+      "epoch": 19.653333333333332,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.00029993783087352847,
+      "loss": 0.4811,
+      "step": 7370
+    },
+    {
+      "epoch": 19.68,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0002999376497407015,
+      "loss": 0.4728,
+      "step": 7380
+    },
+    {
+      "epoch": 19.706666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029993746834444337,
+      "loss": 0.4731,
+      "step": 7390
+    },
+    {
+      "epoch": 19.733333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029993728668475434,
+      "loss": 0.4937,
+      "step": 7400
+    },
+    {
+      "epoch": 19.76,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002999371047616348,
+      "loss": 0.495,
+      "step": 7410
+    },
+    {
+      "epoch": 19.786666666666665,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.000299936922575085,
+      "loss": 0.4934,
+      "step": 7420
+    },
+    {
+      "epoch": 19.813333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029993674012510535,
+      "loss": 0.4772,
+      "step": 7430
+    },
+    {
+      "epoch": 19.84,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002999365574116961,
+      "loss": 0.4843,
+      "step": 7440
+    },
+    {
+      "epoch": 19.866666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029993637443485763,
+      "loss": 0.4831,
+      "step": 7450
+    },
+    {
+      "epoch": 19.893333333333334,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029993619119459016,
+      "loss": 0.4739,
+      "step": 7460
+    },
+    {
+      "epoch": 19.92,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029993600769089414,
+      "loss": 0.4844,
+      "step": 7470
+    },
+    {
+      "epoch": 19.946666666666665,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029993582392376985,
+      "loss": 0.4779,
+      "step": 7480
+    },
+    {
+      "epoch": 19.973333333333333,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002999356398932175,
+      "loss": 0.4884,
+      "step": 7490
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029993545559923757,
+      "loss": 0.4784,
+      "step": 7500
+    },
+    {
+      "epoch": 20.0,
+      "eval_loss": 0.4901997447013855,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2864,
+      "eval_samples_per_second": 1.555,
+      "eval_steps_per_second": 0.097,
+      "step": 7500
+    },
+    {
+      "epoch": 20.026666666666667,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029993527104183024,
+      "loss": 0.4961,
+      "step": 7510
+    },
+    {
+      "epoch": 20.053333333333335,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029993508622099603,
+      "loss": 0.5029,
+      "step": 7520
+    },
+    {
+      "epoch": 20.08,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002999349011367351,
+      "loss": 0.4897,
+      "step": 7530
+    },
+    {
+      "epoch": 20.106666666666666,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002999347157890478,
+      "loss": 0.4848,
+      "step": 7540
+    },
+    {
+      "epoch": 20.133333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002999345301779345,
+      "loss": 0.4869,
+      "step": 7550
+    },
+    {
+      "epoch": 20.16,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002999343443033955,
+      "loss": 0.4904,
+      "step": 7560
+    },
+    {
+      "epoch": 20.186666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002999341581654311,
+      "loss": 0.483,
+      "step": 7570
+    },
+    {
+      "epoch": 20.213333333333335,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0002999339717640417,
+      "loss": 0.4778,
+      "step": 7580
+    },
+    {
+      "epoch": 20.24,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029993378509922755,
+      "loss": 0.4822,
+      "step": 7590
+    },
+    {
+      "epoch": 20.266666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999335981709891,
+      "loss": 0.4857,
+      "step": 7600
+    },
+    {
+      "epoch": 20.293333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029993341097932653,
+      "loss": 0.4799,
+      "step": 7610
+    },
+    {
+      "epoch": 20.32,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029993322352424024,
+      "loss": 0.4829,
+      "step": 7620
+    },
+    {
+      "epoch": 20.346666666666668,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999330358057305,
+      "loss": 0.4917,
+      "step": 7630
+    },
+    {
+      "epoch": 20.373333333333335,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029993284782379775,
+      "loss": 0.4881,
+      "step": 7640
+    },
+    {
+      "epoch": 20.4,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029993265957844225,
+      "loss": 0.4839,
+      "step": 7650
+    },
+    {
+      "epoch": 20.426666666666666,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.00029993247106966433,
+      "loss": 0.4897,
+      "step": 7660
+    },
+    {
+      "epoch": 20.453333333333333,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029993228229746436,
+      "loss": 0.493,
+      "step": 7670
+    },
+    {
+      "epoch": 20.48,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0002999320932618426,
+      "loss": 0.4993,
+      "step": 7680
+    },
+    {
+      "epoch": 20.506666666666668,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029993190396279944,
+      "loss": 0.4879,
+      "step": 7690
+    },
+    {
+      "epoch": 20.533333333333335,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002999317144003353,
+      "loss": 0.4815,
+      "step": 7700
+    },
+    {
+      "epoch": 20.56,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00029993152457445026,
+      "loss": 0.4829,
+      "step": 7710
+    },
+    {
+      "epoch": 20.586666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002999313344851449,
+      "loss": 0.4788,
+      "step": 7720
+    },
+    {
+      "epoch": 20.613333333333333,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002999311441324194,
+      "loss": 0.4645,
+      "step": 7730
+    },
+    {
+      "epoch": 20.64,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002999309535162742,
+      "loss": 0.4677,
+      "step": 7740
+    },
+    {
+      "epoch": 20.666666666666668,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029993076263670954,
+      "loss": 0.4856,
+      "step": 7750
+    },
+    {
+      "epoch": 20.693333333333335,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002999305714937259,
+      "loss": 0.4681,
+      "step": 7760
+    },
+    {
+      "epoch": 20.72,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029993038008732344,
+      "loss": 0.4846,
+      "step": 7770
+    },
+    {
+      "epoch": 20.746666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002999301884175026,
+      "loss": 0.4896,
+      "step": 7780
+    },
+    {
+      "epoch": 20.773333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999299964842637,
+      "loss": 0.4977,
+      "step": 7790
+    },
+    {
+      "epoch": 20.8,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029992980428760705,
+      "loss": 0.4839,
+      "step": 7800
+    },
+    {
+      "epoch": 20.826666666666668,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000299929611827533,
+      "loss": 0.4784,
+      "step": 7810
+    },
+    {
+      "epoch": 20.85333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029992941910404194,
+      "loss": 0.4881,
+      "step": 7820
+    },
+    {
+      "epoch": 20.88,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002999292261171341,
+      "loss": 0.4759,
+      "step": 7830
+    },
+    {
+      "epoch": 20.906666666666666,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.00029992903286680996,
+      "loss": 0.4779,
+      "step": 7840
+    },
+    {
+      "epoch": 20.933333333333334,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999288393530698,
+      "loss": 0.4793,
+      "step": 7850
+    },
+    {
+      "epoch": 20.96,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002999286455759139,
+      "loss": 0.4772,
+      "step": 7860
+    },
+    {
+      "epoch": 20.986666666666668,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029992845153534257,
+      "loss": 0.4912,
+      "step": 7870
+    },
+    {
+      "epoch": 21.0,
+      "eval_loss": 0.4904196858406067,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3912,
+      "eval_samples_per_second": 1.54,
+      "eval_steps_per_second": 0.096,
+      "step": 7875
+    },
+    {
+      "epoch": 21.013333333333332,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002999282572313563,
+      "loss": 0.4806,
+      "step": 7880
+    },
+    {
+      "epoch": 21.04,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002999280626639554,
+      "loss": 0.5045,
+      "step": 7890
+    },
+    {
+      "epoch": 21.066666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029992786783314006,
+      "loss": 0.4914,
+      "step": 7900
+    },
+    {
+      "epoch": 21.093333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002999276727389108,
+      "loss": 0.4867,
+      "step": 7910
+    },
+    {
+      "epoch": 21.12,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002999274773812679,
+      "loss": 0.4835,
+      "step": 7920
+    },
+    {
+      "epoch": 21.14666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029992728176021164,
+      "loss": 0.4923,
+      "step": 7930
+    },
+    {
+      "epoch": 21.173333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029992708587574246,
+      "loss": 0.4825,
+      "step": 7940
+    },
+    {
+      "epoch": 21.2,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029992688972786067,
+      "loss": 0.481,
+      "step": 7950
+    },
+    {
+      "epoch": 21.226666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002999266933165666,
+      "loss": 0.4785,
+      "step": 7960
+    },
+    {
+      "epoch": 21.253333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002999264966418606,
+      "loss": 0.4783,
+      "step": 7970
+    },
+    {
+      "epoch": 21.28,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029992629970374305,
+      "loss": 0.4913,
+      "step": 7980
+    },
+    {
+      "epoch": 21.306666666666665,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029992610250221424,
+      "loss": 0.4732,
+      "step": 7990
+    },
+    {
+      "epoch": 21.333333333333332,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.00029992590503727455,
+      "loss": 0.4896,
+      "step": 8000
+    },
+    {
+      "epoch": 21.36,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0002999257073089243,
+      "loss": 0.4911,
+      "step": 8010
+    },
+    {
+      "epoch": 21.386666666666667,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029992550931716384,
+      "loss": 0.4825,
+      "step": 8020
+    },
+    {
+      "epoch": 21.413333333333334,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002999253110619936,
+      "loss": 0.4866,
+      "step": 8030
+    },
+    {
+      "epoch": 21.44,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0002999251125434138,
+      "loss": 0.4895,
+      "step": 8040
+    },
+    {
+      "epoch": 21.466666666666665,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0002999249137614249,
+      "loss": 0.4996,
+      "step": 8050
+    },
+    {
+      "epoch": 21.493333333333332,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.00029992471471602716,
+      "loss": 0.4904,
+      "step": 8060
+    },
+    {
+      "epoch": 21.52,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.000299924515407221,
+      "loss": 0.4855,
+      "step": 8070
+    },
+    {
+      "epoch": 21.546666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029992431583500677,
+      "loss": 0.4822,
+      "step": 8080
+    },
+    {
+      "epoch": 21.573333333333334,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002999241159993847,
+      "loss": 0.4808,
+      "step": 8090
+    },
+    {
+      "epoch": 21.6,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0002999239159003553,
+      "loss": 0.4746,
+      "step": 8100
+    },
+    {
+      "epoch": 21.626666666666665,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029992371553791885,
+      "loss": 0.4611,
+      "step": 8110
+    },
+    {
+      "epoch": 21.653333333333332,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029992351491207566,
+      "loss": 0.4807,
+      "step": 8120
+    },
+    {
+      "epoch": 21.68,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0002999233140228262,
+      "loss": 0.4716,
+      "step": 8130
+    },
+    {
+      "epoch": 21.706666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002999231128701707,
+      "loss": 0.4721,
+      "step": 8140
+    },
+    {
+      "epoch": 21.733333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002999229114541096,
+      "loss": 0.4922,
+      "step": 8150
+    },
+    {
+      "epoch": 21.76,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002999227097746432,
+      "loss": 0.4929,
+      "step": 8160
+    },
+    {
+      "epoch": 21.786666666666665,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029992250783177185,
+      "loss": 0.4918,
+      "step": 8170
+    },
+    {
+      "epoch": 21.813333333333333,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029992230562549593,
+      "loss": 0.4768,
+      "step": 8180
+    },
+    {
+      "epoch": 21.84,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002999221031558158,
+      "loss": 0.4829,
+      "step": 8190
+    },
+    {
+      "epoch": 21.866666666666667,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029992190042273185,
+      "loss": 0.4828,
+      "step": 8200
+    },
+    {
+      "epoch": 21.893333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029992169742624434,
+      "loss": 0.4728,
+      "step": 8210
+    },
+    {
+      "epoch": 21.92,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002999214941663537,
+      "loss": 0.4837,
+      "step": 8220
+    },
+    {
+      "epoch": 21.946666666666665,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029992129064306027,
+      "loss": 0.4761,
+      "step": 8230
+    },
+    {
+      "epoch": 21.973333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002999210868563644,
+      "loss": 0.4872,
+      "step": 8240
+    },
+    {
+      "epoch": 22.0,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.00029992088280626647,
+      "loss": 0.4761,
+      "step": 8250
+    },
+    {
+      "epoch": 22.0,
+      "eval_loss": 0.48901429772377014,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.4555,
+      "eval_samples_per_second": 1.397,
+      "eval_steps_per_second": 0.087,
+      "step": 8250
+    },
+    {
+      "epoch": 22.026666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002999206784927668,
+      "loss": 0.4955,
+      "step": 8260
+    },
+    {
+      "epoch": 22.053333333333335,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002999204739158658,
+      "loss": 0.5019,
+      "step": 8270
+    },
+    {
+      "epoch": 22.08,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002999202690755638,
+      "loss": 0.4882,
+      "step": 8280
+    },
+    {
+      "epoch": 22.106666666666666,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029992006397186114,
+      "loss": 0.483,
+      "step": 8290
+    },
+    {
+      "epoch": 22.133333333333333,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.00029991985860475825,
+      "loss": 0.4861,
+      "step": 8300
+    },
+    {
+      "epoch": 22.16,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002999196529742554,
+      "loss": 0.49,
+      "step": 8310
+    },
+    {
+      "epoch": 22.186666666666667,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.000299919447080353,
+      "loss": 0.4817,
+      "step": 8320
+    },
+    {
+      "epoch": 22.213333333333335,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002999192409230514,
+      "loss": 0.4765,
+      "step": 8330
+    },
+    {
+      "epoch": 22.24,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029991903450235095,
+      "loss": 0.4801,
+      "step": 8340
+    },
+    {
+      "epoch": 22.266666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029991882781825203,
+      "loss": 0.4845,
+      "step": 8350
+    },
+    {
+      "epoch": 22.293333333333333,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.000299918620870755,
+      "loss": 0.4779,
+      "step": 8360
+    },
+    {
+      "epoch": 22.32,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002999184136598603,
+      "loss": 0.4822,
+      "step": 8370
+    },
+    {
+      "epoch": 22.346666666666668,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029991820618556817,
+      "loss": 0.4911,
+      "step": 8380
+    },
+    {
+      "epoch": 22.373333333333335,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002999179984478791,
+      "loss": 0.4883,
+      "step": 8390
+    },
+    {
+      "epoch": 22.4,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0002999177904467933,
+      "loss": 0.4824,
+      "step": 8400
+    },
+    {
+      "epoch": 22.426666666666666,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029991758218231117,
+      "loss": 0.4892,
+      "step": 8410
+    },
+    {
+      "epoch": 22.453333333333333,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002999173736544332,
+      "loss": 0.491,
+      "step": 8420
+    },
+    {
+      "epoch": 22.48,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029991716486315966,
+      "loss": 0.4978,
+      "step": 8430
+    },
+    {
+      "epoch": 22.506666666666668,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002999169558084909,
+      "loss": 0.487,
+      "step": 8440
+    },
+    {
+      "epoch": 22.533333333333335,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029991674649042737,
+      "loss": 0.4803,
+      "step": 8450
+    },
+    {
+      "epoch": 22.56,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0002999165369089694,
+      "loss": 0.4812,
+      "step": 8460
+    },
+    {
+      "epoch": 22.586666666666666,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002999163270641173,
+      "loss": 0.4774,
+      "step": 8470
+    },
+    {
+      "epoch": 22.613333333333333,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0002999161169558715,
+      "loss": 0.4635,
+      "step": 8480
+    },
+    {
+      "epoch": 22.64,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029991590658423237,
+      "loss": 0.4673,
+      "step": 8490
+    },
+    {
+      "epoch": 22.666666666666668,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002999156959492003,
+      "loss": 0.4844,
+      "step": 8500
+    },
+    {
+      "epoch": 22.693333333333335,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002999154850507756,
+      "loss": 0.4666,
+      "step": 8510
+    },
+    {
+      "epoch": 22.72,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002999152738889586,
+      "loss": 0.4839,
+      "step": 8520
+    },
+    {
+      "epoch": 22.746666666666666,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.00029991506246374977,
+      "loss": 0.4882,
+      "step": 8530
+    },
+    {
+      "epoch": 22.773333333333333,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029991485077514947,
+      "loss": 0.4964,
+      "step": 8540
+    },
+    {
+      "epoch": 22.8,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029991463882315803,
+      "loss": 0.4831,
+      "step": 8550
+    },
+    {
+      "epoch": 22.826666666666668,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029991442660777594,
+      "loss": 0.4767,
+      "step": 8560
+    },
+    {
+      "epoch": 22.85333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029991421412900335,
+      "loss": 0.487,
+      "step": 8570
+    },
+    {
+      "epoch": 22.88,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002999140013868408,
+      "loss": 0.4743,
+      "step": 8580
+    },
+    {
+      "epoch": 22.906666666666666,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029991378838128865,
+      "loss": 0.4764,
+      "step": 8590
+    },
+    {
+      "epoch": 22.933333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002999135751123472,
+      "loss": 0.4785,
+      "step": 8600
+    },
+    {
+      "epoch": 22.96,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002999133615800169,
+      "loss": 0.4761,
+      "step": 8610
+    },
+    {
+      "epoch": 22.986666666666668,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029991314778429816,
+      "loss": 0.4899,
+      "step": 8620
+    },
+    {
+      "epoch": 23.0,
+      "eval_loss": 0.4898934066295624,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.792,
+      "eval_samples_per_second": 1.634,
+      "eval_steps_per_second": 0.102,
+      "step": 8625
+    },
+    {
+      "epoch": 23.013333333333332,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002999129337251912,
+      "loss": 0.4788,
+      "step": 8630
+    },
+    {
+      "epoch": 23.04,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029991271940269654,
+      "loss": 0.5037,
+      "step": 8640
+    },
+    {
+      "epoch": 23.066666666666666,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029991250481681443,
+      "loss": 0.4914,
+      "step": 8650
+    },
+    {
+      "epoch": 23.093333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002999122899675454,
+      "loss": 0.4859,
+      "step": 8660
+    },
+    {
+      "epoch": 23.12,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002999120748548897,
+      "loss": 0.4819,
+      "step": 8670
+    },
+    {
+      "epoch": 23.14666666666667,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029991185947884783,
+      "loss": 0.4914,
+      "step": 8680
+    },
+    {
+      "epoch": 23.173333333333332,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029991164383942,
+      "loss": 0.4811,
+      "step": 8690
+    },
+    {
+      "epoch": 23.2,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029991142793660676,
+      "loss": 0.4803,
+      "step": 8700
+    },
+    {
+      "epoch": 23.226666666666667,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002999112117704084,
+      "loss": 0.4783,
+      "step": 8710
+    },
+    {
+      "epoch": 23.253333333333334,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0002999109953408253,
+      "loss": 0.4779,
+      "step": 8720
+    },
+    {
+      "epoch": 23.28,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029991077864785785,
+      "loss": 0.4907,
+      "step": 8730
+    },
+    {
+      "epoch": 23.306666666666665,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002999105616915065,
+      "loss": 0.4724,
+      "step": 8740
+    },
+    {
+      "epoch": 23.333333333333332,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002999103444717715,
+      "loss": 0.4873,
+      "step": 8750
+    },
+    {
+      "epoch": 23.36,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002999101269886533,
+      "loss": 0.4905,
+      "step": 8760
+    },
+    {
+      "epoch": 23.386666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002999099092421523,
+      "loss": 0.4823,
+      "step": 8770
+    },
+    {
+      "epoch": 23.413333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002999096912322688,
+      "loss": 0.4855,
+      "step": 8780
+    },
+    {
+      "epoch": 23.44,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029990947295900335,
+      "loss": 0.4893,
+      "step": 8790
+    },
+    {
+      "epoch": 23.466666666666665,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002999092544223562,
+      "loss": 0.4976,
+      "step": 8800
+    },
+    {
+      "epoch": 23.493333333333332,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002999090356223277,
+      "loss": 0.489,
+      "step": 8810
+    },
+    {
+      "epoch": 23.52,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.00029990881655891834,
+      "loss": 0.4833,
+      "step": 8820
+    },
+    {
+      "epoch": 23.546666666666667,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029990859723212843,
+      "loss": 0.4801,
+      "step": 8830
+    },
+    {
+      "epoch": 23.573333333333334,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002999083776419584,
+      "loss": 0.4784,
+      "step": 8840
+    },
+    {
+      "epoch": 23.6,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002999081577884087,
+      "loss": 0.4733,
+      "step": 8850
+    },
+    {
+      "epoch": 23.626666666666665,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029990793767147955,
+      "loss": 0.4594,
+      "step": 8860
+    },
+    {
+      "epoch": 23.653333333333332,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002999077172911715,
+      "loss": 0.4795,
+      "step": 8870
+    },
+    {
+      "epoch": 23.68,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002999074966474848,
+      "loss": 0.4707,
+      "step": 8880
+    },
+    {
+      "epoch": 23.706666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029990727574041995,
+      "loss": 0.472,
+      "step": 8890
+    },
+    {
+      "epoch": 23.733333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029990705456997725,
+      "loss": 0.4913,
+      "step": 8900
+    },
+    {
+      "epoch": 23.76,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029990683313615713,
+      "loss": 0.4927,
+      "step": 8910
+    },
+    {
+      "epoch": 23.786666666666665,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029990661143895997,
+      "loss": 0.491,
+      "step": 8920
+    },
+    {
+      "epoch": 23.813333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999063894783862,
+      "loss": 0.4746,
+      "step": 8930
+    },
+    {
+      "epoch": 23.84,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029990616725443617,
+      "loss": 0.4818,
+      "step": 8940
+    },
+    {
+      "epoch": 23.866666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029990594476711024,
+      "loss": 0.4813,
+      "step": 8950
+    },
+    {
+      "epoch": 23.893333333333334,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002999057220164089,
+      "loss": 0.4719,
+      "step": 8960
+    },
+    {
+      "epoch": 23.92,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002999054990023324,
+      "loss": 0.4812,
+      "step": 8970
+    },
+    {
+      "epoch": 23.946666666666665,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002999052757248813,
+      "loss": 0.4743,
+      "step": 8980
+    },
+    {
+      "epoch": 23.973333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029990505218405585,
+      "loss": 0.4864,
+      "step": 8990
+    },
+    {
+      "epoch": 24.0,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002999048283798565,
+      "loss": 0.4751,
+      "step": 9000
+    },
+    {
+      "epoch": 24.0,
+      "eval_loss": 0.4884170591831207,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8738,
+      "eval_samples_per_second": 1.62,
+      "eval_steps_per_second": 0.101,
+      "step": 9000
+    },
+    {
+      "epoch": 24.026666666666667,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002999046043122837,
+      "loss": 0.494,
+      "step": 9010
+    },
+    {
+      "epoch": 24.053333333333335,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029990437998133766,
+      "loss": 0.5008,
+      "step": 9020
+    },
+    {
+      "epoch": 24.08,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.000299904155387019,
+      "loss": 0.4875,
+      "step": 9030
+    },
+    {
+      "epoch": 24.106666666666666,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.000299903930529328,
+      "loss": 0.4823,
+      "step": 9040
+    },
+    {
+      "epoch": 24.133333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.000299903705408265,
+      "loss": 0.4848,
+      "step": 9050
+    },
+    {
+      "epoch": 24.16,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029990348002383054,
+      "loss": 0.4882,
+      "step": 9060
+    },
+    {
+      "epoch": 24.186666666666667,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029990325437602493,
+      "loss": 0.4799,
+      "step": 9070
+    },
+    {
+      "epoch": 24.213333333333335,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002999030284648485,
+      "loss": 0.4751,
+      "step": 9080
+    },
+    {
+      "epoch": 24.24,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002999028022903018,
+      "loss": 0.4801,
+      "step": 9090
+    },
+    {
+      "epoch": 24.266666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002999025758523851,
+      "loss": 0.4834,
+      "step": 9100
+    },
+    {
+      "epoch": 24.293333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002999023491510989,
+      "loss": 0.4776,
+      "step": 9110
+    },
+    {
+      "epoch": 24.32,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002999021221864435,
+      "loss": 0.4814,
+      "step": 9120
+    },
+    {
+      "epoch": 24.346666666666668,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029990189495841937,
+      "loss": 0.4898,
+      "step": 9130
+    },
+    {
+      "epoch": 24.373333333333335,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002999016674670269,
+      "loss": 0.4855,
+      "step": 9140
+    },
+    {
+      "epoch": 24.4,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002999014397122664,
+      "loss": 0.4812,
+      "step": 9150
+    },
+    {
+      "epoch": 24.426666666666666,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002999012116941384,
+      "loss": 0.4882,
+      "step": 9160
+    },
+    {
+      "epoch": 24.453333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029990098341264323,
+      "loss": 0.4903,
+      "step": 9170
+    },
+    {
+      "epoch": 24.48,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002999007548677813,
+      "loss": 0.4965,
+      "step": 9180
+    },
+    {
+      "epoch": 24.506666666666668,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029990052605955304,
+      "loss": 0.4863,
+      "step": 9190
+    },
+    {
+      "epoch": 24.533333333333335,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.00029990029698795883,
+      "loss": 0.4795,
+      "step": 9200
+    },
+    {
+      "epoch": 24.56,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029990006765299906,
+      "loss": 0.4806,
+      "step": 9210
+    },
+    {
+      "epoch": 24.586666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002998998380546741,
+      "loss": 0.4768,
+      "step": 9220
+    },
+    {
+      "epoch": 24.613333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002998996081929845,
+      "loss": 0.4619,
+      "step": 9230
+    },
+    {
+      "epoch": 24.64,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002998993780679305,
+      "loss": 0.4666,
+      "step": 9240
+    },
+    {
+      "epoch": 24.666666666666668,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002998991476795125,
+      "loss": 0.4838,
+      "step": 9250
+    },
+    {
+      "epoch": 24.693333333333335,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029989891702773103,
+      "loss": 0.4655,
+      "step": 9260
+    },
+    {
+      "epoch": 24.72,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.00029989868611258644,
+      "loss": 0.4822,
+      "step": 9270
+    },
+    {
+      "epoch": 24.746666666666666,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029989845493407917,
+      "loss": 0.4878,
+      "step": 9280
+    },
+    {
+      "epoch": 24.773333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002998982234922095,
+      "loss": 0.4955,
+      "step": 9290
+    },
+    {
+      "epoch": 24.8,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.000299897991786978,
+      "loss": 0.4823,
+      "step": 9300
+    },
+    {
+      "epoch": 24.826666666666668,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.000299897759818385,
+      "loss": 0.4768,
+      "step": 9310
+    },
+    {
+      "epoch": 24.85333333333333,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029989752758643085,
+      "loss": 0.4863,
+      "step": 9320
+    },
+    {
+      "epoch": 24.88,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.000299897295091116,
+      "loss": 0.4727,
+      "step": 9330
+    },
+    {
+      "epoch": 24.906666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000299897062332441,
+      "loss": 0.4751,
+      "step": 9340
+    },
+    {
+      "epoch": 24.933333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.000299896829310406,
+      "loss": 0.4768,
+      "step": 9350
+    },
+    {
+      "epoch": 24.96,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002998965960250116,
+      "loss": 0.4751,
+      "step": 9360
+    },
+    {
+      "epoch": 24.986666666666668,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029989636247625817,
+      "loss": 0.4897,
+      "step": 9370
+    },
+    {
+      "epoch": 25.0,
+      "eval_loss": 0.488960325717926,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0328,
+      "eval_samples_per_second": 1.595,
+      "eval_steps_per_second": 0.1,
+      "step": 9375
+    },
+    {
+      "epoch": 25.013333333333332,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002998961286641461,
+      "loss": 0.4785,
+      "step": 9380
+    },
+    {
+      "epoch": 25.04,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029989589458867576,
+      "loss": 0.5028,
+      "step": 9390
+    },
+    {
+      "epoch": 25.066666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029989566024984763,
+      "loss": 0.4902,
+      "step": 9400
+    },
+    {
+      "epoch": 25.093333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029989542564766214,
+      "loss": 0.4858,
+      "step": 9410
+    },
+    {
+      "epoch": 25.12,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002998951907821196,
+      "loss": 0.4812,
+      "step": 9420
+    },
+    {
+      "epoch": 25.14666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002998949556532205,
+      "loss": 0.4902,
+      "step": 9430
+    },
+    {
+      "epoch": 25.173333333333332,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002998947202609652,
+      "loss": 0.4813,
+      "step": 9440
+    },
+    {
+      "epoch": 25.2,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002998944846053542,
+      "loss": 0.4794,
+      "step": 9450
+    },
+    {
+      "epoch": 25.226666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002998942486863879,
+      "loss": 0.4768,
+      "step": 9460
+    },
+    {
+      "epoch": 25.253333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002998940125040666,
+      "loss": 0.4759,
+      "step": 9470
+    },
+    {
+      "epoch": 25.28,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002998937760583908,
+      "loss": 0.4895,
+      "step": 9480
+    },
+    {
+      "epoch": 25.306666666666665,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029989353934936093,
+      "loss": 0.4708,
+      "step": 9490
+    },
+    {
+      "epoch": 25.333333333333332,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002998933023769774,
+      "loss": 0.4874,
+      "step": 9500
+    },
+    {
+      "epoch": 25.36,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002998930651412406,
+      "loss": 0.4898,
+      "step": 9510
+    },
+    {
+      "epoch": 25.386666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029989282764215095,
+      "loss": 0.4804,
+      "step": 9520
+    },
+    {
+      "epoch": 25.413333333333334,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002998925898797089,
+      "loss": 0.4846,
+      "step": 9530
+    },
+    {
+      "epoch": 25.44,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002998923518539148,
+      "loss": 0.4878,
+      "step": 9540
+    },
+    {
+      "epoch": 25.466666666666665,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029989211356476914,
+      "loss": 0.4969,
+      "step": 9550
+    },
+    {
+      "epoch": 25.493333333333332,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029989187501227227,
+      "loss": 0.4884,
+      "step": 9560
+    },
+    {
+      "epoch": 25.52,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002998916361964247,
+      "loss": 0.4829,
+      "step": 9570
+    },
+    {
+      "epoch": 25.546666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029989139711722674,
+      "loss": 0.4795,
+      "step": 9580
+    },
+    {
+      "epoch": 25.573333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002998911577746789,
+      "loss": 0.4784,
+      "step": 9590
+    },
+    {
+      "epoch": 25.6,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029989091816878154,
+      "loss": 0.4722,
+      "step": 9600
+    },
+    {
+      "epoch": 25.626666666666665,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0002998906782995351,
+      "loss": 0.4588,
+      "step": 9610
+    },
+    {
+      "epoch": 25.653333333333332,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0002998904381669401,
+      "loss": 0.4786,
+      "step": 9620
+    },
+    {
+      "epoch": 25.68,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002998901977709968,
+      "loss": 0.4697,
+      "step": 9630
+    },
+    {
+      "epoch": 25.706666666666667,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002998899571117057,
+      "loss": 0.4714,
+      "step": 9640
+    },
+    {
+      "epoch": 25.733333333333334,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002998897161890672,
+      "loss": 0.4905,
+      "step": 9650
+    },
+    {
+      "epoch": 25.76,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002998894750030818,
+      "loss": 0.4924,
+      "step": 9660
+    },
+    {
+      "epoch": 25.786666666666665,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029988923355374987,
+      "loss": 0.4896,
+      "step": 9670
+    },
+    {
+      "epoch": 25.813333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029988899184107173,
+      "loss": 0.4743,
+      "step": 9680
+    },
+    {
+      "epoch": 25.84,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.000299888749865048,
+      "loss": 0.481,
+      "step": 9690
+    },
+    {
+      "epoch": 25.866666666666667,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002998885076256789,
+      "loss": 0.4804,
+      "step": 9700
+    },
+    {
+      "epoch": 25.893333333333334,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0002998882651229651,
+      "loss": 0.4703,
+      "step": 9710
+    },
+    {
+      "epoch": 25.92,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029988802235690685,
+      "loss": 0.4804,
+      "step": 9720
+    },
+    {
+      "epoch": 25.946666666666665,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002998877793275046,
+      "loss": 0.474,
+      "step": 9730
+    },
+    {
+      "epoch": 25.973333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029988753603475877,
+      "loss": 0.4854,
+      "step": 9740
+    },
+    {
+      "epoch": 26.0,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002998872924786698,
+      "loss": 0.4745,
+      "step": 9750
+    },
+    {
+      "epoch": 26.0,
+      "eval_loss": 0.4890904128551483,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.6338,
+      "eval_samples_per_second": 1.661,
+      "eval_steps_per_second": 0.104,
+      "step": 9750
+    },
+    {
+      "epoch": 26.026666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029988704865923813,
+      "loss": 0.4934,
+      "step": 9760
+    },
+    {
+      "epoch": 26.053333333333335,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029988680457646423,
+      "loss": 0.4998,
+      "step": 9770
+    },
+    {
+      "epoch": 26.08,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029988656023034846,
+      "loss": 0.4867,
+      "step": 9780
+    },
+    {
+      "epoch": 26.106666666666666,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0002998863156208913,
+      "loss": 0.4815,
+      "step": 9790
+    },
+    {
+      "epoch": 26.133333333333333,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029988607074809317,
+      "loss": 0.4834,
+      "step": 9800
+    },
+    {
+      "epoch": 26.16,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002998858256119544,
+      "loss": 0.4881,
+      "step": 9810
+    },
+    {
+      "epoch": 26.186666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002998855802124756,
+      "loss": 0.4802,
+      "step": 9820
+    },
+    {
+      "epoch": 26.213333333333335,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002998853345496571,
+      "loss": 0.4749,
+      "step": 9830
+    },
+    {
+      "epoch": 26.24,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002998850886234993,
+      "loss": 0.4787,
+      "step": 9840
+    },
+    {
+      "epoch": 26.266666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029988484243400265,
+      "loss": 0.4824,
+      "step": 9850
+    },
+    {
+      "epoch": 26.293333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002998845959811676,
+      "loss": 0.4764,
+      "step": 9860
+    },
+    {
+      "epoch": 26.32,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002998843492649947,
+      "loss": 0.4803,
+      "step": 9870
+    },
+    {
+      "epoch": 26.346666666666668,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029988410228548416,
+      "loss": 0.4889,
+      "step": 9880
+    },
+    {
+      "epoch": 26.373333333333335,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029988385504263655,
+      "loss": 0.4846,
+      "step": 9890
+    },
+    {
+      "epoch": 26.4,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002998836075364523,
+      "loss": 0.4805,
+      "step": 9900
+    },
+    {
+      "epoch": 26.426666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029988335976693177,
+      "loss": 0.4875,
+      "step": 9910
+    },
+    {
+      "epoch": 26.453333333333333,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002998831117340755,
+      "loss": 0.4895,
+      "step": 9920
+    },
+    {
+      "epoch": 26.48,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029988286343788387,
+      "loss": 0.4961,
+      "step": 9930
+    },
+    {
+      "epoch": 26.506666666666668,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0002998826148783573,
+      "loss": 0.4854,
+      "step": 9940
+    },
+    {
+      "epoch": 26.533333333333335,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.00029988236605549626,
+      "loss": 0.4787,
+      "step": 9950
+    },
+    {
+      "epoch": 26.56,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0002998821169693012,
+      "loss": 0.48,
+      "step": 9960
+    },
+    {
+      "epoch": 26.586666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029988186761977246,
+      "loss": 0.4757,
+      "step": 9970
+    },
+    {
+      "epoch": 26.613333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029988161800691056,
+      "loss": 0.4617,
+      "step": 9980
+    },
+    {
+      "epoch": 26.64,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.000299881368130716,
+      "loss": 0.4657,
+      "step": 9990
+    },
+    {
+      "epoch": 26.666666666666668,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029988111799118914,
+      "loss": 0.4815,
+      "step": 10000
+    },
+    {
+      "epoch": 26.693333333333335,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029988086758833037,
+      "loss": 0.4639,
+      "step": 10010
+    },
+    {
+      "epoch": 26.72,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029988061692214025,
+      "loss": 0.4823,
+      "step": 10020
+    },
+    {
+      "epoch": 26.746666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029988036599261906,
+      "loss": 0.486,
+      "step": 10030
+    },
+    {
+      "epoch": 26.773333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029988011479976744,
+      "loss": 0.4952,
+      "step": 10040
+    },
+    {
+      "epoch": 26.8,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.00029987986334358566,
+      "loss": 0.4811,
+      "step": 10050
+    },
+    {
+      "epoch": 26.826666666666668,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029987961162407427,
+      "loss": 0.4755,
+      "step": 10060
+    },
+    {
+      "epoch": 26.85333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029987935964123364,
+      "loss": 0.4843,
+      "step": 10070
+    },
+    {
+      "epoch": 26.88,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029987910739506427,
+      "loss": 0.4724,
+      "step": 10080
+    },
+    {
+      "epoch": 26.906666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002998788548855666,
+      "loss": 0.4744,
+      "step": 10090
+    },
+    {
+      "epoch": 26.933333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000299878602112741,
+      "loss": 0.4763,
+      "step": 10100
+    },
+    {
+      "epoch": 26.96,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029987834907658806,
+      "loss": 0.4745,
+      "step": 10110
+    },
+    {
+      "epoch": 26.986666666666668,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029987809577710804,
+      "loss": 0.4878,
+      "step": 10120
+    },
+    {
+      "epoch": 27.0,
+      "eval_loss": 0.48758047819137573,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7851,
+      "eval_samples_per_second": 1.635,
+      "eval_steps_per_second": 0.102,
+      "step": 10125
+    },
+    {
+      "epoch": 27.013333333333332,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.00029987784221430154,
+      "loss": 0.4778,
+      "step": 10130
+    },
+    {
+      "epoch": 27.04,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0002998775883881689,
+      "loss": 0.5021,
+      "step": 10140
+    },
+    {
+      "epoch": 27.066666666666666,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0002998773342987106,
+      "loss": 0.4918,
+      "step": 10150
+    },
+    {
+      "epoch": 27.093333333333334,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0002998770799459271,
+      "loss": 0.4876,
+      "step": 10160
+    },
+    {
+      "epoch": 27.12,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002998768253298189,
+      "loss": 0.482,
+      "step": 10170
+    },
+    {
+      "epoch": 27.14666666666667,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.00029987657045038634,
+      "loss": 0.4903,
+      "step": 10180
+    },
+    {
+      "epoch": 27.173333333333332,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002998763153076299,
+      "loss": 0.48,
+      "step": 10190
+    },
+    {
+      "epoch": 27.2,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029987605990155006,
+      "loss": 0.4781,
+      "step": 10200
+    },
+    {
+      "epoch": 27.226666666666667,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002998758042321473,
+      "loss": 0.4762,
+      "step": 10210
+    },
+    {
+      "epoch": 27.253333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.000299875548299422,
+      "loss": 0.4759,
+      "step": 10220
+    },
+    {
+      "epoch": 27.28,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002998752921033746,
+      "loss": 0.4884,
+      "step": 10230
+    },
+    {
+      "epoch": 27.306666666666665,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002998750356440056,
+      "loss": 0.4709,
+      "step": 10240
+    },
+    {
+      "epoch": 27.333333333333332,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002998747789213154,
+      "loss": 0.4855,
+      "step": 10250
+    },
+    {
+      "epoch": 27.36,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029987452193530454,
+      "loss": 0.4883,
+      "step": 10260
+    },
+    {
+      "epoch": 27.386666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029987426468597344,
+      "loss": 0.4795,
+      "step": 10270
+    },
+    {
+      "epoch": 27.413333333333334,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002998740071733224,
+      "loss": 0.4837,
+      "step": 10280
+    },
+    {
+      "epoch": 27.44,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002998737493973521,
+      "loss": 0.4873,
+      "step": 10290
+    },
+    {
+      "epoch": 27.466666666666665,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0002998734913580629,
+      "loss": 0.4965,
+      "step": 10300
+    },
+    {
+      "epoch": 27.493333333333332,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029987323305545523,
+      "loss": 0.4875,
+      "step": 10310
+    },
+    {
+      "epoch": 27.52,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.00029987297448952956,
+      "loss": 0.4815,
+      "step": 10320
+    },
+    {
+      "epoch": 27.546666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029987271566028637,
+      "loss": 0.4788,
+      "step": 10330
+    },
+    {
+      "epoch": 27.573333333333334,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00029987245656772603,
+      "loss": 0.4777,
+      "step": 10340
+    },
+    {
+      "epoch": 27.6,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002998721972118491,
+      "loss": 0.4717,
+      "step": 10350
+    },
+    {
+      "epoch": 27.626666666666665,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.00029987193759265597,
+      "loss": 0.4578,
+      "step": 10360
+    },
+    {
+      "epoch": 27.653333333333332,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002998716777101471,
+      "loss": 0.4779,
+      "step": 10370
+    },
+    {
+      "epoch": 27.68,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029987141756432297,
+      "loss": 0.4685,
+      "step": 10380
+    },
+    {
+      "epoch": 27.706666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029987115715518405,
+      "loss": 0.4704,
+      "step": 10390
+    },
+    {
+      "epoch": 27.733333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029987089648273076,
+      "loss": 0.4887,
+      "step": 10400
+    },
+    {
+      "epoch": 27.76,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002998706355469636,
+      "loss": 0.491,
+      "step": 10410
+    },
+    {
+      "epoch": 27.786666666666665,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029987037434788303,
+      "loss": 0.4887,
+      "step": 10420
+    },
+    {
+      "epoch": 27.813333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002998701128854894,
+      "loss": 0.4736,
+      "step": 10430
+    },
+    {
+      "epoch": 27.84,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029986985115978336,
+      "loss": 0.4799,
+      "step": 10440
+    },
+    {
+      "epoch": 27.866666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029986958917076515,
+      "loss": 0.479,
+      "step": 10450
+    },
+    {
+      "epoch": 27.893333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002998693269184354,
+      "loss": 0.4691,
+      "step": 10460
+    },
+    {
+      "epoch": 27.92,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029986906440279454,
+      "loss": 0.4801,
+      "step": 10470
+    },
+    {
+      "epoch": 27.946666666666665,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029986880162384294,
+      "loss": 0.4726,
+      "step": 10480
+    },
+    {
+      "epoch": 27.973333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002998685385815812,
+      "loss": 0.4839,
+      "step": 10490
+    },
+    {
+      "epoch": 28.0,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002998682752760096,
+      "loss": 0.4734,
+      "step": 10500
+    },
+    {
+      "epoch": 28.0,
+      "eval_loss": 0.4892578721046448,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.052,
+      "eval_samples_per_second": 1.592,
+      "eval_steps_per_second": 0.099,
+      "step": 10500
+    },
+    {
+      "epoch": 28.026666666666667,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0002998680117071288,
+      "loss": 0.4925,
+      "step": 10510
+    },
+    {
+      "epoch": 28.053333333333335,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029986774787493916,
+      "loss": 0.499,
+      "step": 10520
+    },
+    {
+      "epoch": 28.08,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029986748377944114,
+      "loss": 0.4861,
+      "step": 10530
+    },
+    {
+      "epoch": 28.106666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029986721942063525,
+      "loss": 0.4814,
+      "step": 10540
+    },
+    {
+      "epoch": 28.133333333333333,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029986695479852185,
+      "loss": 0.4827,
+      "step": 10550
+    },
+    {
+      "epoch": 28.16,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029986668991310156,
+      "loss": 0.4871,
+      "step": 10560
+    },
+    {
+      "epoch": 28.186666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029986642476437475,
+      "loss": 0.4784,
+      "step": 10570
+    },
+    {
+      "epoch": 28.213333333333335,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002998661593523419,
+      "loss": 0.4737,
+      "step": 10580
+    },
+    {
+      "epoch": 28.24,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002998658936770035,
+      "loss": 0.4775,
+      "step": 10590
+    },
+    {
+      "epoch": 28.266666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029986562773835995,
+      "loss": 0.481,
+      "step": 10600
+    },
+    {
+      "epoch": 28.293333333333333,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002998653615364118,
+      "loss": 0.4758,
+      "step": 10610
+    },
+    {
+      "epoch": 28.32,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029986509507115946,
+      "loss": 0.4791,
+      "step": 10620
+    },
+    {
+      "epoch": 28.346666666666668,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002998648283426034,
+      "loss": 0.4881,
+      "step": 10630
+    },
+    {
+      "epoch": 28.373333333333335,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029986456135074416,
+      "loss": 0.4843,
+      "step": 10640
+    },
+    {
+      "epoch": 28.4,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002998642940955821,
+      "loss": 0.4797,
+      "step": 10650
+    },
+    {
+      "epoch": 28.426666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002998640265771178,
+      "loss": 0.4866,
+      "step": 10660
+    },
+    {
+      "epoch": 28.453333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002998637587953516,
+      "loss": 0.4883,
+      "step": 10670
+    },
+    {
+      "epoch": 28.48,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002998634907502841,
+      "loss": 0.4946,
+      "step": 10680
+    },
+    {
+      "epoch": 28.506666666666668,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0002998632224419157,
+      "loss": 0.4844,
+      "step": 10690
+    },
+    {
+      "epoch": 28.533333333333335,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0002998629538702469,
+      "loss": 0.4769,
+      "step": 10700
+    },
+    {
+      "epoch": 28.56,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029986268503527815,
+      "loss": 0.4791,
+      "step": 10710
+    },
+    {
+      "epoch": 28.586666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029986241593700996,
+      "loss": 0.4751,
+      "step": 10720
+    },
+    {
+      "epoch": 28.613333333333333,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002998621465754428,
+      "loss": 0.4607,
+      "step": 10730
+    },
+    {
+      "epoch": 28.64,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000299861876950577,
+      "loss": 0.4652,
+      "step": 10740
+    },
+    {
+      "epoch": 28.666666666666668,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.00029986160706241326,
+      "loss": 0.4814,
+      "step": 10750
+    },
+    {
+      "epoch": 28.693333333333335,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029986133691095194,
+      "loss": 0.4642,
+      "step": 10760
+    },
+    {
+      "epoch": 28.72,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002998610664961935,
+      "loss": 0.481,
+      "step": 10770
+    },
+    {
+      "epoch": 28.746666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002998607958181384,
+      "loss": 0.4862,
+      "step": 10780
+    },
+    {
+      "epoch": 28.773333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002998605248767872,
+      "loss": 0.4936,
+      "step": 10790
+    },
+    {
+      "epoch": 28.8,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002998602536721403,
+      "loss": 0.4805,
+      "step": 10800
+    },
+    {
+      "epoch": 28.826666666666668,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0002998599822041982,
+      "loss": 0.4751,
+      "step": 10810
+    },
+    {
+      "epoch": 28.85333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002998597104729614,
+      "loss": 0.4839,
+      "step": 10820
+    },
+    {
+      "epoch": 28.88,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00029985943847843035,
+      "loss": 0.4706,
+      "step": 10830
+    },
+    {
+      "epoch": 28.906666666666666,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029985916622060556,
+      "loss": 0.4743,
+      "step": 10840
+    },
+    {
+      "epoch": 28.933333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002998588936994875,
+      "loss": 0.4756,
+      "step": 10850
+    },
+    {
+      "epoch": 28.96,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002998586209150766,
+      "loss": 0.4732,
+      "step": 10860
+    },
+    {
+      "epoch": 28.986666666666668,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002998583478673734,
+      "loss": 0.4879,
+      "step": 10870
+    },
+    {
+      "epoch": 29.0,
+      "eval_loss": 0.48758020997047424,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9573,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 0.1,
+      "step": 10875
+    },
+    {
+      "epoch": 29.013333333333332,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002998580745563783,
+      "loss": 0.4765,
+      "step": 10880
+    },
+    {
+      "epoch": 29.04,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002998578009820918,
+      "loss": 0.5008,
+      "step": 10890
+    },
+    {
+      "epoch": 29.066666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002998575271445145,
+      "loss": 0.4886,
+      "step": 10900
+    },
+    {
+      "epoch": 29.093333333333334,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002998572530436468,
+      "loss": 0.4828,
+      "step": 10910
+    },
+    {
+      "epoch": 29.12,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029985697867948916,
+      "loss": 0.4801,
+      "step": 10920
+    },
+    {
+      "epoch": 29.14666666666667,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.000299856704052042,
+      "loss": 0.4889,
+      "step": 10930
+    },
+    {
+      "epoch": 29.173333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000299856429161306,
+      "loss": 0.4787,
+      "step": 10940
+    },
+    {
+      "epoch": 29.2,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029985615400728147,
+      "loss": 0.4782,
+      "step": 10950
+    },
+    {
+      "epoch": 29.226666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002998558785899689,
+      "loss": 0.4747,
+      "step": 10960
+    },
+    {
+      "epoch": 29.253333333333334,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.00029985560290936894,
+      "loss": 0.4749,
+      "step": 10970
+    },
+    {
+      "epoch": 29.28,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029985532696548184,
+      "loss": 0.4883,
+      "step": 10980
+    },
+    {
+      "epoch": 29.306666666666665,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002998550507583082,
+      "loss": 0.469,
+      "step": 10990
+    },
+    {
+      "epoch": 29.333333333333332,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0002998547742878486,
+      "loss": 0.4849,
+      "step": 11000
+    },
+    {
+      "epoch": 29.36,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002998544975541034,
+      "loss": 0.4875,
+      "step": 11010
+    },
+    {
+      "epoch": 29.386666666666667,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002998542205570731,
+      "loss": 0.4788,
+      "step": 11020
+    },
+    {
+      "epoch": 29.413333333333334,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0002998539432967582,
+      "loss": 0.4824,
+      "step": 11030
+    },
+    {
+      "epoch": 29.44,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0002998536657731592,
+      "loss": 0.4866,
+      "step": 11040
+    },
+    {
+      "epoch": 29.466666666666665,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029985338798627656,
+      "loss": 0.4968,
+      "step": 11050
+    },
+    {
+      "epoch": 29.493333333333332,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00029985310993611077,
+      "loss": 0.4861,
+      "step": 11060
+    },
+    {
+      "epoch": 29.52,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002998528316226624,
+      "loss": 0.4808,
+      "step": 11070
+    },
+    {
+      "epoch": 29.546666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029985255304593183,
+      "loss": 0.4772,
+      "step": 11080
+    },
+    {
+      "epoch": 29.573333333333334,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0002998522742059196,
+      "loss": 0.4768,
+      "step": 11090
+    },
+    {
+      "epoch": 29.6,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002998519951026262,
+      "loss": 0.4709,
+      "step": 11100
+    },
+    {
+      "epoch": 29.626666666666665,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0002998517157360521,
+      "loss": 0.4575,
+      "step": 11110
+    },
+    {
+      "epoch": 29.653333333333332,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0002998514361061978,
+      "loss": 0.4767,
+      "step": 11120
+    },
+    {
+      "epoch": 29.68,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002998511562130639,
+      "loss": 0.4681,
+      "step": 11130
+    },
+    {
+      "epoch": 29.706666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029985087605665067,
+      "loss": 0.469,
+      "step": 11140
+    },
+    {
+      "epoch": 29.733333333333334,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029985059563695875,
+      "loss": 0.4883,
+      "step": 11150
+    },
+    {
+      "epoch": 29.76,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002998503149539886,
+      "loss": 0.4895,
+      "step": 11160
+    },
+    {
+      "epoch": 29.786666666666665,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029985003400774074,
+      "loss": 0.4883,
+      "step": 11170
+    },
+    {
+      "epoch": 29.813333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002998497527982156,
+      "loss": 0.4727,
+      "step": 11180
+    },
+    {
+      "epoch": 29.84,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.00029984947132541374,
+      "loss": 0.479,
+      "step": 11190
+    },
+    {
+      "epoch": 29.866666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029984918958933564,
+      "loss": 0.4784,
+      "step": 11200
+    },
+    {
+      "epoch": 29.893333333333334,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029984890758998176,
+      "loss": 0.469,
+      "step": 11210
+    },
+    {
+      "epoch": 29.92,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029984862532735264,
+      "loss": 0.4798,
+      "step": 11220
+    },
+    {
+      "epoch": 29.946666666666665,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029984834280144876,
+      "loss": 0.4725,
+      "step": 11230
+    },
+    {
+      "epoch": 29.973333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002998480600122705,
+      "loss": 0.4836,
+      "step": 11240
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0002998477769598186,
+      "loss": 0.4727,
+      "step": 11250
+    },
+    {
+      "epoch": 30.0,
+      "eval_loss": 0.4864569306373596,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 8.889,
+      "eval_samples_per_second": 1.8,
+      "eval_steps_per_second": 0.112,
+      "step": 11250
+    },
+    {
+      "epoch": 30.026666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002998474936440934,
+      "loss": 0.4922,
+      "step": 11260
+    },
+    {
+      "epoch": 30.053333333333335,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002998472100650954,
+      "loss": 0.4985,
+      "step": 11270
+    },
+    {
+      "epoch": 30.08,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029984692622282515,
+      "loss": 0.4846,
+      "step": 11280
+    },
+    {
+      "epoch": 30.106666666666666,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002998466421172831,
+      "loss": 0.4798,
+      "step": 11290
+    },
+    {
+      "epoch": 30.133333333333333,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002998463577484697,
+      "loss": 0.4813,
+      "step": 11300
+    },
+    {
+      "epoch": 30.16,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029984607311638566,
+      "loss": 0.4865,
+      "step": 11310
+    },
+    {
+      "epoch": 30.186666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002998457882210313,
+      "loss": 0.4777,
+      "step": 11320
+    },
+    {
+      "epoch": 30.213333333333335,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0002998455030624071,
+      "loss": 0.4737,
+      "step": 11330
+    },
+    {
+      "epoch": 30.24,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00029984521764051366,
+      "loss": 0.4767,
+      "step": 11340
+    },
+    {
+      "epoch": 30.266666666666666,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002998449319553514,
+      "loss": 0.4809,
+      "step": 11350
+    },
+    {
+      "epoch": 30.293333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029984464600692093,
+      "loss": 0.4748,
+      "step": 11360
+    },
+    {
+      "epoch": 30.32,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029984435979522265,
+      "loss": 0.4783,
+      "step": 11370
+    },
+    {
+      "epoch": 30.346666666666668,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002998440733202571,
+      "loss": 0.4871,
+      "step": 11380
+    },
+    {
+      "epoch": 30.373333333333335,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002998437865820247,
+      "loss": 0.4834,
+      "step": 11390
+    },
+    {
+      "epoch": 30.4,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029984349958052614,
+      "loss": 0.4793,
+      "step": 11400
+    },
+    {
+      "epoch": 30.426666666666666,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002998432123157618,
+      "loss": 0.4862,
+      "step": 11410
+    },
+    {
+      "epoch": 30.453333333333333,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002998429247877322,
+      "loss": 0.4876,
+      "step": 11420
+    },
+    {
+      "epoch": 30.48,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002998426369964379,
+      "loss": 0.495,
+      "step": 11430
+    },
+    {
+      "epoch": 30.506666666666668,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0002998423489418793,
+      "loss": 0.4836,
+      "step": 11440
+    },
+    {
+      "epoch": 30.533333333333335,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029984206062405697,
+      "loss": 0.4778,
+      "step": 11450
+    },
+    {
+      "epoch": 30.56,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002998417720429714,
+      "loss": 0.4784,
+      "step": 11460
+    },
+    {
+      "epoch": 30.586666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002998414831986231,
+      "loss": 0.4736,
+      "step": 11470
+    },
+    {
+      "epoch": 30.613333333333333,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0002998411940910126,
+      "loss": 0.4596,
+      "step": 11480
+    },
+    {
+      "epoch": 30.64,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0002998409047201404,
+      "loss": 0.4648,
+      "step": 11490
+    },
+    {
+      "epoch": 30.666666666666668,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000299840615086007,
+      "loss": 0.4813,
+      "step": 11500
+    },
+    {
+      "epoch": 30.693333333333335,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002998403251886129,
+      "loss": 0.463,
+      "step": 11510
+    },
+    {
+      "epoch": 30.72,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002998400350279586,
+      "loss": 0.4809,
+      "step": 11520
+    },
+    {
+      "epoch": 30.746666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002998397446040446,
+      "loss": 0.4851,
+      "step": 11530
+    },
+    {
+      "epoch": 30.773333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002998394539168715,
+      "loss": 0.4928,
+      "step": 11540
+    },
+    {
+      "epoch": 30.8,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029983916296643975,
+      "loss": 0.4789,
+      "step": 11550
+    },
+    {
+      "epoch": 30.826666666666668,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002998388717527498,
+      "loss": 0.474,
+      "step": 11560
+    },
+    {
+      "epoch": 30.85333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029983858027580226,
+      "loss": 0.4837,
+      "step": 11570
+    },
+    {
+      "epoch": 30.88,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002998382885355976,
+      "loss": 0.471,
+      "step": 11580
+    },
+    {
+      "epoch": 30.906666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002998379965321364,
+      "loss": 0.4731,
+      "step": 11590
+    },
+    {
+      "epoch": 30.933333333333334,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.000299837704265419,
+      "loss": 0.4754,
+      "step": 11600
+    },
+    {
+      "epoch": 30.96,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002998374117354461,
+      "loss": 0.472,
+      "step": 11610
+    },
+    {
+      "epoch": 30.986666666666668,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029983711894221806,
+      "loss": 0.4873,
+      "step": 11620
+    },
+    {
+      "epoch": 31.0,
+      "eval_loss": 0.48670804500579834,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0195,
+      "eval_samples_per_second": 1.597,
+      "eval_steps_per_second": 0.1,
+      "step": 11625
+    },
+    {
+      "epoch": 31.013333333333332,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029983682588573555,
+      "loss": 0.4761,
+      "step": 11630
+    },
+    {
+      "epoch": 31.04,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029983653256599894,
+      "loss": 0.5006,
+      "step": 11640
+    },
+    {
+      "epoch": 31.066666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002998362389830089,
+      "loss": 0.4883,
+      "step": 11650
+    },
+    {
+      "epoch": 31.093333333333334,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029983594513676576,
+      "loss": 0.4822,
+      "step": 11660
+    },
+    {
+      "epoch": 31.12,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002998356510272702,
+      "loss": 0.4793,
+      "step": 11670
+    },
+    {
+      "epoch": 31.14666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002998353566545226,
+      "loss": 0.4883,
+      "step": 11680
+    },
+    {
+      "epoch": 31.173333333333332,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002998350620185236,
+      "loss": 0.478,
+      "step": 11690
+    },
+    {
+      "epoch": 31.2,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029983476711927366,
+      "loss": 0.4774,
+      "step": 11700
+    },
+    {
+      "epoch": 31.226666666666667,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029983447195677327,
+      "loss": 0.475,
+      "step": 11710
+    },
+    {
+      "epoch": 31.253333333333334,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000299834176531023,
+      "loss": 0.474,
+      "step": 11720
+    },
+    {
+      "epoch": 31.28,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002998338808420234,
+      "loss": 0.4872,
+      "step": 11730
+    },
+    {
+      "epoch": 31.306666666666665,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002998335848897749,
+      "loss": 0.4692,
+      "step": 11740
+    },
+    {
+      "epoch": 31.333333333333332,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.000299833288674278,
+      "loss": 0.4848,
+      "step": 11750
+    },
+    {
+      "epoch": 31.36,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002998329921955334,
+      "loss": 0.4868,
+      "step": 11760
+    },
+    {
+      "epoch": 31.386666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002998326954535414,
+      "loss": 0.4786,
+      "step": 11770
+    },
+    {
+      "epoch": 31.413333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002998323984483027,
+      "loss": 0.4823,
+      "step": 11780
+    },
+    {
+      "epoch": 31.44,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002998321011798177,
+      "loss": 0.486,
+      "step": 11790
+    },
+    {
+      "epoch": 31.466666666666665,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029983180364808696,
+      "loss": 0.4952,
+      "step": 11800
+    },
+    {
+      "epoch": 31.493333333333332,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029983150585311105,
+      "loss": 0.487,
+      "step": 11810
+    },
+    {
+      "epoch": 31.52,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002998312077948904,
+      "loss": 0.4806,
+      "step": 11820
+    },
+    {
+      "epoch": 31.546666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002998309094734256,
+      "loss": 0.4773,
+      "step": 11830
+    },
+    {
+      "epoch": 31.573333333333334,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002998306108887172,
+      "loss": 0.4758,
+      "step": 11840
+    },
+    {
+      "epoch": 31.6,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029983031204076563,
+      "loss": 0.4704,
+      "step": 11850
+    },
+    {
+      "epoch": 31.626666666666665,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029983001292957154,
+      "loss": 0.4559,
+      "step": 11860
+    },
+    {
+      "epoch": 31.653333333333332,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002998297135551353,
+      "loss": 0.4764,
+      "step": 11870
+    },
+    {
+      "epoch": 31.68,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002998294139174576,
+      "loss": 0.4677,
+      "step": 11880
+    },
+    {
+      "epoch": 31.706666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002998291140165389,
+      "loss": 0.4684,
+      "step": 11890
+    },
+    {
+      "epoch": 31.733333333333334,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002998288138523796,
+      "loss": 0.4878,
+      "step": 11900
+    },
+    {
+      "epoch": 31.76,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029982851342498046,
+      "loss": 0.4893,
+      "step": 11910
+    },
+    {
+      "epoch": 31.786666666666665,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002998282127343418,
+      "loss": 0.4877,
+      "step": 11920
+    },
+    {
+      "epoch": 31.813333333333333,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002998279117804643,
+      "loss": 0.4725,
+      "step": 11930
+    },
+    {
+      "epoch": 31.84,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002998276105633484,
+      "loss": 0.4786,
+      "step": 11940
+    },
+    {
+      "epoch": 31.866666666666667,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002998273090829946,
+      "loss": 0.478,
+      "step": 11950
+    },
+    {
+      "epoch": 31.893333333333334,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029982700733940357,
+      "loss": 0.4683,
+      "step": 11960
+    },
+    {
+      "epoch": 31.92,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029982670533257576,
+      "loss": 0.4787,
+      "step": 11970
+    },
+    {
+      "epoch": 31.946666666666665,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029982640306251164,
+      "loss": 0.4719,
+      "step": 11980
+    },
+    {
+      "epoch": 31.973333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029982610052921185,
+      "loss": 0.483,
+      "step": 11990
+    },
+    {
+      "epoch": 32.0,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002998257977326769,
+      "loss": 0.4716,
+      "step": 12000
+    },
+    {
+      "epoch": 32.0,
+      "eval_loss": 0.4881175458431244,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.5564,
+      "eval_samples_per_second": 1.674,
+      "eval_steps_per_second": 0.105,
+      "step": 12000
+    },
+    {
+      "epoch": 32.026666666666664,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002998254946729072,
+      "loss": 0.4913,
+      "step": 12010
+    },
+    {
+      "epoch": 32.053333333333335,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002998251913499034,
+      "loss": 0.4976,
+      "step": 12020
+    },
+    {
+      "epoch": 32.08,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.00029982488776366604,
+      "loss": 0.4847,
+      "step": 12030
+    },
+    {
+      "epoch": 32.10666666666667,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002998245839141956,
+      "loss": 0.4796,
+      "step": 12040
+    },
+    {
+      "epoch": 32.13333333333333,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00029982427980149265,
+      "loss": 0.482,
+      "step": 12050
+    },
+    {
+      "epoch": 32.16,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029982397542555766,
+      "loss": 0.4858,
+      "step": 12060
+    },
+    {
+      "epoch": 32.18666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029982367078639123,
+      "loss": 0.4772,
+      "step": 12070
+    },
+    {
+      "epoch": 32.21333333333333,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0002998233658839939,
+      "loss": 0.4728,
+      "step": 12080
+    },
+    {
+      "epoch": 32.24,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002998230607183662,
+      "loss": 0.4769,
+      "step": 12090
+    },
+    {
+      "epoch": 32.266666666666666,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029982275528950866,
+      "loss": 0.4795,
+      "step": 12100
+    },
+    {
+      "epoch": 32.29333333333334,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002998224495974218,
+      "loss": 0.474,
+      "step": 12110
+    },
+    {
+      "epoch": 32.32,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.00029982214364210607,
+      "loss": 0.4777,
+      "step": 12120
+    },
+    {
+      "epoch": 32.346666666666664,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002998218374235622,
+      "loss": 0.4861,
+      "step": 12130
+    },
+    {
+      "epoch": 32.373333333333335,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0002998215309417906,
+      "loss": 0.4827,
+      "step": 12140
+    },
+    {
+      "epoch": 32.4,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029982122419679186,
+      "loss": 0.479,
+      "step": 12150
+    },
+    {
+      "epoch": 32.42666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002998209171885665,
+      "loss": 0.4853,
+      "step": 12160
+    },
+    {
+      "epoch": 32.45333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000299820609917115,
+      "loss": 0.4874,
+      "step": 12170
+    },
+    {
+      "epoch": 32.48,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.000299820302382438,
+      "loss": 0.4939,
+      "step": 12180
+    },
+    {
+      "epoch": 32.50666666666667,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.000299819994584536,
+      "loss": 0.4832,
+      "step": 12190
+    },
+    {
+      "epoch": 32.53333333333333,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0002998196865234095,
+      "loss": 0.4762,
+      "step": 12200
+    },
+    {
+      "epoch": 32.56,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002998193781990591,
+      "loss": 0.4779,
+      "step": 12210
+    },
+    {
+      "epoch": 32.586666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002998190696114853,
+      "loss": 0.473,
+      "step": 12220
+    },
+    {
+      "epoch": 32.61333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002998187607606887,
+      "loss": 0.4594,
+      "step": 12230
+    },
+    {
+      "epoch": 32.64,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029981845164666975,
+      "loss": 0.4641,
+      "step": 12240
+    },
+    {
+      "epoch": 32.666666666666664,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029981814226942913,
+      "loss": 0.4799,
+      "step": 12250
+    },
+    {
+      "epoch": 32.693333333333335,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029981783262896726,
+      "loss": 0.4624,
+      "step": 12260
+    },
+    {
+      "epoch": 32.72,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002998175227252847,
+      "loss": 0.4797,
+      "step": 12270
+    },
+    {
+      "epoch": 32.74666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029981721255838205,
+      "loss": 0.4847,
+      "step": 12280
+    },
+    {
+      "epoch": 32.77333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002998169021282598,
+      "loss": 0.4927,
+      "step": 12290
+    },
+    {
+      "epoch": 32.8,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0002998165914349185,
+      "loss": 0.4794,
+      "step": 12300
+    },
+    {
+      "epoch": 32.82666666666667,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002998162804783588,
+      "loss": 0.4732,
+      "step": 12310
+    },
+    {
+      "epoch": 32.85333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002998159692585811,
+      "loss": 0.4825,
+      "step": 12320
+    },
+    {
+      "epoch": 32.88,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029981565777558605,
+      "loss": 0.4706,
+      "step": 12330
+    },
+    {
+      "epoch": 32.906666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029981534602937414,
+      "loss": 0.4733,
+      "step": 12340
+    },
+    {
+      "epoch": 32.93333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002998150340199459,
+      "loss": 0.4748,
+      "step": 12350
+    },
+    {
+      "epoch": 32.96,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.000299814721747302,
+      "loss": 0.4721,
+      "step": 12360
+    },
+    {
+      "epoch": 32.986666666666665,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002998144092114428,
+      "loss": 0.4865,
+      "step": 12370
+    },
+    {
+      "epoch": 33.0,
+      "eval_loss": 0.484887033700943,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.3807,
+      "eval_samples_per_second": 1.706,
+      "eval_steps_per_second": 0.107,
+      "step": 12375
+    },
+    {
+      "epoch": 33.013333333333335,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.000299814096412369,
+      "loss": 0.4753,
+      "step": 12380
+    },
+    {
+      "epoch": 33.04,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002998137833500811,
+      "loss": 0.4997,
+      "step": 12390
+    },
+    {
+      "epoch": 33.06666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029981347002457965,
+      "loss": 0.487,
+      "step": 12400
+    },
+    {
+      "epoch": 33.093333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002998131564358652,
+      "loss": 0.4815,
+      "step": 12410
+    },
+    {
+      "epoch": 33.12,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029981284258393827,
+      "loss": 0.478,
+      "step": 12420
+    },
+    {
+      "epoch": 33.14666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002998125284687995,
+      "loss": 0.4875,
+      "step": 12430
+    },
+    {
+      "epoch": 33.17333333333333,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0002998122140904493,
+      "loss": 0.4773,
+      "step": 12440
+    },
+    {
+      "epoch": 33.2,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002998118994488884,
+      "loss": 0.4762,
+      "step": 12450
+    },
+    {
+      "epoch": 33.22666666666667,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0002998115845441172,
+      "loss": 0.4741,
+      "step": 12460
+    },
+    {
+      "epoch": 33.25333333333333,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029981126937613634,
+      "loss": 0.473,
+      "step": 12470
+    },
+    {
+      "epoch": 33.28,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002998109539449464,
+      "loss": 0.4868,
+      "step": 12480
+    },
+    {
+      "epoch": 33.306666666666665,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029981063825054775,
+      "loss": 0.4689,
+      "step": 12490
+    },
+    {
+      "epoch": 33.333333333333336,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029981032229294116,
+      "loss": 0.4843,
+      "step": 12500
+    },
+    {
+      "epoch": 33.36,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002998100060721271,
+      "loss": 0.4862,
+      "step": 12510
+    },
+    {
+      "epoch": 33.38666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029980968958810613,
+      "loss": 0.478,
+      "step": 12520
+    },
+    {
+      "epoch": 33.413333333333334,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002998093728408788,
+      "loss": 0.4821,
+      "step": 12530
+    },
+    {
+      "epoch": 33.44,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002998090558304456,
+      "loss": 0.4852,
+      "step": 12540
+    },
+    {
+      "epoch": 33.46666666666667,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002998087385568072,
+      "loss": 0.4946,
+      "step": 12550
+    },
+    {
+      "epoch": 33.49333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029980842101996416,
+      "loss": 0.4856,
+      "step": 12560
+    },
+    {
+      "epoch": 33.52,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002998081032199169,
+      "loss": 0.4806,
+      "step": 12570
+    },
+    {
+      "epoch": 33.54666666666667,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029980778515666616,
+      "loss": 0.477,
+      "step": 12580
+    },
+    {
+      "epoch": 33.57333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029980746683021237,
+      "loss": 0.4764,
+      "step": 12590
+    },
+    {
+      "epoch": 33.6,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029980714824055614,
+      "loss": 0.4699,
+      "step": 12600
+    },
+    {
+      "epoch": 33.626666666666665,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.000299806829387698,
+      "loss": 0.4556,
+      "step": 12610
+    },
+    {
+      "epoch": 33.653333333333336,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00029980651027163854,
+      "loss": 0.4765,
+      "step": 12620
+    },
+    {
+      "epoch": 33.68,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002998061908923783,
+      "loss": 0.4676,
+      "step": 12630
+    },
+    {
+      "epoch": 33.70666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029980587124991785,
+      "loss": 0.4678,
+      "step": 12640
+    },
+    {
+      "epoch": 33.733333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002998055513442578,
+      "loss": 0.4875,
+      "step": 12650
+    },
+    {
+      "epoch": 33.76,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002998052311753986,
+      "loss": 0.4879,
+      "step": 12660
+    },
+    {
+      "epoch": 33.78666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002998049107433409,
+      "loss": 0.4873,
+      "step": 12670
+    },
+    {
+      "epoch": 33.81333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029980459004808524,
+      "loss": 0.4716,
+      "step": 12680
+    },
+    {
+      "epoch": 33.84,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002998042690896321,
+      "loss": 0.478,
+      "step": 12690
+    },
+    {
+      "epoch": 33.86666666666667,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.00029980394786798225,
+      "loss": 0.4781,
+      "step": 12700
+    },
+    {
+      "epoch": 33.89333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002998036263831361,
+      "loss": 0.4691,
+      "step": 12710
+    },
+    {
+      "epoch": 33.92,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002998033046350942,
+      "loss": 0.4785,
+      "step": 12720
+    },
+    {
+      "epoch": 33.946666666666665,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002998029826238572,
+      "loss": 0.471,
+      "step": 12730
+    },
+    {
+      "epoch": 33.973333333333336,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029980266034942564,
+      "loss": 0.4824,
+      "step": 12740
+    },
+    {
+      "epoch": 34.0,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002998023378118,
+      "loss": 0.4717,
+      "step": 12750
+    },
+    {
+      "epoch": 34.0,
+      "eval_loss": 0.48443278670310974,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9254,
+      "eval_samples_per_second": 1.464,
+      "eval_steps_per_second": 0.092,
+      "step": 12750
+    },
+    {
+      "epoch": 34.026666666666664,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029980201501098096,
+      "loss": 0.4909,
+      "step": 12760
+    },
+    {
+      "epoch": 34.053333333333335,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029980169194696905,
+      "loss": 0.4972,
+      "step": 12770
+    },
+    {
+      "epoch": 34.08,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.00029980136861976485,
+      "loss": 0.4834,
+      "step": 12780
+    },
+    {
+      "epoch": 34.10666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002998010450293689,
+      "loss": 0.479,
+      "step": 12790
+    },
+    {
+      "epoch": 34.13333333333333,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029980072117578177,
+      "loss": 0.4812,
+      "step": 12800
+    },
+    {
+      "epoch": 34.16,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002998003970590041,
+      "loss": 0.485,
+      "step": 12810
+    },
+    {
+      "epoch": 34.18666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029980007267903626,
+      "loss": 0.477,
+      "step": 12820
+    },
+    {
+      "epoch": 34.21333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029979974803587907,
+      "loss": 0.4715,
+      "step": 12830
+    },
+    {
+      "epoch": 34.24,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.000299799423129533,
+      "loss": 0.4764,
+      "step": 12840
+    },
+    {
+      "epoch": 34.266666666666666,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002997990979599985,
+      "loss": 0.48,
+      "step": 12850
+    },
+    {
+      "epoch": 34.29333333333334,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00029979877252727635,
+      "loss": 0.474,
+      "step": 12860
+    },
+    {
+      "epoch": 34.32,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.00029979844683136704,
+      "loss": 0.4774,
+      "step": 12870
+    },
+    {
+      "epoch": 34.346666666666664,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029979812087227106,
+      "loss": 0.4868,
+      "step": 12880
+    },
+    {
+      "epoch": 34.373333333333335,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029979779464998907,
+      "loss": 0.4821,
+      "step": 12890
+    },
+    {
+      "epoch": 34.4,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002997974681645216,
+      "loss": 0.4776,
+      "step": 12900
+    },
+    {
+      "epoch": 34.42666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002997971414158693,
+      "loss": 0.4845,
+      "step": 12910
+    },
+    {
+      "epoch": 34.45333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029979681440403267,
+      "loss": 0.4864,
+      "step": 12920
+    },
+    {
+      "epoch": 34.48,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002997964871290122,
+      "loss": 0.4933,
+      "step": 12930
+    },
+    {
+      "epoch": 34.50666666666667,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002997961595908087,
+      "loss": 0.4826,
+      "step": 12940
+    },
+    {
+      "epoch": 34.53333333333333,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00029979583178942255,
+      "loss": 0.4766,
+      "step": 12950
+    },
+    {
+      "epoch": 34.56,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029979550372485447,
+      "loss": 0.4775,
+      "step": 12960
+    },
+    {
+      "epoch": 34.586666666666666,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0002997951753971049,
+      "loss": 0.4732,
+      "step": 12970
+    },
+    {
+      "epoch": 34.61333333333333,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002997948468061744,
+      "loss": 0.459,
+      "step": 12980
+    },
+    {
+      "epoch": 34.64,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0002997945179520637,
+      "loss": 0.4635,
+      "step": 12990
+    },
+    {
+      "epoch": 34.666666666666664,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002997941888347733,
+      "loss": 0.4791,
+      "step": 13000
+    },
+    {
+      "epoch": 34.693333333333335,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029979385945430375,
+      "loss": 0.4626,
+      "step": 13010
+    },
+    {
+      "epoch": 34.72,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002997935298106557,
+      "loss": 0.4796,
+      "step": 13020
+    },
+    {
+      "epoch": 34.74666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002997931999038296,
+      "loss": 0.4842,
+      "step": 13030
+    },
+    {
+      "epoch": 34.77333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029979286973382625,
+      "loss": 0.4916,
+      "step": 13040
+    },
+    {
+      "epoch": 34.8,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.000299792539300646,
+      "loss": 0.4784,
+      "step": 13050
+    },
+    {
+      "epoch": 34.82666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002997922086042895,
+      "loss": 0.4728,
+      "step": 13060
+    },
+    {
+      "epoch": 34.85333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002997918776447574,
+      "loss": 0.4821,
+      "step": 13070
+    },
+    {
+      "epoch": 34.88,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029979154642205023,
+      "loss": 0.4698,
+      "step": 13080
+    },
+    {
+      "epoch": 34.906666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002997912149361686,
+      "loss": 0.4729,
+      "step": 13090
+    },
+    {
+      "epoch": 34.93333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029979088318711304,
+      "loss": 0.4736,
+      "step": 13100
+    },
+    {
+      "epoch": 34.96,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002997905511748842,
+      "loss": 0.4721,
+      "step": 13110
+    },
+    {
+      "epoch": 34.986666666666665,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029979021889948255,
+      "loss": 0.4854,
+      "step": 13120
+    },
+    {
+      "epoch": 35.0,
+      "eval_loss": 0.4832444489002228,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2491,
+      "eval_samples_per_second": 1.561,
+      "eval_steps_per_second": 0.098,
+      "step": 13125
+    },
+    {
+      "epoch": 35.013333333333335,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002997898863609088,
+      "loss": 0.4752,
+      "step": 13130
+    },
+    {
+      "epoch": 35.04,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029978955355916347,
+      "loss": 0.4986,
+      "step": 13140
+    },
+    {
+      "epoch": 35.06666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029978922049424717,
+      "loss": 0.4866,
+      "step": 13150
+    },
+    {
+      "epoch": 35.093333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002997888871661605,
+      "loss": 0.4816,
+      "step": 13160
+    },
+    {
+      "epoch": 35.12,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.000299788553574904,
+      "loss": 0.4774,
+      "step": 13170
+    },
+    {
+      "epoch": 35.14666666666667,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029978821972047835,
+      "loss": 0.4871,
+      "step": 13180
+    },
+    {
+      "epoch": 35.17333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002997878856028839,
+      "loss": 0.4766,
+      "step": 13190
+    },
+    {
+      "epoch": 35.2,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029978755122212153,
+      "loss": 0.4755,
+      "step": 13200
+    },
+    {
+      "epoch": 35.22666666666667,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029978721657819167,
+      "loss": 0.4735,
+      "step": 13210
+    },
+    {
+      "epoch": 35.25333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029978688167109493,
+      "loss": 0.4732,
+      "step": 13220
+    },
+    {
+      "epoch": 35.28,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002997865465008319,
+      "loss": 0.4861,
+      "step": 13230
+    },
+    {
+      "epoch": 35.306666666666665,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002997862110674032,
+      "loss": 0.4682,
+      "step": 13240
+    },
+    {
+      "epoch": 35.333333333333336,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0002997858753708093,
+      "loss": 0.4838,
+      "step": 13250
+    },
+    {
+      "epoch": 35.36,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.000299785539411051,
+      "loss": 0.4856,
+      "step": 13260
+    },
+    {
+      "epoch": 35.38666666666666,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029978520318812876,
+      "loss": 0.4776,
+      "step": 13270
+    },
+    {
+      "epoch": 35.413333333333334,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029978486670204313,
+      "loss": 0.4809,
+      "step": 13280
+    },
+    {
+      "epoch": 35.44,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002997845299527948,
+      "loss": 0.4841,
+      "step": 13290
+    },
+    {
+      "epoch": 35.46666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029978419294038425,
+      "loss": 0.4943,
+      "step": 13300
+    },
+    {
+      "epoch": 35.49333333333333,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0002997838556648122,
+      "loss": 0.4849,
+      "step": 13310
+    },
+    {
+      "epoch": 35.52,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0002997835181260791,
+      "loss": 0.4797,
+      "step": 13320
+    },
+    {
+      "epoch": 35.54666666666667,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029978318032418573,
+      "loss": 0.4756,
+      "step": 13330
+    },
+    {
+      "epoch": 35.57333333333333,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0002997828422591325,
+      "loss": 0.4751,
+      "step": 13340
+    },
+    {
+      "epoch": 35.6,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002997825039309201,
+      "loss": 0.4702,
+      "step": 13350
+    },
+    {
+      "epoch": 35.626666666666665,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029978216533954914,
+      "loss": 0.4557,
+      "step": 13360
+    },
+    {
+      "epoch": 35.653333333333336,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002997818264850202,
+      "loss": 0.4754,
+      "step": 13370
+    },
+    {
+      "epoch": 35.68,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029978148736733377,
+      "loss": 0.4661,
+      "step": 13380
+    },
+    {
+      "epoch": 35.70666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002997811479864906,
+      "loss": 0.468,
+      "step": 13390
+    },
+    {
+      "epoch": 35.733333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029978080834249123,
+      "loss": 0.487,
+      "step": 13400
+    },
+    {
+      "epoch": 35.76,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002997804684353362,
+      "loss": 0.4881,
+      "step": 13410
+    },
+    {
+      "epoch": 35.78666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029978012826502613,
+      "loss": 0.4863,
+      "step": 13420
+    },
+    {
+      "epoch": 35.81333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002997797878315617,
+      "loss": 0.4714,
+      "step": 13430
+    },
+    {
+      "epoch": 35.84,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002997794471349434,
+      "loss": 0.4779,
+      "step": 13440
+    },
+    {
+      "epoch": 35.86666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002997791061751719,
+      "loss": 0.4774,
+      "step": 13450
+    },
+    {
+      "epoch": 35.89333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002997787649522478,
+      "loss": 0.4679,
+      "step": 13460
+    },
+    {
+      "epoch": 35.92,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029977842346617164,
+      "loss": 0.4774,
+      "step": 13470
+    },
+    {
+      "epoch": 35.946666666666665,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002997780817169441,
+      "loss": 0.4706,
+      "step": 13480
+    },
+    {
+      "epoch": 35.973333333333336,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029977773970456565,
+      "loss": 0.4821,
+      "step": 13490
+    },
+    {
+      "epoch": 36.0,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029977739742903704,
+      "loss": 0.4707,
+      "step": 13500
+    },
+    {
+      "epoch": 36.0,
+      "eval_loss": 0.4858068525791168,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3774,
+      "eval_samples_per_second": 1.542,
+      "eval_steps_per_second": 0.096,
+      "step": 13500
+    },
+    {
+      "epoch": 36.026666666666664,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002997770548903588,
+      "loss": 0.4907,
+      "step": 13510
+    },
+    {
+      "epoch": 36.053333333333335,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002997767120885315,
+      "loss": 0.4971,
+      "step": 13520
+    },
+    {
+      "epoch": 36.08,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002997763690235558,
+      "loss": 0.4841,
+      "step": 13530
+    },
+    {
+      "epoch": 36.10666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029977602569543235,
+      "loss": 0.4787,
+      "step": 13540
+    },
+    {
+      "epoch": 36.13333333333333,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029977568210416163,
+      "loss": 0.4803,
+      "step": 13550
+    },
+    {
+      "epoch": 36.16,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002997753382497443,
+      "loss": 0.4852,
+      "step": 13560
+    },
+    {
+      "epoch": 36.18666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000299774994132181,
+      "loss": 0.4768,
+      "step": 13570
+    },
+    {
+      "epoch": 36.21333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029977464975147224,
+      "loss": 0.4719,
+      "step": 13580
+    },
+    {
+      "epoch": 36.24,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029977430510761874,
+      "loss": 0.4748,
+      "step": 13590
+    },
+    {
+      "epoch": 36.266666666666666,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029977396020062103,
+      "loss": 0.4798,
+      "step": 13600
+    },
+    {
+      "epoch": 36.29333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029977361503047976,
+      "loss": 0.474,
+      "step": 13610
+    },
+    {
+      "epoch": 36.32,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029977326959719546,
+      "loss": 0.4773,
+      "step": 13620
+    },
+    {
+      "epoch": 36.346666666666664,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029977292390076884,
+      "loss": 0.4857,
+      "step": 13630
+    },
+    {
+      "epoch": 36.373333333333335,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029977257794120045,
+      "loss": 0.4823,
+      "step": 13640
+    },
+    {
+      "epoch": 36.4,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002997722317184909,
+      "loss": 0.478,
+      "step": 13650
+    },
+    {
+      "epoch": 36.42666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029977188523264077,
+      "loss": 0.4842,
+      "step": 13660
+    },
+    {
+      "epoch": 36.45333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029977153848365074,
+      "loss": 0.4866,
+      "step": 13670
+    },
+    {
+      "epoch": 36.48,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029977119147152136,
+      "loss": 0.4933,
+      "step": 13680
+    },
+    {
+      "epoch": 36.50666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029977084419625325,
+      "loss": 0.4826,
+      "step": 13690
+    },
+    {
+      "epoch": 36.53333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029977049665784705,
+      "loss": 0.4756,
+      "step": 13700
+    },
+    {
+      "epoch": 36.56,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002997701488563034,
+      "loss": 0.4773,
+      "step": 13710
+    },
+    {
+      "epoch": 36.586666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002997698007916228,
+      "loss": 0.4721,
+      "step": 13720
+    },
+    {
+      "epoch": 36.61333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000299769452463806,
+      "loss": 0.4582,
+      "step": 13730
+    },
+    {
+      "epoch": 36.64,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.00029976910387285343,
+      "loss": 0.4634,
+      "step": 13740
+    },
+    {
+      "epoch": 36.666666666666664,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.00029976875501876583,
+      "loss": 0.4796,
+      "step": 13750
+    },
+    {
+      "epoch": 36.693333333333335,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.00029976840590154383,
+      "loss": 0.4618,
+      "step": 13760
+    },
+    {
+      "epoch": 36.72,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000299768056521188,
+      "loss": 0.4789,
+      "step": 13770
+    },
+    {
+      "epoch": 36.74666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029976770687769894,
+      "loss": 0.4837,
+      "step": 13780
+    },
+    {
+      "epoch": 36.77333333333333,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029976735697107736,
+      "loss": 0.4919,
+      "step": 13790
+    },
+    {
+      "epoch": 36.8,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002997670068013237,
+      "loss": 0.4775,
+      "step": 13800
+    },
+    {
+      "epoch": 36.82666666666667,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002997666563684387,
+      "loss": 0.4727,
+      "step": 13810
+    },
+    {
+      "epoch": 36.85333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.000299766305672423,
+      "loss": 0.4822,
+      "step": 13820
+    },
+    {
+      "epoch": 36.88,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002997659547132771,
+      "loss": 0.4692,
+      "step": 13830
+    },
+    {
+      "epoch": 36.906666666666666,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002997656034910017,
+      "loss": 0.4721,
+      "step": 13840
+    },
+    {
+      "epoch": 36.93333333333333,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029976525200559745,
+      "loss": 0.4734,
+      "step": 13850
+    },
+    {
+      "epoch": 36.96,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029976490025706486,
+      "loss": 0.4716,
+      "step": 13860
+    },
+    {
+      "epoch": 36.986666666666665,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002997645482454046,
+      "loss": 0.4849,
+      "step": 13870
+    },
+    {
+      "epoch": 37.0,
+      "eval_loss": 0.485599547624588,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5734,
+      "eval_samples_per_second": 1.513,
+      "eval_steps_per_second": 0.095,
+      "step": 13875
+    },
+    {
+      "epoch": 37.013333333333335,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002997641959706173,
+      "loss": 0.474,
+      "step": 13880
+    },
+    {
+      "epoch": 37.04,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002997638434327036,
+      "loss": 0.4988,
+      "step": 13890
+    },
+    {
+      "epoch": 37.06666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.000299763490631664,
+      "loss": 0.4861,
+      "step": 13900
+    },
+    {
+      "epoch": 37.093333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029976313756749937,
+      "loss": 0.4811,
+      "step": 13910
+    },
+    {
+      "epoch": 37.12,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029976278424021007,
+      "loss": 0.4768,
+      "step": 13920
+    },
+    {
+      "epoch": 37.14666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002997624306497968,
+      "loss": 0.4863,
+      "step": 13930
+    },
+    {
+      "epoch": 37.17333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002997620767962603,
+      "loss": 0.4767,
+      "step": 13940
+    },
+    {
+      "epoch": 37.2,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000299761722679601,
+      "loss": 0.4749,
+      "step": 13950
+    },
+    {
+      "epoch": 37.22666666666667,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029976136829981963,
+      "loss": 0.4727,
+      "step": 13960
+    },
+    {
+      "epoch": 37.25333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029976101365691686,
+      "loss": 0.4726,
+      "step": 13970
+    },
+    {
+      "epoch": 37.28,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029976065875089323,
+      "loss": 0.4863,
+      "step": 13980
+    },
+    {
+      "epoch": 37.306666666666665,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029976030358174936,
+      "loss": 0.4676,
+      "step": 13990
+    },
+    {
+      "epoch": 37.333333333333336,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029975994814948593,
+      "loss": 0.4835,
+      "step": 14000
+    },
+    {
+      "epoch": 37.36,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002997595924541035,
+      "loss": 0.4859,
+      "step": 14010
+    },
+    {
+      "epoch": 37.38666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029975923649560275,
+      "loss": 0.4769,
+      "step": 14020
+    },
+    {
+      "epoch": 37.413333333333334,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002997588802739843,
+      "loss": 0.4809,
+      "step": 14030
+    },
+    {
+      "epoch": 37.44,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029975852378924875,
+      "loss": 0.4837,
+      "step": 14040
+    },
+    {
+      "epoch": 37.46666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029975816704139673,
+      "loss": 0.4936,
+      "step": 14050
+    },
+    {
+      "epoch": 37.49333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002997578100304289,
+      "loss": 0.4846,
+      "step": 14060
+    },
+    {
+      "epoch": 37.52,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002997574527563458,
+      "loss": 0.4793,
+      "step": 14070
+    },
+    {
+      "epoch": 37.54666666666667,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029975709521914816,
+      "loss": 0.4752,
+      "step": 14080
+    },
+    {
+      "epoch": 37.57333333333333,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002997567374188366,
+      "loss": 0.4751,
+      "step": 14090
+    },
+    {
+      "epoch": 37.6,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029975637935541165,
+      "loss": 0.4687,
+      "step": 14100
+    },
+    {
+      "epoch": 37.626666666666665,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002997560210288741,
+      "loss": 0.4549,
+      "step": 14110
+    },
+    {
+      "epoch": 37.653333333333336,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029975566243922437,
+      "loss": 0.4743,
+      "step": 14120
+    },
+    {
+      "epoch": 37.68,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0002997553035864633,
+      "loss": 0.4666,
+      "step": 14130
+    },
+    {
+      "epoch": 37.70666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029975494447059137,
+      "loss": 0.4671,
+      "step": 14140
+    },
+    {
+      "epoch": 37.733333333333334,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002997545850916093,
+      "loss": 0.4861,
+      "step": 14150
+    },
+    {
+      "epoch": 37.76,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0002997542254495177,
+      "loss": 0.4875,
+      "step": 14160
+    },
+    {
+      "epoch": 37.78666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002997538655443171,
+      "loss": 0.4864,
+      "step": 14170
+    },
+    {
+      "epoch": 37.81333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002997535053760083,
+      "loss": 0.4709,
+      "step": 14180
+    },
+    {
+      "epoch": 37.84,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002997531449445918,
+      "loss": 0.4773,
+      "step": 14190
+    },
+    {
+      "epoch": 37.86666666666667,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029975278425006834,
+      "loss": 0.4767,
+      "step": 14200
+    },
+    {
+      "epoch": 37.89333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002997524232924385,
+      "loss": 0.4668,
+      "step": 14210
+    },
+    {
+      "epoch": 37.92,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029975206207170284,
+      "loss": 0.4772,
+      "step": 14220
+    },
+    {
+      "epoch": 37.946666666666665,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029975170058786215,
+      "loss": 0.4692,
+      "step": 14230
+    },
+    {
+      "epoch": 37.973333333333336,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.000299751338840917,
+      "loss": 0.4809,
+      "step": 14240
+    },
+    {
+      "epoch": 38.0,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000299750976830868,
+      "loss": 0.4702,
+      "step": 14250
+    },
+    {
+      "epoch": 38.0,
+      "eval_loss": 0.48388516902923584,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.8805,
+      "eval_samples_per_second": 1.347,
+      "eval_steps_per_second": 0.084,
+      "step": 14250
+    },
+    {
+      "epoch": 38.026666666666664,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0002997506145577157,
+      "loss": 0.4898,
+      "step": 14260
+    },
+    {
+      "epoch": 38.053333333333335,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.000299750252021461,
+      "loss": 0.4959,
+      "step": 14270
+    },
+    {
+      "epoch": 38.08,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002997498892221042,
+      "loss": 0.4826,
+      "step": 14280
+    },
+    {
+      "epoch": 38.10666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002997495261596462,
+      "loss": 0.478,
+      "step": 14290
+    },
+    {
+      "epoch": 38.13333333333333,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029974916283408756,
+      "loss": 0.4796,
+      "step": 14300
+    },
+    {
+      "epoch": 38.16,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029974879924542886,
+      "loss": 0.4838,
+      "step": 14310
+    },
+    {
+      "epoch": 38.18666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002997484353936708,
+      "loss": 0.476,
+      "step": 14320
+    },
+    {
+      "epoch": 38.21333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000299748071278814,
+      "loss": 0.4706,
+      "step": 14330
+    },
+    {
+      "epoch": 38.24,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002997477069008591,
+      "loss": 0.4757,
+      "step": 14340
+    },
+    {
+      "epoch": 38.266666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029974734225980676,
+      "loss": 0.4785,
+      "step": 14350
+    },
+    {
+      "epoch": 38.29333333333334,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002997469773556576,
+      "loss": 0.473,
+      "step": 14360
+    },
+    {
+      "epoch": 38.32,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002997466121884122,
+      "loss": 0.4763,
+      "step": 14370
+    },
+    {
+      "epoch": 38.346666666666664,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002997462467580714,
+      "loss": 0.4852,
+      "step": 14380
+    },
+    {
+      "epoch": 38.373333333333335,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029974588106463555,
+      "loss": 0.4808,
+      "step": 14390
+    },
+    {
+      "epoch": 38.4,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029974551510810555,
+      "loss": 0.4773,
+      "step": 14400
+    },
+    {
+      "epoch": 38.42666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002997451488884819,
+      "loss": 0.4839,
+      "step": 14410
+    },
+    {
+      "epoch": 38.45333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029974478240576535,
+      "loss": 0.486,
+      "step": 14420
+    },
+    {
+      "epoch": 38.48,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029974441565995644,
+      "loss": 0.4923,
+      "step": 14430
+    },
+    {
+      "epoch": 38.50666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002997440486510558,
+      "loss": 0.4819,
+      "step": 14440
+    },
+    {
+      "epoch": 38.53333333333333,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.00029974368137906417,
+      "loss": 0.475,
+      "step": 14450
+    },
+    {
+      "epoch": 38.56,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002997433138439822,
+      "loss": 0.4767,
+      "step": 14460
+    },
+    {
+      "epoch": 38.586666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029974294604581046,
+      "loss": 0.4724,
+      "step": 14470
+    },
+    {
+      "epoch": 38.61333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029974257798454963,
+      "loss": 0.4582,
+      "step": 14480
+    },
+    {
+      "epoch": 38.64,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002997422096602003,
+      "loss": 0.4631,
+      "step": 14490
+    },
+    {
+      "epoch": 38.666666666666664,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00029974184107276326,
+      "loss": 0.4781,
+      "step": 14500
+    },
+    {
+      "epoch": 38.693333333333335,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.000299741472222239,
+      "loss": 0.4613,
+      "step": 14510
+    },
+    {
+      "epoch": 38.72,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029974110310862826,
+      "loss": 0.4786,
+      "step": 14520
+    },
+    {
+      "epoch": 38.74666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029974073373193166,
+      "loss": 0.483,
+      "step": 14530
+    },
+    {
+      "epoch": 38.77333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002997403640921499,
+      "loss": 0.4916,
+      "step": 14540
+    },
+    {
+      "epoch": 38.8,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002997399941892835,
+      "loss": 0.4777,
+      "step": 14550
+    },
+    {
+      "epoch": 38.82666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029973962402333326,
+      "loss": 0.4716,
+      "step": 14560
+    },
+    {
+      "epoch": 38.85333333333333,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.00029973925359429973,
+      "loss": 0.4814,
+      "step": 14570
+    },
+    {
+      "epoch": 38.88,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029973888290218357,
+      "loss": 0.4692,
+      "step": 14580
+    },
+    {
+      "epoch": 38.906666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029973851194698553,
+      "loss": 0.4715,
+      "step": 14590
+    },
+    {
+      "epoch": 38.93333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029973814072870616,
+      "loss": 0.4735,
+      "step": 14600
+    },
+    {
+      "epoch": 38.96,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029973776924734606,
+      "loss": 0.4713,
+      "step": 14610
+    },
+    {
+      "epoch": 38.986666666666665,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029973739750290603,
+      "loss": 0.4846,
+      "step": 14620
+    },
+    {
+      "epoch": 39.0,
+      "eval_loss": 0.4854857325553894,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1291,
+      "eval_samples_per_second": 1.58,
+      "eval_steps_per_second": 0.099,
+      "step": 14625
+    },
+    {
+      "epoch": 39.013333333333335,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002997370254953867,
+      "loss": 0.4735,
+      "step": 14630
+    },
+    {
+      "epoch": 39.04,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002997366532247886,
+      "loss": 0.4984,
+      "step": 14640
+    },
+    {
+      "epoch": 39.06666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002997362806911125,
+      "loss": 0.4856,
+      "step": 14650
+    },
+    {
+      "epoch": 39.093333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.000299735907894359,
+      "loss": 0.481,
+      "step": 14660
+    },
+    {
+      "epoch": 39.12,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029973553483452875,
+      "loss": 0.4766,
+      "step": 14670
+    },
+    {
+      "epoch": 39.14666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002997351615116225,
+      "loss": 0.4856,
+      "step": 14680
+    },
+    {
+      "epoch": 39.17333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002997347879256408,
+      "loss": 0.4766,
+      "step": 14690
+    },
+    {
+      "epoch": 39.2,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029973441407658434,
+      "loss": 0.475,
+      "step": 14700
+    },
+    {
+      "epoch": 39.22666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002997340399644538,
+      "loss": 0.473,
+      "step": 14710
+    },
+    {
+      "epoch": 39.25333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029973366558924973,
+      "loss": 0.4719,
+      "step": 14720
+    },
+    {
+      "epoch": 39.28,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029973329095097295,
+      "loss": 0.4861,
+      "step": 14730
+    },
+    {
+      "epoch": 39.306666666666665,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.000299732916049624,
+      "loss": 0.4677,
+      "step": 14740
+    },
+    {
+      "epoch": 39.333333333333336,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.00029973254088520363,
+      "loss": 0.4827,
+      "step": 14750
+    },
+    {
+      "epoch": 39.36,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002997321654577124,
+      "loss": 0.4844,
+      "step": 14760
+    },
+    {
+      "epoch": 39.38666666666666,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.000299731789767151,
+      "loss": 0.4765,
+      "step": 14770
+    },
+    {
+      "epoch": 39.413333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029973141381352015,
+      "loss": 0.4805,
+      "step": 14780
+    },
+    {
+      "epoch": 39.44,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002997310375968205,
+      "loss": 0.4841,
+      "step": 14790
+    },
+    {
+      "epoch": 39.46666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002997306611170526,
+      "loss": 0.4931,
+      "step": 14800
+    },
+    {
+      "epoch": 39.49333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029973028437421727,
+      "loss": 0.4843,
+      "step": 14810
+    },
+    {
+      "epoch": 39.52,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.000299729907368315,
+      "loss": 0.4787,
+      "step": 14820
+    },
+    {
+      "epoch": 39.54666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002997295300993466,
+      "loss": 0.4752,
+      "step": 14830
+    },
+    {
+      "epoch": 39.57333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029972915256731267,
+      "loss": 0.475,
+      "step": 14840
+    },
+    {
+      "epoch": 39.6,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002997287747722139,
+      "loss": 0.4684,
+      "step": 14850
+    },
+    {
+      "epoch": 39.626666666666665,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029972839671405096,
+      "loss": 0.4547,
+      "step": 14860
+    },
+    {
+      "epoch": 39.653333333333336,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002997280183928244,
+      "loss": 0.4749,
+      "step": 14870
+    },
+    {
+      "epoch": 39.68,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.000299727639808535,
+      "loss": 0.4658,
+      "step": 14880
+    },
+    {
+      "epoch": 39.70666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029972726096118345,
+      "loss": 0.4669,
+      "step": 14890
+    },
+    {
+      "epoch": 39.733333333333334,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002997268818507703,
+      "loss": 0.4863,
+      "step": 14900
+    },
+    {
+      "epoch": 39.76,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029972650247729637,
+      "loss": 0.4868,
+      "step": 14910
+    },
+    {
+      "epoch": 39.78666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029972612284076214,
+      "loss": 0.4858,
+      "step": 14920
+    },
+    {
+      "epoch": 39.81333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002997257429411684,
+      "loss": 0.471,
+      "step": 14930
+    },
+    {
+      "epoch": 39.84,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002997253627785158,
+      "loss": 0.4763,
+      "step": 14940
+    },
+    {
+      "epoch": 39.86666666666667,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.000299724982352805,
+      "loss": 0.4761,
+      "step": 14950
+    },
+    {
+      "epoch": 39.89333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002997246016640367,
+      "loss": 0.4667,
+      "step": 14960
+    },
+    {
+      "epoch": 39.92,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029972422071221155,
+      "loss": 0.477,
+      "step": 14970
+    },
+    {
+      "epoch": 39.946666666666665,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029972383949733016,
+      "loss": 0.4695,
+      "step": 14980
+    },
+    {
+      "epoch": 39.973333333333336,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029972345801939325,
+      "loss": 0.481,
+      "step": 14990
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0002997230762784015,
+      "loss": 0.4699,
+      "step": 15000
+    },
+    {
+      "epoch": 40.0,
+      "eval_loss": 0.4851369857788086,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0451,
+      "eval_samples_per_second": 1.593,
+      "eval_steps_per_second": 0.1,
+      "step": 15000
+    },
+    {
+      "epoch": 40.026666666666664,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029972269427435555,
+      "loss": 0.4896,
+      "step": 15010
+    },
+    {
+      "epoch": 40.053333333333335,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029972231200725606,
+      "loss": 0.4955,
+      "step": 15020
+    },
+    {
+      "epoch": 40.08,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029972192947710376,
+      "loss": 0.4827,
+      "step": 15030
+    },
+    {
+      "epoch": 40.10666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002997215466838993,
+      "loss": 0.4776,
+      "step": 15040
+    },
+    {
+      "epoch": 40.13333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029972116362764336,
+      "loss": 0.4788,
+      "step": 15050
+    },
+    {
+      "epoch": 40.16,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029972078030833656,
+      "loss": 0.4838,
+      "step": 15060
+    },
+    {
+      "epoch": 40.18666666666667,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029972039672597966,
+      "loss": 0.4755,
+      "step": 15070
+    },
+    {
+      "epoch": 40.21333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002997200128805732,
+      "loss": 0.4709,
+      "step": 15080
+    },
+    {
+      "epoch": 40.24,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.000299719628772118,
+      "loss": 0.4746,
+      "step": 15090
+    },
+    {
+      "epoch": 40.266666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002997192444006147,
+      "loss": 0.4784,
+      "step": 15100
+    },
+    {
+      "epoch": 40.29333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002997188597660639,
+      "loss": 0.4723,
+      "step": 15110
+    },
+    {
+      "epoch": 40.32,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.00029971847486846636,
+      "loss": 0.4766,
+      "step": 15120
+    },
+    {
+      "epoch": 40.346666666666664,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029971808970782264,
+      "loss": 0.4845,
+      "step": 15130
+    },
+    {
+      "epoch": 40.373333333333335,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002997177042841336,
+      "loss": 0.4809,
+      "step": 15140
+    },
+    {
+      "epoch": 40.4,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029971731859739973,
+      "loss": 0.4772,
+      "step": 15150
+    },
+    {
+      "epoch": 40.42666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029971693264762184,
+      "loss": 0.4825,
+      "step": 15160
+    },
+    {
+      "epoch": 40.45333333333333,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029971654643480057,
+      "loss": 0.4853,
+      "step": 15170
+    },
+    {
+      "epoch": 40.48,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029971615995893656,
+      "loss": 0.492,
+      "step": 15180
+    },
+    {
+      "epoch": 40.50666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002997157732200305,
+      "loss": 0.4809,
+      "step": 15190
+    },
+    {
+      "epoch": 40.53333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029971538621808316,
+      "loss": 0.4752,
+      "step": 15200
+    },
+    {
+      "epoch": 40.56,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0002997149989530951,
+      "loss": 0.476,
+      "step": 15210
+    },
+    {
+      "epoch": 40.586666666666666,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.00029971461142506707,
+      "loss": 0.472,
+      "step": 15220
+    },
+    {
+      "epoch": 40.61333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029971422363399974,
+      "loss": 0.4579,
+      "step": 15230
+    },
+    {
+      "epoch": 40.64,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002997138355798937,
+      "loss": 0.4625,
+      "step": 15240
+    },
+    {
+      "epoch": 40.666666666666664,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002997134472627498,
+      "loss": 0.4782,
+      "step": 15250
+    },
+    {
+      "epoch": 40.693333333333335,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002997130586825686,
+      "loss": 0.4606,
+      "step": 15260
+    },
+    {
+      "epoch": 40.72,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0002997126698393508,
+      "loss": 0.478,
+      "step": 15270
+    },
+    {
+      "epoch": 40.74666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029971228073309713,
+      "loss": 0.4829,
+      "step": 15280
+    },
+    {
+      "epoch": 40.77333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029971189136380826,
+      "loss": 0.4905,
+      "step": 15290
+    },
+    {
+      "epoch": 40.8,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002997115017314848,
+      "loss": 0.4768,
+      "step": 15300
+    },
+    {
+      "epoch": 40.82666666666667,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002997111118361275,
+      "loss": 0.4717,
+      "step": 15310
+    },
+    {
+      "epoch": 40.85333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029971072167773706,
+      "loss": 0.4813,
+      "step": 15320
+    },
+    {
+      "epoch": 40.88,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029971033125631417,
+      "loss": 0.4684,
+      "step": 15330
+    },
+    {
+      "epoch": 40.906666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029970994057185944,
+      "loss": 0.4712,
+      "step": 15340
+    },
+    {
+      "epoch": 40.93333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029970954962437363,
+      "loss": 0.4729,
+      "step": 15350
+    },
+    {
+      "epoch": 40.96,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029970915841385744,
+      "loss": 0.4708,
+      "step": 15360
+    },
+    {
+      "epoch": 40.986666666666665,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002997087669403114,
+      "loss": 0.4847,
+      "step": 15370
+    },
+    {
+      "epoch": 41.0,
+      "eval_loss": 0.48485177755355835,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1975,
+      "eval_samples_per_second": 1.569,
+      "eval_steps_per_second": 0.098,
+      "step": 15375
+    },
+    {
+      "epoch": 41.013333333333335,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029970837520373645,
+      "loss": 0.4734,
+      "step": 15380
+    },
+    {
+      "epoch": 41.04,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002997079832041331,
+      "loss": 0.4973,
+      "step": 15390
+    },
+    {
+      "epoch": 41.06666666666667,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.000299707590941502,
+      "loss": 0.4856,
+      "step": 15400
+    },
+    {
+      "epoch": 41.093333333333334,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029970719841584405,
+      "loss": 0.4804,
+      "step": 15410
+    },
+    {
+      "epoch": 41.12,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029970680562715977,
+      "loss": 0.4759,
+      "step": 15420
+    },
+    {
+      "epoch": 41.14666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002997064125754499,
+      "loss": 0.485,
+      "step": 15430
+    },
+    {
+      "epoch": 41.17333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002997060192607151,
+      "loss": 0.4758,
+      "step": 15440
+    },
+    {
+      "epoch": 41.2,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029970562568295607,
+      "loss": 0.4742,
+      "step": 15450
+    },
+    {
+      "epoch": 41.22666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029970523184217354,
+      "loss": 0.4717,
+      "step": 15460
+    },
+    {
+      "epoch": 41.25333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029970483773836823,
+      "loss": 0.4713,
+      "step": 15470
+    },
+    {
+      "epoch": 41.28,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029970444337154074,
+      "loss": 0.4845,
+      "step": 15480
+    },
+    {
+      "epoch": 41.306666666666665,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029970404874169176,
+      "loss": 0.4668,
+      "step": 15490
+    },
+    {
+      "epoch": 41.333333333333336,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.00029970365384882205,
+      "loss": 0.4827,
+      "step": 15500
+    },
+    {
+      "epoch": 41.36,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002997032586929323,
+      "loss": 0.4837,
+      "step": 15510
+    },
+    {
+      "epoch": 41.38666666666666,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002997028632740232,
+      "loss": 0.4762,
+      "step": 15520
+    },
+    {
+      "epoch": 41.413333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002997024675920954,
+      "loss": 0.4795,
+      "step": 15530
+    },
+    {
+      "epoch": 41.44,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002997020716471496,
+      "loss": 0.4835,
+      "step": 15540
+    },
+    {
+      "epoch": 41.46666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029970167543918655,
+      "loss": 0.4927,
+      "step": 15550
+    },
+    {
+      "epoch": 41.49333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029970127896820697,
+      "loss": 0.4844,
+      "step": 15560
+    },
+    {
+      "epoch": 41.52,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029970088223421143,
+      "loss": 0.4781,
+      "step": 15570
+    },
+    {
+      "epoch": 41.54666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002997004852372007,
+      "loss": 0.4749,
+      "step": 15580
+    },
+    {
+      "epoch": 41.57333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002997000879771755,
+      "loss": 0.4742,
+      "step": 15590
+    },
+    {
+      "epoch": 41.6,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002996996904541365,
+      "loss": 0.467,
+      "step": 15600
+    },
+    {
+      "epoch": 41.626666666666665,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002996992926680844,
+      "loss": 0.4539,
+      "step": 15610
+    },
+    {
+      "epoch": 41.653333333333336,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002996988946190199,
+      "loss": 0.4741,
+      "step": 15620
+    },
+    {
+      "epoch": 41.68,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002996984963069437,
+      "loss": 0.4655,
+      "step": 15630
+    },
+    {
+      "epoch": 41.70666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029969809773185656,
+      "loss": 0.4663,
+      "step": 15640
+    },
+    {
+      "epoch": 41.733333333333334,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029969769889375903,
+      "loss": 0.4859,
+      "step": 15650
+    },
+    {
+      "epoch": 41.76,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.000299697299792652,
+      "loss": 0.487,
+      "step": 15660
+    },
+    {
+      "epoch": 41.78666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.000299696900428536,
+      "loss": 0.4858,
+      "step": 15670
+    },
+    {
+      "epoch": 41.81333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029969650080141185,
+      "loss": 0.4703,
+      "step": 15680
+    },
+    {
+      "epoch": 41.84,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029969610091128016,
+      "loss": 0.4762,
+      "step": 15690
+    },
+    {
+      "epoch": 41.86666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002996957007581418,
+      "loss": 0.4758,
+      "step": 15700
+    },
+    {
+      "epoch": 41.89333333333333,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002996953003419972,
+      "loss": 0.466,
+      "step": 15710
+    },
+    {
+      "epoch": 41.92,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002996948996628473,
+      "loss": 0.4765,
+      "step": 15720
+    },
+    {
+      "epoch": 41.946666666666665,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029969449872069265,
+      "loss": 0.4692,
+      "step": 15730
+    },
+    {
+      "epoch": 41.973333333333336,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002996940975155341,
+      "loss": 0.4812,
+      "step": 15740
+    },
+    {
+      "epoch": 42.0,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029969369604737226,
+      "loss": 0.47,
+      "step": 15750
+    },
+    {
+      "epoch": 42.0,
+      "eval_loss": 0.48487091064453125,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.314,
+      "eval_samples_per_second": 1.718,
+      "eval_steps_per_second": 0.107,
+      "step": 15750
+    },
+    {
+      "epoch": 42.026666666666664,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029969329431620787,
+      "loss": 0.4887,
+      "step": 15760
+    },
+    {
+      "epoch": 42.053333333333335,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002996928923220416,
+      "loss": 0.4949,
+      "step": 15770
+    },
+    {
+      "epoch": 42.08,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029969249006487416,
+      "loss": 0.4822,
+      "step": 15780
+    },
+    {
+      "epoch": 42.10666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029969208754470635,
+      "loss": 0.4768,
+      "step": 15790
+    },
+    {
+      "epoch": 42.13333333333333,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.00029969168476153875,
+      "loss": 0.4793,
+      "step": 15800
+    },
+    {
+      "epoch": 42.16,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002996912817153721,
+      "loss": 0.4829,
+      "step": 15810
+    },
+    {
+      "epoch": 42.18666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029969087840620717,
+      "loss": 0.4756,
+      "step": 15820
+    },
+    {
+      "epoch": 42.21333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002996904748340446,
+      "loss": 0.4707,
+      "step": 15830
+    },
+    {
+      "epoch": 42.24,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029969007099888513,
+      "loss": 0.4746,
+      "step": 15840
+    },
+    {
+      "epoch": 42.266666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002996896669007294,
+      "loss": 0.4777,
+      "step": 15850
+    },
+    {
+      "epoch": 42.29333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002996892625395783,
+      "loss": 0.4722,
+      "step": 15860
+    },
+    {
+      "epoch": 42.32,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002996888579154324,
+      "loss": 0.4754,
+      "step": 15870
+    },
+    {
+      "epoch": 42.346666666666664,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002996884530282924,
+      "loss": 0.4841,
+      "step": 15880
+    },
+    {
+      "epoch": 42.373333333333335,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029968804787815906,
+      "loss": 0.4809,
+      "step": 15890
+    },
+    {
+      "epoch": 42.4,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002996876424650331,
+      "loss": 0.4761,
+      "step": 15900
+    },
+    {
+      "epoch": 42.42666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029968723678891517,
+      "loss": 0.4831,
+      "step": 15910
+    },
+    {
+      "epoch": 42.45333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.000299686830849806,
+      "loss": 0.4846,
+      "step": 15920
+    },
+    {
+      "epoch": 42.48,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002996864246477064,
+      "loss": 0.4915,
+      "step": 15930
+    },
+    {
+      "epoch": 42.50666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.000299686018182617,
+      "loss": 0.481,
+      "step": 15940
+    },
+    {
+      "epoch": 42.53333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029968561145453853,
+      "loss": 0.4742,
+      "step": 15950
+    },
+    {
+      "epoch": 42.56,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029968520446347165,
+      "loss": 0.4758,
+      "step": 15960
+    },
+    {
+      "epoch": 42.586666666666666,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029968479720941717,
+      "loss": 0.4712,
+      "step": 15970
+    },
+    {
+      "epoch": 42.61333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029968438969237574,
+      "loss": 0.4573,
+      "step": 15980
+    },
+    {
+      "epoch": 42.64,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0002996839819123481,
+      "loss": 0.462,
+      "step": 15990
+    },
+    {
+      "epoch": 42.666666666666664,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000299683573869335,
+      "loss": 0.478,
+      "step": 16000
+    },
+    {
+      "epoch": 42.693333333333335,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029968316556333704,
+      "loss": 0.4604,
+      "step": 16010
+    },
+    {
+      "epoch": 42.72,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029968275699435507,
+      "loss": 0.4781,
+      "step": 16020
+    },
+    {
+      "epoch": 42.74666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002996823481623897,
+      "loss": 0.4821,
+      "step": 16030
+    },
+    {
+      "epoch": 42.77333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002996819390674418,
+      "loss": 0.4898,
+      "step": 16040
+    },
+    {
+      "epoch": 42.8,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002996815297095119,
+      "loss": 0.4766,
+      "step": 16050
+    },
+    {
+      "epoch": 42.82666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002996811200886009,
+      "loss": 0.4714,
+      "step": 16060
+    },
+    {
+      "epoch": 42.85333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029968071020470936,
+      "loss": 0.4802,
+      "step": 16070
+    },
+    {
+      "epoch": 42.88,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029968030005783815,
+      "loss": 0.4679,
+      "step": 16080
+    },
+    {
+      "epoch": 42.906666666666666,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029967988964798783,
+      "loss": 0.471,
+      "step": 16090
+    },
+    {
+      "epoch": 42.93333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029967947897515925,
+      "loss": 0.473,
+      "step": 16100
+    },
+    {
+      "epoch": 42.96,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002996790680393531,
+      "loss": 0.4701,
+      "step": 16110
+    },
+    {
+      "epoch": 42.986666666666665,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.00029967865684057,
+      "loss": 0.4849,
+      "step": 16120
+    },
+    {
+      "epoch": 43.0,
+      "eval_loss": 0.4834711253643036,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4961,
+      "eval_samples_per_second": 1.524,
+      "eval_steps_per_second": 0.095,
+      "step": 16125
+    },
+    {
+      "epoch": 43.013333333333335,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.00029967824537881086,
+      "loss": 0.4742,
+      "step": 16130
+    },
+    {
+      "epoch": 43.04,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002996778336540762,
+      "loss": 0.4984,
+      "step": 16140
+    },
+    {
+      "epoch": 43.06666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029967742166636695,
+      "loss": 0.4844,
+      "step": 16150
+    },
+    {
+      "epoch": 43.093333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029967700941568366,
+      "loss": 0.4798,
+      "step": 16160
+    },
+    {
+      "epoch": 43.12,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002996765969020271,
+      "loss": 0.4762,
+      "step": 16170
+    },
+    {
+      "epoch": 43.14666666666667,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002996761841253981,
+      "loss": 0.4852,
+      "step": 16180
+    },
+    {
+      "epoch": 43.17333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029967577108579727,
+      "loss": 0.4754,
+      "step": 16190
+    },
+    {
+      "epoch": 43.2,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029967535778322536,
+      "loss": 0.4739,
+      "step": 16200
+    },
+    {
+      "epoch": 43.22666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002996749442176831,
+      "loss": 0.4717,
+      "step": 16210
+    },
+    {
+      "epoch": 43.25333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002996745303891712,
+      "loss": 0.4714,
+      "step": 16220
+    },
+    {
+      "epoch": 43.28,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029967411629769044,
+      "loss": 0.4848,
+      "step": 16230
+    },
+    {
+      "epoch": 43.306666666666665,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029967370194324157,
+      "loss": 0.4656,
+      "step": 16240
+    },
+    {
+      "epoch": 43.333333333333336,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029967328732582516,
+      "loss": 0.4818,
+      "step": 16250
+    },
+    {
+      "epoch": 43.36,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029967287244544213,
+      "loss": 0.4842,
+      "step": 16260
+    },
+    {
+      "epoch": 43.38666666666666,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029967245730209307,
+      "loss": 0.4753,
+      "step": 16270
+    },
+    {
+      "epoch": 43.413333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029967204189577874,
+      "loss": 0.4798,
+      "step": 16280
+    },
+    {
+      "epoch": 43.44,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029967162622649996,
+      "loss": 0.4827,
+      "step": 16290
+    },
+    {
+      "epoch": 43.46666666666667,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002996712102942574,
+      "loss": 0.4926,
+      "step": 16300
+    },
+    {
+      "epoch": 43.49333333333333,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002996707940990517,
+      "loss": 0.4836,
+      "step": 16310
+    },
+    {
+      "epoch": 43.52,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0002996703776408837,
+      "loss": 0.4777,
+      "step": 16320
+    },
+    {
+      "epoch": 43.54666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002996699609197541,
+      "loss": 0.4747,
+      "step": 16330
+    },
+    {
+      "epoch": 43.57333333333333,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002996695439356636,
+      "loss": 0.4742,
+      "step": 16340
+    },
+    {
+      "epoch": 43.6,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029966912668861307,
+      "loss": 0.4675,
+      "step": 16350
+    },
+    {
+      "epoch": 43.626666666666665,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002996687091786031,
+      "loss": 0.4542,
+      "step": 16360
+    },
+    {
+      "epoch": 43.653333333333336,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0002996682914056345,
+      "loss": 0.4736,
+      "step": 16370
+    },
+    {
+      "epoch": 43.68,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002996678733697079,
+      "loss": 0.466,
+      "step": 16380
+    },
+    {
+      "epoch": 43.70666666666666,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029966745507082417,
+      "loss": 0.4658,
+      "step": 16390
+    },
+    {
+      "epoch": 43.733333333333334,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002996670365089839,
+      "loss": 0.4854,
+      "step": 16400
+    },
+    {
+      "epoch": 43.76,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.000299666617684188,
+      "loss": 0.4867,
+      "step": 16410
+    },
+    {
+      "epoch": 43.78666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029966619859643707,
+      "loss": 0.4846,
+      "step": 16420
+    },
+    {
+      "epoch": 43.81333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029966577924573186,
+      "loss": 0.4702,
+      "step": 16430
+    },
+    {
+      "epoch": 43.84,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029966535963207316,
+      "loss": 0.4756,
+      "step": 16440
+    },
+    {
+      "epoch": 43.86666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002996649397554617,
+      "loss": 0.4759,
+      "step": 16450
+    },
+    {
+      "epoch": 43.89333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029966451961589816,
+      "loss": 0.4658,
+      "step": 16460
+    },
+    {
+      "epoch": 43.92,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029966409921338333,
+      "loss": 0.4766,
+      "step": 16470
+    },
+    {
+      "epoch": 43.946666666666665,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002996636785479179,
+      "loss": 0.469,
+      "step": 16480
+    },
+    {
+      "epoch": 43.973333333333336,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002996632576195027,
+      "loss": 0.4806,
+      "step": 16490
+    },
+    {
+      "epoch": 44.0,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029966283642813836,
+      "loss": 0.4697,
+      "step": 16500
+    },
+    {
+      "epoch": 44.0,
+      "eval_loss": 0.48331379890441895,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0447,
+      "eval_samples_per_second": 1.593,
+      "eval_steps_per_second": 0.1,
+      "step": 16500
+    },
+    {
+      "epoch": 44.026666666666664,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002996624149738257,
+      "loss": 0.4885,
+      "step": 16510
+    },
+    {
+      "epoch": 44.053333333333335,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002996619932565655,
+      "loss": 0.4948,
+      "step": 16520
+    },
+    {
+      "epoch": 44.08,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029966157127635836,
+      "loss": 0.4815,
+      "step": 16530
+    },
+    {
+      "epoch": 44.10666666666667,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002996611490332051,
+      "loss": 0.4768,
+      "step": 16540
+    },
+    {
+      "epoch": 44.13333333333333,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.00029966072652710653,
+      "loss": 0.4786,
+      "step": 16550
+    },
+    {
+      "epoch": 44.16,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029966030375806323,
+      "loss": 0.483,
+      "step": 16560
+    },
+    {
+      "epoch": 44.18666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002996598807260761,
+      "loss": 0.4751,
+      "step": 16570
+    },
+    {
+      "epoch": 44.21333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002996594574311458,
+      "loss": 0.4699,
+      "step": 16580
+    },
+    {
+      "epoch": 44.24,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002996590338732731,
+      "loss": 0.4738,
+      "step": 16590
+    },
+    {
+      "epoch": 44.266666666666666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002996586100524587,
+      "loss": 0.4776,
+      "step": 16600
+    },
+    {
+      "epoch": 44.29333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002996581859687034,
+      "loss": 0.4714,
+      "step": 16610
+    },
+    {
+      "epoch": 44.32,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002996577616220079,
+      "loss": 0.4754,
+      "step": 16620
+    },
+    {
+      "epoch": 44.346666666666664,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.000299657337012373,
+      "loss": 0.4835,
+      "step": 16630
+    },
+    {
+      "epoch": 44.373333333333335,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029965691213979943,
+      "loss": 0.4795,
+      "step": 16640
+    },
+    {
+      "epoch": 44.4,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002996564870042879,
+      "loss": 0.4753,
+      "step": 16650
+    },
+    {
+      "epoch": 44.42666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029965606160583917,
+      "loss": 0.4826,
+      "step": 16660
+    },
+    {
+      "epoch": 44.45333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.000299655635944454,
+      "loss": 0.485,
+      "step": 16670
+    },
+    {
+      "epoch": 44.48,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029965521002013314,
+      "loss": 0.4904,
+      "step": 16680
+    },
+    {
+      "epoch": 44.50666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029965478383287735,
+      "loss": 0.4805,
+      "step": 16690
+    },
+    {
+      "epoch": 44.53333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029965435738268736,
+      "loss": 0.4738,
+      "step": 16700
+    },
+    {
+      "epoch": 44.56,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002996539306695639,
+      "loss": 0.4753,
+      "step": 16710
+    },
+    {
+      "epoch": 44.586666666666666,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029965350369350773,
+      "loss": 0.4712,
+      "step": 16720
+    },
+    {
+      "epoch": 44.61333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002996530764545197,
+      "loss": 0.457,
+      "step": 16730
+    },
+    {
+      "epoch": 44.64,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029965264895260034,
+      "loss": 0.4616,
+      "step": 16740
+    },
+    {
+      "epoch": 44.666666666666664,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002996522211877506,
+      "loss": 0.4769,
+      "step": 16750
+    },
+    {
+      "epoch": 44.693333333333335,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002996517931599712,
+      "loss": 0.4605,
+      "step": 16760
+    },
+    {
+      "epoch": 44.72,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002996513648692628,
+      "loss": 0.477,
+      "step": 16770
+    },
+    {
+      "epoch": 44.74666666666667,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.00029965093631562626,
+      "loss": 0.4824,
+      "step": 16780
+    },
+    {
+      "epoch": 44.77333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029965050749906224,
+      "loss": 0.4896,
+      "step": 16790
+    },
+    {
+      "epoch": 44.8,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002996500784195715,
+      "loss": 0.476,
+      "step": 16800
+    },
+    {
+      "epoch": 44.82666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002996496490771549,
+      "loss": 0.4719,
+      "step": 16810
+    },
+    {
+      "epoch": 44.85333333333333,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.00029964921947181315,
+      "loss": 0.4802,
+      "step": 16820
+    },
+    {
+      "epoch": 44.88,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002996487896035469,
+      "loss": 0.4676,
+      "step": 16830
+    },
+    {
+      "epoch": 44.906666666666666,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.000299648359472357,
+      "loss": 0.4704,
+      "step": 16840
+    },
+    {
+      "epoch": 44.93333333333333,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0002996479290782442,
+      "loss": 0.4723,
+      "step": 16850
+    },
+    {
+      "epoch": 44.96,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.00029964749842120925,
+      "loss": 0.4709,
+      "step": 16860
+    },
+    {
+      "epoch": 44.986666666666665,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002996470675012529,
+      "loss": 0.4837,
+      "step": 16870
+    },
+    {
+      "epoch": 45.0,
+      "eval_loss": 0.48572611808776855,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8974,
+      "eval_samples_per_second": 1.617,
+      "eval_steps_per_second": 0.101,
+      "step": 16875
+    },
+    {
+      "epoch": 45.013333333333335,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002996466363183759,
+      "loss": 0.4731,
+      "step": 16880
+    },
+    {
+      "epoch": 45.04,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029964620487257906,
+      "loss": 0.4974,
+      "step": 16890
+    },
+    {
+      "epoch": 45.06666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029964577316386303,
+      "loss": 0.4854,
+      "step": 16900
+    },
+    {
+      "epoch": 45.093333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029964534119222873,
+      "loss": 0.4791,
+      "step": 16910
+    },
+    {
+      "epoch": 45.12,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002996449089576767,
+      "loss": 0.4765,
+      "step": 16920
+    },
+    {
+      "epoch": 45.14666666666667,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002996444764602079,
+      "loss": 0.4852,
+      "step": 16930
+    },
+    {
+      "epoch": 45.17333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.000299644043699823,
+      "loss": 0.4749,
+      "step": 16940
+    },
+    {
+      "epoch": 45.2,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029964361067652274,
+      "loss": 0.4732,
+      "step": 16950
+    },
+    {
+      "epoch": 45.22666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029964317739030795,
+      "loss": 0.4711,
+      "step": 16960
+    },
+    {
+      "epoch": 45.25333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029964274384117933,
+      "loss": 0.4709,
+      "step": 16970
+    },
+    {
+      "epoch": 45.28,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002996423100291377,
+      "loss": 0.4844,
+      "step": 16980
+    },
+    {
+      "epoch": 45.306666666666665,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029964187595418377,
+      "loss": 0.466,
+      "step": 16990
+    },
+    {
+      "epoch": 45.333333333333336,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002996414416163183,
+      "loss": 0.4814,
+      "step": 17000
+    },
+    {
+      "epoch": 45.36,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002996410070155421,
+      "loss": 0.4836,
+      "step": 17010
+    },
+    {
+      "epoch": 45.38666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002996405721518559,
+      "loss": 0.4747,
+      "step": 17020
+    },
+    {
+      "epoch": 45.413333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029964013702526044,
+      "loss": 0.4791,
+      "step": 17030
+    },
+    {
+      "epoch": 45.44,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002996397016357566,
+      "loss": 0.4826,
+      "step": 17040
+    },
+    {
+      "epoch": 45.46666666666667,
+      "grad_norm": 0.25,
+      "learning_rate": 0.000299639265983345,
+      "loss": 0.4925,
+      "step": 17050
+    },
+    {
+      "epoch": 45.49333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029963883006802643,
+      "loss": 0.4828,
+      "step": 17060
+    },
+    {
+      "epoch": 45.52,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029963839388980176,
+      "loss": 0.4774,
+      "step": 17070
+    },
+    {
+      "epoch": 45.54666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002996379574486716,
+      "loss": 0.4747,
+      "step": 17080
+    },
+    {
+      "epoch": 45.57333333333333,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0002996375207446369,
+      "loss": 0.4732,
+      "step": 17090
+    },
+    {
+      "epoch": 45.6,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002996370837776983,
+      "loss": 0.4671,
+      "step": 17100
+    },
+    {
+      "epoch": 45.626666666666665,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002996366465478566,
+      "loss": 0.4539,
+      "step": 17110
+    },
+    {
+      "epoch": 45.653333333333336,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029963620905511257,
+      "loss": 0.4736,
+      "step": 17120
+    },
+    {
+      "epoch": 45.68,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.000299635771299467,
+      "loss": 0.4649,
+      "step": 17130
+    },
+    {
+      "epoch": 45.70666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002996353332809206,
+      "loss": 0.4655,
+      "step": 17140
+    },
+    {
+      "epoch": 45.733333333333334,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002996348949994742,
+      "loss": 0.4842,
+      "step": 17150
+    },
+    {
+      "epoch": 45.76,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002996344564551285,
+      "loss": 0.486,
+      "step": 17160
+    },
+    {
+      "epoch": 45.78666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002996340176478843,
+      "loss": 0.4849,
+      "step": 17170
+    },
+    {
+      "epoch": 45.81333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029963357857774246,
+      "loss": 0.4692,
+      "step": 17180
+    },
+    {
+      "epoch": 45.84,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029963313924470365,
+      "loss": 0.4752,
+      "step": 17190
+    },
+    {
+      "epoch": 45.86666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002996326996487687,
+      "loss": 0.4756,
+      "step": 17200
+    },
+    {
+      "epoch": 45.89333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029963225978993835,
+      "loss": 0.4657,
+      "step": 17210
+    },
+    {
+      "epoch": 45.92,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002996318196682133,
+      "loss": 0.4763,
+      "step": 17220
+    },
+    {
+      "epoch": 45.946666666666665,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029963137928359446,
+      "loss": 0.469,
+      "step": 17230
+    },
+    {
+      "epoch": 45.973333333333336,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029963093863608254,
+      "loss": 0.4803,
+      "step": 17240
+    },
+    {
+      "epoch": 46.0,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029963049772567833,
+      "loss": 0.4688,
+      "step": 17250
+    },
+    {
+      "epoch": 46.0,
+      "eval_loss": 0.4832427501678467,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9287,
+      "eval_samples_per_second": 1.611,
+      "eval_steps_per_second": 0.101,
+      "step": 17250
+    },
+    {
+      "epoch": 46.026666666666664,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029963005655238257,
+      "loss": 0.4883,
+      "step": 17260
+    },
+    {
+      "epoch": 46.053333333333335,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029962961511619603,
+      "loss": 0.495,
+      "step": 17270
+    },
+    {
+      "epoch": 46.08,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029962917341711957,
+      "loss": 0.4812,
+      "step": 17280
+    },
+    {
+      "epoch": 46.10666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002996287314551539,
+      "loss": 0.4773,
+      "step": 17290
+    },
+    {
+      "epoch": 46.13333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002996282892302998,
+      "loss": 0.4782,
+      "step": 17300
+    },
+    {
+      "epoch": 46.16,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.000299627846742558,
+      "loss": 0.4832,
+      "step": 17310
+    },
+    {
+      "epoch": 46.18666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002996274039919294,
+      "loss": 0.4748,
+      "step": 17320
+    },
+    {
+      "epoch": 46.21333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002996269609784146,
+      "loss": 0.4692,
+      "step": 17330
+    },
+    {
+      "epoch": 46.24,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029962651770201455,
+      "loss": 0.4727,
+      "step": 17340
+    },
+    {
+      "epoch": 46.266666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029962607416273,
+      "loss": 0.4775,
+      "step": 17350
+    },
+    {
+      "epoch": 46.29333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002996256303605617,
+      "loss": 0.4716,
+      "step": 17360
+    },
+    {
+      "epoch": 46.32,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002996251862955104,
+      "loss": 0.4758,
+      "step": 17370
+    },
+    {
+      "epoch": 46.346666666666664,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002996247419675769,
+      "loss": 0.4838,
+      "step": 17380
+    },
+    {
+      "epoch": 46.373333333333335,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002996242973767619,
+      "loss": 0.4796,
+      "step": 17390
+    },
+    {
+      "epoch": 46.4,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029962385252306636,
+      "loss": 0.476,
+      "step": 17400
+    },
+    {
+      "epoch": 46.42666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000299623407406491,
+      "loss": 0.4826,
+      "step": 17410
+    },
+    {
+      "epoch": 46.45333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029962296202703647,
+      "loss": 0.4841,
+      "step": 17420
+    },
+    {
+      "epoch": 46.48,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002996225163847037,
+      "loss": 0.4904,
+      "step": 17430
+    },
+    {
+      "epoch": 46.50666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029962207047949343,
+      "loss": 0.4799,
+      "step": 17440
+    },
+    {
+      "epoch": 46.53333333333333,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.00029962162431140647,
+      "loss": 0.4737,
+      "step": 17450
+    },
+    {
+      "epoch": 46.56,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.00029962117788044347,
+      "loss": 0.4753,
+      "step": 17460
+    },
+    {
+      "epoch": 46.586666666666666,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0002996207311866054,
+      "loss": 0.4707,
+      "step": 17470
+    },
+    {
+      "epoch": 46.61333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029962028422989294,
+      "loss": 0.456,
+      "step": 17480
+    },
+    {
+      "epoch": 46.64,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00029961983701030687,
+      "loss": 0.4615,
+      "step": 17490
+    },
+    {
+      "epoch": 46.666666666666664,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.00029961938952784807,
+      "loss": 0.4771,
+      "step": 17500
+    },
+    {
+      "epoch": 46.693333333333335,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002996189417825172,
+      "loss": 0.4605,
+      "step": 17510
+    },
+    {
+      "epoch": 46.72,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002996184937743151,
+      "loss": 0.477,
+      "step": 17520
+    },
+    {
+      "epoch": 46.74666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002996180455032426,
+      "loss": 0.4819,
+      "step": 17530
+    },
+    {
+      "epoch": 46.77333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002996175969693004,
+      "loss": 0.4893,
+      "step": 17540
+    },
+    {
+      "epoch": 46.8,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029961714817248943,
+      "loss": 0.4758,
+      "step": 17550
+    },
+    {
+      "epoch": 46.82666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002996166991128103,
+      "loss": 0.4699,
+      "step": 17560
+    },
+    {
+      "epoch": 46.85333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002996162497902639,
+      "loss": 0.4798,
+      "step": 17570
+    },
+    {
+      "epoch": 46.88,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.000299615800204851,
+      "loss": 0.4676,
+      "step": 17580
+    },
+    {
+      "epoch": 46.906666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002996153503565724,
+      "loss": 0.47,
+      "step": 17590
+    },
+    {
+      "epoch": 46.93333333333333,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002996149002454289,
+      "loss": 0.4715,
+      "step": 17600
+    },
+    {
+      "epoch": 46.96,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002996144498714212,
+      "loss": 0.4695,
+      "step": 17610
+    },
+    {
+      "epoch": 46.986666666666665,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00029961399923455025,
+      "loss": 0.4833,
+      "step": 17620
+    },
+    {
+      "epoch": 47.0,
+      "eval_loss": 0.48381996154785156,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.2857,
+      "eval_samples_per_second": 1.418,
+      "eval_steps_per_second": 0.089,
+      "step": 17625
+    },
+    {
+      "epoch": 47.013333333333335,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0002996135483348168,
+      "loss": 0.4723,
+      "step": 17630
+    },
+    {
+      "epoch": 47.04,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0002996130971722215,
+      "loss": 0.4971,
+      "step": 17640
+    },
+    {
+      "epoch": 47.06666666666667,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0002996126457467653,
+      "loss": 0.4871,
+      "step": 17650
+    },
+    {
+      "epoch": 47.093333333333334,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00029961219405844893,
+      "loss": 0.4813,
+      "step": 17660
+    },
+    {
+      "epoch": 47.12,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002996117421072732,
+      "loss": 0.4759,
+      "step": 17670
+    },
+    {
+      "epoch": 47.14666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029961128989323884,
+      "loss": 0.4853,
+      "step": 17680
+    },
+    {
+      "epoch": 47.17333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029961083741634673,
+      "loss": 0.4748,
+      "step": 17690
+    },
+    {
+      "epoch": 47.2,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002996103846765976,
+      "loss": 0.4733,
+      "step": 17700
+    },
+    {
+      "epoch": 47.22666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029960993167399235,
+      "loss": 0.4716,
+      "step": 17710
+    },
+    {
+      "epoch": 47.25333333333333,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029960947840853165,
+      "loss": 0.4708,
+      "step": 17720
+    },
+    {
+      "epoch": 47.28,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002996090248802164,
+      "loss": 0.4833,
+      "step": 17730
+    },
+    {
+      "epoch": 47.306666666666665,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029960857108904734,
+      "loss": 0.4658,
+      "step": 17740
+    },
+    {
+      "epoch": 47.333333333333336,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0002996081170350252,
+      "loss": 0.4807,
+      "step": 17750
+    },
+    {
+      "epoch": 47.36,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00029960766271815094,
+      "loss": 0.4834,
+      "step": 17760
+    },
+    {
+      "epoch": 47.38666666666666,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029960720813842524,
+      "loss": 0.4749,
+      "step": 17770
+    },
+    {
+      "epoch": 47.413333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002996067532958489,
+      "loss": 0.4792,
+      "step": 17780
+    },
+    {
+      "epoch": 47.44,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029960629819042277,
+      "loss": 0.4824,
+      "step": 17790
+    },
+    {
+      "epoch": 47.46666666666667,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002996058428221477,
+      "loss": 0.4921,
+      "step": 17800
+    },
+    {
+      "epoch": 47.49333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002996053871910243,
+      "loss": 0.483,
+      "step": 17810
+    },
+    {
+      "epoch": 47.52,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0002996049312970535,
+      "loss": 0.4768,
+      "step": 17820
+    },
+    {
+      "epoch": 47.54666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029960447514023616,
+      "loss": 0.4744,
+      "step": 17830
+    },
+    {
+      "epoch": 47.57333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.000299604018720573,
+      "loss": 0.4739,
+      "step": 17840
+    },
+    {
+      "epoch": 47.6,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002996035620380648,
+      "loss": 0.4673,
+      "step": 17850
+    },
+    {
+      "epoch": 47.626666666666665,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.00029960310509271243,
+      "loss": 0.4534,
+      "step": 17860
+    },
+    {
+      "epoch": 47.653333333333336,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029960264788451665,
+      "loss": 0.4735,
+      "step": 17870
+    },
+    {
+      "epoch": 47.68,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002996021904134783,
+      "loss": 0.4647,
+      "step": 17880
+    },
+    {
+      "epoch": 47.70666666666666,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029960173267959807,
+      "loss": 0.4652,
+      "step": 17890
+    },
+    {
+      "epoch": 47.733333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002996012746828769,
+      "loss": 0.4844,
+      "step": 17900
+    },
+    {
+      "epoch": 47.76,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002996008164233156,
+      "loss": 0.4863,
+      "step": 17910
+    },
+    {
+      "epoch": 47.78666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002996003579009148,
+      "loss": 0.4844,
+      "step": 17920
+    },
+    {
+      "epoch": 47.81333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002995998991156755,
+      "loss": 0.4688,
+      "step": 17930
+    },
+    {
+      "epoch": 47.84,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029959944006759847,
+      "loss": 0.4757,
+      "step": 17940
+    },
+    {
+      "epoch": 47.86666666666667,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029959898075668435,
+      "loss": 0.4747,
+      "step": 17950
+    },
+    {
+      "epoch": 47.89333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002995985211829342,
+      "loss": 0.4652,
+      "step": 17960
+    },
+    {
+      "epoch": 47.92,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002995980613463486,
+      "loss": 0.4757,
+      "step": 17970
+    },
+    {
+      "epoch": 47.946666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029959760124692857,
+      "loss": 0.4678,
+      "step": 17980
+    },
+    {
+      "epoch": 47.973333333333336,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002995971408846747,
+      "loss": 0.4794,
+      "step": 17990
+    },
+    {
+      "epoch": 48.0,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.000299596680259588,
+      "loss": 0.468,
+      "step": 18000
+    },
+    {
+      "epoch": 48.0,
+      "eval_loss": 0.48210230469703674,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1772,
+      "eval_samples_per_second": 1.572,
+      "eval_steps_per_second": 0.098,
+      "step": 18000
+    },
+    {
+      "epoch": 48.026666666666664,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029959621937166917,
+      "loss": 0.4873,
+      "step": 18010
+    },
+    {
+      "epoch": 48.053333333333335,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.000299595758220919,
+      "loss": 0.4939,
+      "step": 18020
+    },
+    {
+      "epoch": 48.08,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029959529680733836,
+      "loss": 0.4816,
+      "step": 18030
+    },
+    {
+      "epoch": 48.10666666666667,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.000299594835130928,
+      "loss": 0.4762,
+      "step": 18040
+    },
+    {
+      "epoch": 48.13333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002995943731916888,
+      "loss": 0.4779,
+      "step": 18050
+    },
+    {
+      "epoch": 48.16,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029959391098962154,
+      "loss": 0.4823,
+      "step": 18060
+    },
+    {
+      "epoch": 48.18666666666667,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029959344852472703,
+      "loss": 0.4744,
+      "step": 18070
+    },
+    {
+      "epoch": 48.21333333333333,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002995929857970061,
+      "loss": 0.4694,
+      "step": 18080
+    },
+    {
+      "epoch": 48.24,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029959252280645953,
+      "loss": 0.4732,
+      "step": 18090
+    },
+    {
+      "epoch": 48.266666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029959205955308816,
+      "loss": 0.4763,
+      "step": 18100
+    },
+    {
+      "epoch": 48.29333333333334,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0002995915960368928,
+      "loss": 0.471,
+      "step": 18110
+    },
+    {
+      "epoch": 48.32,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00029959113225787423,
+      "loss": 0.4744,
+      "step": 18120
+    },
+    {
+      "epoch": 48.346666666666664,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002995906682160333,
+      "loss": 0.4828,
+      "step": 18130
+    },
+    {
+      "epoch": 48.373333333333335,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029959020391137083,
+      "loss": 0.479,
+      "step": 18140
+    },
+    {
+      "epoch": 48.4,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002995897393438876,
+      "loss": 0.4753,
+      "step": 18150
+    },
+    {
+      "epoch": 48.42666666666667,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029958927451358445,
+      "loss": 0.4823,
+      "step": 18160
+    },
+    {
+      "epoch": 48.45333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002995888094204623,
+      "loss": 0.4848,
+      "step": 18170
+    },
+    {
+      "epoch": 48.48,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029958834406452177,
+      "loss": 0.4902,
+      "step": 18180
+    },
+    {
+      "epoch": 48.50666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002995878784457638,
+      "loss": 0.4799,
+      "step": 18190
+    },
+    {
+      "epoch": 48.53333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029958741256418917,
+      "loss": 0.4733,
+      "step": 18200
+    },
+    {
+      "epoch": 48.56,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002995869464197987,
+      "loss": 0.4745,
+      "step": 18210
+    },
+    {
+      "epoch": 48.586666666666666,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002995864800125932,
+      "loss": 0.4702,
+      "step": 18220
+    },
+    {
+      "epoch": 48.61333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002995860133425736,
+      "loss": 0.4564,
+      "step": 18230
+    },
+    {
+      "epoch": 48.64,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.00029958554640974053,
+      "loss": 0.4601,
+      "step": 18240
+    },
+    {
+      "epoch": 48.666666666666664,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002995850792140949,
+      "loss": 0.4764,
+      "step": 18250
+    },
+    {
+      "epoch": 48.693333333333335,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029958461175563767,
+      "loss": 0.4598,
+      "step": 18260
+    },
+    {
+      "epoch": 48.72,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029958414403436943,
+      "loss": 0.4768,
+      "step": 18270
+    },
+    {
+      "epoch": 48.74666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029958367605029113,
+      "loss": 0.4815,
+      "step": 18280
+    },
+    {
+      "epoch": 48.77333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002995832078034035,
+      "loss": 0.4889,
+      "step": 18290
+    },
+    {
+      "epoch": 48.8,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002995827392937075,
+      "loss": 0.4761,
+      "step": 18300
+    },
+    {
+      "epoch": 48.82666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029958227052120386,
+      "loss": 0.4706,
+      "step": 18310
+    },
+    {
+      "epoch": 48.85333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029958180148589343,
+      "loss": 0.4792,
+      "step": 18320
+    },
+    {
+      "epoch": 48.88,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.000299581332187777,
+      "loss": 0.4676,
+      "step": 18330
+    },
+    {
+      "epoch": 48.906666666666666,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029958086262685545,
+      "loss": 0.4696,
+      "step": 18340
+    },
+    {
+      "epoch": 48.93333333333333,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00029958039280312957,
+      "loss": 0.4715,
+      "step": 18350
+    },
+    {
+      "epoch": 48.96,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002995799227166002,
+      "loss": 0.4703,
+      "step": 18360
+    },
+    {
+      "epoch": 48.986666666666665,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029957945236726817,
+      "loss": 0.4831,
+      "step": 18370
+    },
+    {
+      "epoch": 49.0,
+      "eval_loss": 0.48411810398101807,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0006,
+      "eval_samples_per_second": 1.6,
+      "eval_steps_per_second": 0.1,
+      "step": 18375
+    },
+    {
+      "epoch": 49.013333333333335,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029957898175513427,
+      "loss": 0.4721,
+      "step": 18380
+    },
+    {
+      "epoch": 49.04,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029957851088019936,
+      "loss": 0.4971,
+      "step": 18390
+    },
+    {
+      "epoch": 49.06666666666667,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029957803974246425,
+      "loss": 0.4837,
+      "step": 18400
+    },
+    {
+      "epoch": 49.093333333333334,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029957756834192976,
+      "loss": 0.4786,
+      "step": 18410
+    },
+    {
+      "epoch": 49.12,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002995770966785968,
+      "loss": 0.4751,
+      "step": 18420
+    },
+    {
+      "epoch": 49.14666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002995766247524661,
+      "loss": 0.4849,
+      "step": 18430
+    },
+    {
+      "epoch": 49.17333333333333,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002995761525635385,
+      "loss": 0.475,
+      "step": 18440
+    },
+    {
+      "epoch": 49.2,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029957568011181485,
+      "loss": 0.473,
+      "step": 18450
+    },
+    {
+      "epoch": 49.22666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.000299575207397296,
+      "loss": 0.4709,
+      "step": 18460
+    },
+    {
+      "epoch": 49.25333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029957473441998277,
+      "loss": 0.4707,
+      "step": 18470
+    },
+    {
+      "epoch": 49.28,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.000299574261179876,
+      "loss": 0.4839,
+      "step": 18480
+    },
+    {
+      "epoch": 49.306666666666665,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029957378767697646,
+      "loss": 0.465,
+      "step": 18490
+    },
+    {
+      "epoch": 49.333333333333336,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029957331391128507,
+      "loss": 0.4814,
+      "step": 18500
+    },
+    {
+      "epoch": 49.36,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002995728398828026,
+      "loss": 0.4832,
+      "step": 18510
+    },
+    {
+      "epoch": 49.38666666666666,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002995723655915299,
+      "loss": 0.4752,
+      "step": 18520
+    },
+    {
+      "epoch": 49.413333333333334,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002995718910374678,
+      "loss": 0.4794,
+      "step": 18530
+    },
+    {
+      "epoch": 49.44,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029957141622061716,
+      "loss": 0.482,
+      "step": 18540
+    },
+    {
+      "epoch": 49.46666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029957094114097877,
+      "loss": 0.4915,
+      "step": 18550
+    },
+    {
+      "epoch": 49.49333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029957046579855346,
+      "loss": 0.4822,
+      "step": 18560
+    },
+    {
+      "epoch": 49.52,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002995699901933422,
+      "loss": 0.4771,
+      "step": 18570
+    },
+    {
+      "epoch": 49.54666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002995695143253457,
+      "loss": 0.474,
+      "step": 18580
+    },
+    {
+      "epoch": 49.57333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002995690381945647,
+      "loss": 0.4731,
+      "step": 18590
+    },
+    {
+      "epoch": 49.6,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029956856180100025,
+      "loss": 0.4667,
+      "step": 18600
+    },
+    {
+      "epoch": 49.626666666666665,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029956808514465305,
+      "loss": 0.4534,
+      "step": 18610
+    },
+    {
+      "epoch": 49.653333333333336,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029956760822552397,
+      "loss": 0.4725,
+      "step": 18620
+    },
+    {
+      "epoch": 49.68,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002995671310436139,
+      "loss": 0.4642,
+      "step": 18630
+    },
+    {
+      "epoch": 49.70666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002995666535989236,
+      "loss": 0.4648,
+      "step": 18640
+    },
+    {
+      "epoch": 49.733333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029956617589145393,
+      "loss": 0.4834,
+      "step": 18650
+    },
+    {
+      "epoch": 49.76,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002995656979212058,
+      "loss": 0.4853,
+      "step": 18660
+    },
+    {
+      "epoch": 49.78666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029956521968817993,
+      "loss": 0.4841,
+      "step": 18670
+    },
+    {
+      "epoch": 49.81333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029956474119237727,
+      "loss": 0.4688,
+      "step": 18680
+    },
+    {
+      "epoch": 49.84,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029956426243379854,
+      "loss": 0.4751,
+      "step": 18690
+    },
+    {
+      "epoch": 49.86666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002995637834124447,
+      "loss": 0.4746,
+      "step": 18700
+    },
+    {
+      "epoch": 49.89333333333333,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029956330412831653,
+      "loss": 0.4646,
+      "step": 18710
+    },
+    {
+      "epoch": 49.92,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029956282458141494,
+      "loss": 0.476,
+      "step": 18720
+    },
+    {
+      "epoch": 49.946666666666665,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029956234477174063,
+      "loss": 0.4674,
+      "step": 18730
+    },
+    {
+      "epoch": 49.973333333333336,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002995618646992946,
+      "loss": 0.4787,
+      "step": 18740
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002995613843640776,
+      "loss": 0.4685,
+      "step": 18750
+    },
+    {
+      "epoch": 50.0,
+      "eval_loss": 0.48534244298934937,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7591,
+      "eval_samples_per_second": 1.487,
+      "eval_steps_per_second": 0.093,
+      "step": 18750
+    },
+    {
+      "epoch": 50.026666666666664,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029956090376609054,
+      "loss": 0.4884,
+      "step": 18760
+    },
+    {
+      "epoch": 50.053333333333335,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029956042290533416,
+      "loss": 0.4938,
+      "step": 18770
+    },
+    {
+      "epoch": 50.08,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002995599417818094,
+      "loss": 0.4814,
+      "step": 18780
+    },
+    {
+      "epoch": 50.10666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029955946039551703,
+      "loss": 0.4759,
+      "step": 18790
+    },
+    {
+      "epoch": 50.13333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029955897874645797,
+      "loss": 0.478,
+      "step": 18800
+    },
+    {
+      "epoch": 50.16,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029955849683463307,
+      "loss": 0.4813,
+      "step": 18810
+    },
+    {
+      "epoch": 50.18666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002995580146600431,
+      "loss": 0.474,
+      "step": 18820
+    },
+    {
+      "epoch": 50.21333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.000299557532222689,
+      "loss": 0.4684,
+      "step": 18830
+    },
+    {
+      "epoch": 50.24,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029955704952257153,
+      "loss": 0.473,
+      "step": 18840
+    },
+    {
+      "epoch": 50.266666666666666,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029955656655969155,
+      "loss": 0.4763,
+      "step": 18850
+    },
+    {
+      "epoch": 50.29333333333334,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.00029955608333405,
+      "loss": 0.4711,
+      "step": 18860
+    },
+    {
+      "epoch": 50.32,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002995555998456476,
+      "loss": 0.4745,
+      "step": 18870
+    },
+    {
+      "epoch": 50.346666666666664,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002995551160944853,
+      "loss": 0.4826,
+      "step": 18880
+    },
+    {
+      "epoch": 50.373333333333335,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002995546320805639,
+      "loss": 0.479,
+      "step": 18890
+    },
+    {
+      "epoch": 50.4,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029955414780388426,
+      "loss": 0.4751,
+      "step": 18900
+    },
+    {
+      "epoch": 50.42666666666667,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.00029955366326444725,
+      "loss": 0.4818,
+      "step": 18910
+    },
+    {
+      "epoch": 50.45333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002995531784622537,
+      "loss": 0.4842,
+      "step": 18920
+    },
+    {
+      "epoch": 50.48,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002995526933973044,
+      "loss": 0.4912,
+      "step": 18930
+    },
+    {
+      "epoch": 50.50666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029955220806960036,
+      "loss": 0.4796,
+      "step": 18940
+    },
+    {
+      "epoch": 50.53333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029955172247914233,
+      "loss": 0.4727,
+      "step": 18950
+    },
+    {
+      "epoch": 50.56,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029955123662593115,
+      "loss": 0.4741,
+      "step": 18960
+    },
+    {
+      "epoch": 50.586666666666666,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002995507505099677,
+      "loss": 0.4708,
+      "step": 18970
+    },
+    {
+      "epoch": 50.61333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029955026413125283,
+      "loss": 0.4556,
+      "step": 18980
+    },
+    {
+      "epoch": 50.64,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002995497774897874,
+      "loss": 0.4604,
+      "step": 18990
+    },
+    {
+      "epoch": 50.666666666666664,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029954929058557223,
+      "loss": 0.4768,
+      "step": 19000
+    },
+    {
+      "epoch": 50.693333333333335,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029954880341860824,
+      "loss": 0.4595,
+      "step": 19010
+    },
+    {
+      "epoch": 50.72,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029954831598889625,
+      "loss": 0.476,
+      "step": 19020
+    },
+    {
+      "epoch": 50.74666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002995478282964372,
+      "loss": 0.481,
+      "step": 19030
+    },
+    {
+      "epoch": 50.77333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029954734034123176,
+      "loss": 0.4885,
+      "step": 19040
+    },
+    {
+      "epoch": 50.8,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029954685212328093,
+      "loss": 0.4756,
+      "step": 19050
+    },
+    {
+      "epoch": 50.82666666666667,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029954636364258553,
+      "loss": 0.4701,
+      "step": 19060
+    },
+    {
+      "epoch": 50.85333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029954587489914637,
+      "loss": 0.4793,
+      "step": 19070
+    },
+    {
+      "epoch": 50.88,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029954538589296447,
+      "loss": 0.4665,
+      "step": 19080
+    },
+    {
+      "epoch": 50.906666666666666,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002995448966240405,
+      "loss": 0.47,
+      "step": 19090
+    },
+    {
+      "epoch": 50.93333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002995444070923754,
+      "loss": 0.4712,
+      "step": 19100
+    },
+    {
+      "epoch": 50.96,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029954391729797005,
+      "loss": 0.4684,
+      "step": 19110
+    },
+    {
+      "epoch": 50.986666666666665,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002995434272408252,
+      "loss": 0.4824,
+      "step": 19120
+    },
+    {
+      "epoch": 51.0,
+      "eval_loss": 0.482730507850647,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0939,
+      "eval_samples_per_second": 1.585,
+      "eval_steps_per_second": 0.099,
+      "step": 19125
+    },
+    {
+      "epoch": 51.013333333333335,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029954293692094195,
+      "loss": 0.4726,
+      "step": 19130
+    },
+    {
+      "epoch": 51.04,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002995424463383209,
+      "loss": 0.4961,
+      "step": 19140
+    },
+    {
+      "epoch": 51.06666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029954195549296303,
+      "loss": 0.4835,
+      "step": 19150
+    },
+    {
+      "epoch": 51.093333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002995414643848693,
+      "loss": 0.4782,
+      "step": 19160
+    },
+    {
+      "epoch": 51.12,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002995409730140403,
+      "loss": 0.4743,
+      "step": 19170
+    },
+    {
+      "epoch": 51.14666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002995404813804772,
+      "loss": 0.4845,
+      "step": 19180
+    },
+    {
+      "epoch": 51.17333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029953998948418066,
+      "loss": 0.4738,
+      "step": 19190
+    },
+    {
+      "epoch": 51.2,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029953949732515163,
+      "loss": 0.4724,
+      "step": 19200
+    },
+    {
+      "epoch": 51.22666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029953900490339093,
+      "loss": 0.4707,
+      "step": 19210
+    },
+    {
+      "epoch": 51.25333333333333,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002995385122188995,
+      "loss": 0.4706,
+      "step": 19220
+    },
+    {
+      "epoch": 51.28,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002995380192716781,
+      "loss": 0.4839,
+      "step": 19230
+    },
+    {
+      "epoch": 51.306666666666665,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029953752606172766,
+      "loss": 0.4646,
+      "step": 19240
+    },
+    {
+      "epoch": 51.333333333333336,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002995370325890491,
+      "loss": 0.4808,
+      "step": 19250
+    },
+    {
+      "epoch": 51.36,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002995365388536431,
+      "loss": 0.4835,
+      "step": 19260
+    },
+    {
+      "epoch": 51.38666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029953604485551073,
+      "loss": 0.4741,
+      "step": 19270
+    },
+    {
+      "epoch": 51.413333333333334,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0002995355505946528,
+      "loss": 0.478,
+      "step": 19280
+    },
+    {
+      "epoch": 51.44,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002995350560710701,
+      "loss": 0.4813,
+      "step": 19290
+    },
+    {
+      "epoch": 51.46666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002995345612847636,
+      "loss": 0.4909,
+      "step": 19300
+    },
+    {
+      "epoch": 51.49333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029953406623573414,
+      "loss": 0.4821,
+      "step": 19310
+    },
+    {
+      "epoch": 51.52,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002995335709239825,
+      "loss": 0.4767,
+      "step": 19320
+    },
+    {
+      "epoch": 51.54666666666667,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.00029953307534950967,
+      "loss": 0.4732,
+      "step": 19330
+    },
+    {
+      "epoch": 51.57333333333333,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029953257951231653,
+      "loss": 0.4726,
+      "step": 19340
+    },
+    {
+      "epoch": 51.6,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002995320834124038,
+      "loss": 0.4667,
+      "step": 19350
+    },
+    {
+      "epoch": 51.626666666666665,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029953158704977247,
+      "loss": 0.4532,
+      "step": 19360
+    },
+    {
+      "epoch": 51.653333333333336,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029953109042442344,
+      "loss": 0.4729,
+      "step": 19370
+    },
+    {
+      "epoch": 51.68,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029953059353635754,
+      "loss": 0.464,
+      "step": 19380
+    },
+    {
+      "epoch": 51.70666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029953009638557563,
+      "loss": 0.4647,
+      "step": 19390
+    },
+    {
+      "epoch": 51.733333333333334,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029952959897207857,
+      "loss": 0.4839,
+      "step": 19400
+    },
+    {
+      "epoch": 51.76,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029952910129586724,
+      "loss": 0.4847,
+      "step": 19410
+    },
+    {
+      "epoch": 51.78666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029952860335694256,
+      "loss": 0.4835,
+      "step": 19420
+    },
+    {
+      "epoch": 51.81333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029952810515530533,
+      "loss": 0.4688,
+      "step": 19430
+    },
+    {
+      "epoch": 51.84,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002995276066909565,
+      "loss": 0.4757,
+      "step": 19440
+    },
+    {
+      "epoch": 51.86666666666667,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002995271079638969,
+      "loss": 0.4742,
+      "step": 19450
+    },
+    {
+      "epoch": 51.89333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029952660897412746,
+      "loss": 0.4646,
+      "step": 19460
+    },
+    {
+      "epoch": 51.92,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029952610972164897,
+      "loss": 0.4758,
+      "step": 19470
+    },
+    {
+      "epoch": 51.946666666666665,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002995256102064624,
+      "loss": 0.4672,
+      "step": 19480
+    },
+    {
+      "epoch": 51.973333333333336,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029952511042856855,
+      "loss": 0.4792,
+      "step": 19490
+    },
+    {
+      "epoch": 52.0,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029952461038796827,
+      "loss": 0.4684,
+      "step": 19500
+    },
+    {
+      "epoch": 52.0,
+      "eval_loss": 0.4829648435115814,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5574,
+      "eval_samples_per_second": 1.516,
+      "eval_steps_per_second": 0.095,
+      "step": 19500
+    },
+    {
+      "epoch": 52.026666666666664,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002995241100846626,
+      "loss": 0.4875,
+      "step": 19510
+    },
+    {
+      "epoch": 52.053333333333335,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00029952360951865224,
+      "loss": 0.494,
+      "step": 19520
+    },
+    {
+      "epoch": 52.08,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029952310868993823,
+      "loss": 0.4803,
+      "step": 19530
+    },
+    {
+      "epoch": 52.10666666666667,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029952260759852134,
+      "loss": 0.4758,
+      "step": 19540
+    },
+    {
+      "epoch": 52.13333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029952210624440247,
+      "loss": 0.4775,
+      "step": 19550
+    },
+    {
+      "epoch": 52.16,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0002995216046275825,
+      "loss": 0.4816,
+      "step": 19560
+    },
+    {
+      "epoch": 52.18666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002995211027480623,
+      "loss": 0.4742,
+      "step": 19570
+    },
+    {
+      "epoch": 52.21333333333333,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029952060060584284,
+      "loss": 0.4687,
+      "step": 19580
+    },
+    {
+      "epoch": 52.24,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002995200982009249,
+      "loss": 0.4729,
+      "step": 19590
+    },
+    {
+      "epoch": 52.266666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029951959553330934,
+      "loss": 0.476,
+      "step": 19600
+    },
+    {
+      "epoch": 52.29333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029951909260299715,
+      "loss": 0.4701,
+      "step": 19610
+    },
+    {
+      "epoch": 52.32,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029951858940998916,
+      "loss": 0.474,
+      "step": 19620
+    },
+    {
+      "epoch": 52.346666666666664,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029951808595428625,
+      "loss": 0.4828,
+      "step": 19630
+    },
+    {
+      "epoch": 52.373333333333335,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029951758223588934,
+      "loss": 0.4786,
+      "step": 19640
+    },
+    {
+      "epoch": 52.4,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00029951707825479925,
+      "loss": 0.4749,
+      "step": 19650
+    },
+    {
+      "epoch": 52.42666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029951657401101694,
+      "loss": 0.4814,
+      "step": 19660
+    },
+    {
+      "epoch": 52.45333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029951606950454323,
+      "loss": 0.483,
+      "step": 19670
+    },
+    {
+      "epoch": 52.48,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029951556473537905,
+      "loss": 0.4897,
+      "step": 19680
+    },
+    {
+      "epoch": 52.50666666666667,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029951505970352525,
+      "loss": 0.4798,
+      "step": 19690
+    },
+    {
+      "epoch": 52.53333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029951455440898277,
+      "loss": 0.4734,
+      "step": 19700
+    },
+    {
+      "epoch": 52.56,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002995140488517524,
+      "loss": 0.4747,
+      "step": 19710
+    },
+    {
+      "epoch": 52.586666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029951354303183516,
+      "loss": 0.47,
+      "step": 19720
+    },
+    {
+      "epoch": 52.61333333333333,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029951303694923187,
+      "loss": 0.4562,
+      "step": 19730
+    },
+    {
+      "epoch": 52.64,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002995125306039434,
+      "loss": 0.4611,
+      "step": 19740
+    },
+    {
+      "epoch": 52.666666666666664,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029951202399597067,
+      "loss": 0.4764,
+      "step": 19750
+    },
+    {
+      "epoch": 52.693333333333335,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0002995115171253146,
+      "loss": 0.459,
+      "step": 19760
+    },
+    {
+      "epoch": 52.72,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000299511009991976,
+      "loss": 0.4761,
+      "step": 19770
+    },
+    {
+      "epoch": 52.74666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002995105025959558,
+      "loss": 0.4807,
+      "step": 19780
+    },
+    {
+      "epoch": 52.77333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002995099949372549,
+      "loss": 0.4885,
+      "step": 19790
+    },
+    {
+      "epoch": 52.8,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002995094870158742,
+      "loss": 0.4751,
+      "step": 19800
+    },
+    {
+      "epoch": 52.82666666666667,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029950897883181457,
+      "loss": 0.47,
+      "step": 19810
+    },
+    {
+      "epoch": 52.85333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002995084703850769,
+      "loss": 0.4788,
+      "step": 19820
+    },
+    {
+      "epoch": 52.88,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029950796167566214,
+      "loss": 0.4669,
+      "step": 19830
+    },
+    {
+      "epoch": 52.906666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002995074527035711,
+      "loss": 0.4696,
+      "step": 19840
+    },
+    {
+      "epoch": 52.93333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029950694346880477,
+      "loss": 0.472,
+      "step": 19850
+    },
+    {
+      "epoch": 52.96,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002995064339713639,
+      "loss": 0.4686,
+      "step": 19860
+    },
+    {
+      "epoch": 52.986666666666665,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002995059242112495,
+      "loss": 0.4825,
+      "step": 19870
+    },
+    {
+      "epoch": 53.0,
+      "eval_loss": 0.4844491183757782,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1674,
+      "eval_samples_per_second": 1.574,
+      "eval_steps_per_second": 0.098,
+      "step": 19875
+    },
+    {
+      "epoch": 53.013333333333335,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029950541418846254,
+      "loss": 0.4711,
+      "step": 19880
+    },
+    {
+      "epoch": 53.04,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.0002995049039030037,
+      "loss": 0.4956,
+      "step": 19890
+    },
+    {
+      "epoch": 53.06666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029950439335487403,
+      "loss": 0.4834,
+      "step": 19900
+    },
+    {
+      "epoch": 53.093333333333334,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002995038825440744,
+      "loss": 0.4783,
+      "step": 19910
+    },
+    {
+      "epoch": 53.12,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029950337147060564,
+      "loss": 0.475,
+      "step": 19920
+    },
+    {
+      "epoch": 53.14666666666667,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029950286013446874,
+      "loss": 0.4838,
+      "step": 19930
+    },
+    {
+      "epoch": 53.17333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002995023485356646,
+      "loss": 0.4739,
+      "step": 19940
+    },
+    {
+      "epoch": 53.2,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029950183667419404,
+      "loss": 0.4727,
+      "step": 19950
+    },
+    {
+      "epoch": 53.22666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.000299501324550058,
+      "loss": 0.4705,
+      "step": 19960
+    },
+    {
+      "epoch": 53.25333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002995008121632574,
+      "loss": 0.47,
+      "step": 19970
+    },
+    {
+      "epoch": 53.28,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002995002995137931,
+      "loss": 0.4838,
+      "step": 19980
+    },
+    {
+      "epoch": 53.306666666666665,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002994997866016661,
+      "loss": 0.4646,
+      "step": 19990
+    },
+    {
+      "epoch": 53.333333333333336,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002994992734268771,
+      "loss": 0.4803,
+      "step": 20000
+    },
+    {
+      "epoch": 53.36,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002994987599894272,
+      "loss": 0.4829,
+      "step": 20010
+    },
+    {
+      "epoch": 53.38666666666666,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002994982462893172,
+      "loss": 0.4741,
+      "step": 20020
+    },
+    {
+      "epoch": 53.413333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029949773232654806,
+      "loss": 0.4777,
+      "step": 20030
+    },
+    {
+      "epoch": 53.44,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029949721810112063,
+      "loss": 0.4813,
+      "step": 20040
+    },
+    {
+      "epoch": 53.46666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029949670361303584,
+      "loss": 0.4914,
+      "step": 20050
+    },
+    {
+      "epoch": 53.49333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00029949618886229457,
+      "loss": 0.4828,
+      "step": 20060
+    },
+    {
+      "epoch": 53.52,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002994956738488978,
+      "loss": 0.4765,
+      "step": 20070
+    },
+    {
+      "epoch": 53.54666666666667,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002994951585728463,
+      "loss": 0.4729,
+      "step": 20080
+    },
+    {
+      "epoch": 53.57333333333333,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0002994946430341411,
+      "loss": 0.4725,
+      "step": 20090
+    },
+    {
+      "epoch": 53.6,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029949412723278304,
+      "loss": 0.4656,
+      "step": 20100
+    },
+    {
+      "epoch": 53.626666666666665,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029949361116877305,
+      "loss": 0.4528,
+      "step": 20110
+    },
+    {
+      "epoch": 53.653333333333336,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002994930948421121,
+      "loss": 0.4729,
+      "step": 20120
+    },
+    {
+      "epoch": 53.68,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029949257825280095,
+      "loss": 0.464,
+      "step": 20130
+    },
+    {
+      "epoch": 53.70666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002994920614008406,
+      "loss": 0.4649,
+      "step": 20140
+    },
+    {
+      "epoch": 53.733333333333334,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0002994915442862319,
+      "loss": 0.4834,
+      "step": 20150
+    },
+    {
+      "epoch": 53.76,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002994910269089759,
+      "loss": 0.4847,
+      "step": 20160
+    },
+    {
+      "epoch": 53.78666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002994905092690734,
+      "loss": 0.4835,
+      "step": 20170
+    },
+    {
+      "epoch": 53.81333333333333,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002994899913665253,
+      "loss": 0.4685,
+      "step": 20180
+    },
+    {
+      "epoch": 53.84,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029948947320133245,
+      "loss": 0.4747,
+      "step": 20190
+    },
+    {
+      "epoch": 53.86666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029948895477349593,
+      "loss": 0.4737,
+      "step": 20200
+    },
+    {
+      "epoch": 53.89333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002994884360830166,
+      "loss": 0.4646,
+      "step": 20210
+    },
+    {
+      "epoch": 53.92,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002994879171298953,
+      "loss": 0.475,
+      "step": 20220
+    },
+    {
+      "epoch": 53.946666666666665,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029948739791413293,
+      "loss": 0.4669,
+      "step": 20230
+    },
+    {
+      "epoch": 53.973333333333336,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029948687843573047,
+      "loss": 0.4792,
+      "step": 20240
+    },
+    {
+      "epoch": 54.0,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002994863586946888,
+      "loss": 0.4675,
+      "step": 20250
+    },
+    {
+      "epoch": 54.0,
+      "eval_loss": 0.48338034749031067,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0245,
+      "eval_samples_per_second": 1.596,
+      "eval_steps_per_second": 0.1,
+      "step": 20250
+    },
+    {
+      "epoch": 54.026666666666664,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002994858386910089,
+      "loss": 0.4889,
+      "step": 20260
+    },
+    {
+      "epoch": 54.053333333333335,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002994853184246916,
+      "loss": 0.4937,
+      "step": 20270
+    },
+    {
+      "epoch": 54.08,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029948479789573785,
+      "loss": 0.4812,
+      "step": 20280
+    },
+    {
+      "epoch": 54.10666666666667,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029948427710414853,
+      "loss": 0.4758,
+      "step": 20290
+    },
+    {
+      "epoch": 54.13333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002994837560499246,
+      "loss": 0.4774,
+      "step": 20300
+    },
+    {
+      "epoch": 54.16,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.000299483234733067,
+      "loss": 0.4816,
+      "step": 20310
+    },
+    {
+      "epoch": 54.18666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029948271315357656,
+      "loss": 0.474,
+      "step": 20320
+    },
+    {
+      "epoch": 54.21333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002994821913114542,
+      "loss": 0.4682,
+      "step": 20330
+    },
+    {
+      "epoch": 54.24,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029948166920670095,
+      "loss": 0.4729,
+      "step": 20340
+    },
+    {
+      "epoch": 54.266666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002994811468393176,
+      "loss": 0.4773,
+      "step": 20350
+    },
+    {
+      "epoch": 54.29333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029948062420930515,
+      "loss": 0.4703,
+      "step": 20360
+    },
+    {
+      "epoch": 54.32,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002994801013166645,
+      "loss": 0.4741,
+      "step": 20370
+    },
+    {
+      "epoch": 54.346666666666664,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029947957816139654,
+      "loss": 0.4822,
+      "step": 20380
+    },
+    {
+      "epoch": 54.373333333333335,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002994790547435022,
+      "loss": 0.4783,
+      "step": 20390
+    },
+    {
+      "epoch": 54.4,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029947853106298243,
+      "loss": 0.475,
+      "step": 20400
+    },
+    {
+      "epoch": 54.42666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002994780071198381,
+      "loss": 0.4812,
+      "step": 20410
+    },
+    {
+      "epoch": 54.45333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002994774829140702,
+      "loss": 0.4833,
+      "step": 20420
+    },
+    {
+      "epoch": 54.48,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029947695844567956,
+      "loss": 0.4894,
+      "step": 20430
+    },
+    {
+      "epoch": 54.50666666666667,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029947643371466717,
+      "loss": 0.4791,
+      "step": 20440
+    },
+    {
+      "epoch": 54.53333333333333,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029947590872103397,
+      "loss": 0.4729,
+      "step": 20450
+    },
+    {
+      "epoch": 54.56,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002994753834647808,
+      "loss": 0.473,
+      "step": 20460
+    },
+    {
+      "epoch": 54.586666666666666,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002994748579459086,
+      "loss": 0.4698,
+      "step": 20470
+    },
+    {
+      "epoch": 54.61333333333333,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029947433216441835,
+      "loss": 0.4553,
+      "step": 20480
+    },
+    {
+      "epoch": 54.64,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029947380612031094,
+      "loss": 0.4606,
+      "step": 20490
+    },
+    {
+      "epoch": 54.666666666666664,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029947327981358736,
+      "loss": 0.4757,
+      "step": 20500
+    },
+    {
+      "epoch": 54.693333333333335,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002994727532442484,
+      "loss": 0.4592,
+      "step": 20510
+    },
+    {
+      "epoch": 54.72,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002994722264122951,
+      "loss": 0.476,
+      "step": 20520
+    },
+    {
+      "epoch": 54.74666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002994716993177283,
+      "loss": 0.4808,
+      "step": 20530
+    },
+    {
+      "epoch": 54.77333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029947117196054904,
+      "loss": 0.4881,
+      "step": 20540
+    },
+    {
+      "epoch": 54.8,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029947064434075814,
+      "loss": 0.4746,
+      "step": 20550
+    },
+    {
+      "epoch": 54.82666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002994701164583565,
+      "loss": 0.4696,
+      "step": 20560
+    },
+    {
+      "epoch": 54.85333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002994695883133452,
+      "loss": 0.4786,
+      "step": 20570
+    },
+    {
+      "epoch": 54.88,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002994690599057251,
+      "loss": 0.4663,
+      "step": 20580
+    },
+    {
+      "epoch": 54.906666666666666,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.000299468531235497,
+      "loss": 0.4689,
+      "step": 20590
+    },
+    {
+      "epoch": 54.93333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.000299468002302662,
+      "loss": 0.4707,
+      "step": 20600
+    },
+    {
+      "epoch": 54.96,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0002994674731072209,
+      "loss": 0.469,
+      "step": 20610
+    },
+    {
+      "epoch": 54.986666666666665,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029946694364917483,
+      "loss": 0.4826,
+      "step": 20620
+    },
+    {
+      "epoch": 55.0,
+      "eval_loss": 0.48227477073669434,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2026,
+      "eval_samples_per_second": 1.568,
+      "eval_steps_per_second": 0.098,
+      "step": 20625
+    },
+    {
+      "epoch": 55.013333333333335,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002994664139285245,
+      "loss": 0.4724,
+      "step": 20630
+    },
+    {
+      "epoch": 55.04,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00029946588394527094,
+      "loss": 0.4966,
+      "step": 20640
+    },
+    {
+      "epoch": 55.06666666666667,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0002994653536994151,
+      "loss": 0.4838,
+      "step": 20650
+    },
+    {
+      "epoch": 55.093333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029946482319095783,
+      "loss": 0.4785,
+      "step": 20660
+    },
+    {
+      "epoch": 55.12,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029946429241990013,
+      "loss": 0.4745,
+      "step": 20670
+    },
+    {
+      "epoch": 55.14666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029946376138624293,
+      "loss": 0.4834,
+      "step": 20680
+    },
+    {
+      "epoch": 55.17333333333333,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029946323008998713,
+      "loss": 0.4745,
+      "step": 20690
+    },
+    {
+      "epoch": 55.2,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029946269853113367,
+      "loss": 0.472,
+      "step": 20700
+    },
+    {
+      "epoch": 55.22666666666667,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002994621667096835,
+      "loss": 0.4699,
+      "step": 20710
+    },
+    {
+      "epoch": 55.25333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002994616346256375,
+      "loss": 0.4695,
+      "step": 20720
+    },
+    {
+      "epoch": 55.28,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002994611022789967,
+      "loss": 0.4839,
+      "step": 20730
+    },
+    {
+      "epoch": 55.306666666666665,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029946056966976203,
+      "loss": 0.465,
+      "step": 20740
+    },
+    {
+      "epoch": 55.333333333333336,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002994600367979343,
+      "loss": 0.4798,
+      "step": 20750
+    },
+    {
+      "epoch": 55.36,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002994595036635146,
+      "loss": 0.4819,
+      "step": 20760
+    },
+    {
+      "epoch": 55.38666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002994589702665038,
+      "loss": 0.4736,
+      "step": 20770
+    },
+    {
+      "epoch": 55.413333333333334,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002994584366069028,
+      "loss": 0.478,
+      "step": 20780
+    },
+    {
+      "epoch": 55.44,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002994579026847126,
+      "loss": 0.4808,
+      "step": 20790
+    },
+    {
+      "epoch": 55.46666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002994573684999341,
+      "loss": 0.4905,
+      "step": 20800
+    },
+    {
+      "epoch": 55.49333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002994568340525682,
+      "loss": 0.4821,
+      "step": 20810
+    },
+    {
+      "epoch": 55.52,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.000299456299342616,
+      "loss": 0.4765,
+      "step": 20820
+    },
+    {
+      "epoch": 55.54666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029945576437007826,
+      "loss": 0.473,
+      "step": 20830
+    },
+    {
+      "epoch": 55.57333333333333,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0002994552291349559,
+      "loss": 0.4722,
+      "step": 20840
+    },
+    {
+      "epoch": 55.6,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0002994546936372501,
+      "loss": 0.466,
+      "step": 20850
+    },
+    {
+      "epoch": 55.626666666666665,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029945415787696155,
+      "loss": 0.4526,
+      "step": 20860
+    },
+    {
+      "epoch": 55.653333333333336,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029945362185409137,
+      "loss": 0.4721,
+      "step": 20870
+    },
+    {
+      "epoch": 55.68,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029945308556864037,
+      "loss": 0.4638,
+      "step": 20880
+    },
+    {
+      "epoch": 55.70666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002994525490206095,
+      "loss": 0.464,
+      "step": 20890
+    },
+    {
+      "epoch": 55.733333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002994520122099998,
+      "loss": 0.4832,
+      "step": 20900
+    },
+    {
+      "epoch": 55.76,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002994514751368121,
+      "loss": 0.4849,
+      "step": 20910
+    },
+    {
+      "epoch": 55.78666666666667,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002994509378010475,
+      "loss": 0.4829,
+      "step": 20920
+    },
+    {
+      "epoch": 55.81333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002994504002027068,
+      "loss": 0.4678,
+      "step": 20930
+    },
+    {
+      "epoch": 55.84,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.000299449862341791,
+      "loss": 0.4743,
+      "step": 20940
+    },
+    {
+      "epoch": 55.86666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029944932421830105,
+      "loss": 0.474,
+      "step": 20950
+    },
+    {
+      "epoch": 55.89333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029944878583223783,
+      "loss": 0.4645,
+      "step": 20960
+    },
+    {
+      "epoch": 55.92,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002994482471836024,
+      "loss": 0.4752,
+      "step": 20970
+    },
+    {
+      "epoch": 55.946666666666665,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002994477082723956,
+      "loss": 0.4673,
+      "step": 20980
+    },
+    {
+      "epoch": 55.973333333333336,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029944716909861846,
+      "loss": 0.4788,
+      "step": 20990
+    },
+    {
+      "epoch": 56.0,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029944662966227185,
+      "loss": 0.4679,
+      "step": 21000
+    },
+    {
+      "epoch": 56.0,
+      "eval_loss": 0.4822176694869995,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3155,
+      "eval_samples_per_second": 1.551,
+      "eval_steps_per_second": 0.097,
+      "step": 21000
+    },
+    {
+      "epoch": 56.026666666666664,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002994460899633568,
+      "loss": 0.4876,
+      "step": 21010
+    },
+    {
+      "epoch": 56.053333333333335,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.00029944555000187414,
+      "loss": 0.493,
+      "step": 21020
+    },
+    {
+      "epoch": 56.08,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029944500977782494,
+      "loss": 0.4808,
+      "step": 21030
+    },
+    {
+      "epoch": 56.10666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029944446929121006,
+      "loss": 0.4754,
+      "step": 21040
+    },
+    {
+      "epoch": 56.13333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002994439285420306,
+      "loss": 0.4769,
+      "step": 21050
+    },
+    {
+      "epoch": 56.16,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002994433875302873,
+      "loss": 0.4814,
+      "step": 21060
+    },
+    {
+      "epoch": 56.18666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002994428462559813,
+      "loss": 0.473,
+      "step": 21070
+    },
+    {
+      "epoch": 56.21333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002994423047191134,
+      "loss": 0.4678,
+      "step": 21080
+    },
+    {
+      "epoch": 56.24,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029944176291968463,
+      "loss": 0.4719,
+      "step": 21090
+    },
+    {
+      "epoch": 56.266666666666666,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002994412208576959,
+      "loss": 0.4762,
+      "step": 21100
+    },
+    {
+      "epoch": 56.29333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002994406785331482,
+      "loss": 0.4699,
+      "step": 21110
+    },
+    {
+      "epoch": 56.32,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002994401359460425,
+      "loss": 0.4745,
+      "step": 21120
+    },
+    {
+      "epoch": 56.346666666666664,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002994395930963797,
+      "loss": 0.4827,
+      "step": 21130
+    },
+    {
+      "epoch": 56.373333333333335,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002994390499841608,
+      "loss": 0.4779,
+      "step": 21140
+    },
+    {
+      "epoch": 56.4,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002994385066093867,
+      "loss": 0.4743,
+      "step": 21150
+    },
+    {
+      "epoch": 56.42666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029943796297205843,
+      "loss": 0.4808,
+      "step": 21160
+    },
+    {
+      "epoch": 56.45333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029943741907217686,
+      "loss": 0.483,
+      "step": 21170
+    },
+    {
+      "epoch": 56.48,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029943687490974305,
+      "loss": 0.4892,
+      "step": 21180
+    },
+    {
+      "epoch": 56.50666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002994363304847578,
+      "loss": 0.4786,
+      "step": 21190
+    },
+    {
+      "epoch": 56.53333333333333,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029943578579722224,
+      "loss": 0.4723,
+      "step": 21200
+    },
+    {
+      "epoch": 56.56,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029943524084713723,
+      "loss": 0.4739,
+      "step": 21210
+    },
+    {
+      "epoch": 56.586666666666666,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.00029943469563450373,
+      "loss": 0.4698,
+      "step": 21220
+    },
+    {
+      "epoch": 56.61333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029943415015932275,
+      "loss": 0.4556,
+      "step": 21230
+    },
+    {
+      "epoch": 56.64,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002994336044215952,
+      "loss": 0.4601,
+      "step": 21240
+    },
+    {
+      "epoch": 56.666666666666664,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000299433058421322,
+      "loss": 0.4757,
+      "step": 21250
+    },
+    {
+      "epoch": 56.693333333333335,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002994325121585042,
+      "loss": 0.4589,
+      "step": 21260
+    },
+    {
+      "epoch": 56.72,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029943196563314274,
+      "loss": 0.4759,
+      "step": 21270
+    },
+    {
+      "epoch": 56.74666666666667,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.00029943141884523856,
+      "loss": 0.4803,
+      "step": 21280
+    },
+    {
+      "epoch": 56.77333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002994308717947926,
+      "loss": 0.488,
+      "step": 21290
+    },
+    {
+      "epoch": 56.8,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002994303244818058,
+      "loss": 0.4749,
+      "step": 21300
+    },
+    {
+      "epoch": 56.82666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002994297769062792,
+      "loss": 0.4693,
+      "step": 21310
+    },
+    {
+      "epoch": 56.85333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029942922906821376,
+      "loss": 0.479,
+      "step": 21320
+    },
+    {
+      "epoch": 56.88,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002994286809676104,
+      "loss": 0.4673,
+      "step": 21330
+    },
+    {
+      "epoch": 56.906666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029942813260447,
+      "loss": 0.4684,
+      "step": 21340
+    },
+    {
+      "epoch": 56.93333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002994275839787937,
+      "loss": 0.4706,
+      "step": 21350
+    },
+    {
+      "epoch": 56.96,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002994270350905824,
+      "loss": 0.4689,
+      "step": 21360
+    },
+    {
+      "epoch": 56.986666666666665,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029942648593983697,
+      "loss": 0.4821,
+      "step": 21370
+    },
+    {
+      "epoch": 57.0,
+      "eval_loss": 0.4817401170730591,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8223,
+      "eval_samples_per_second": 1.629,
+      "eval_steps_per_second": 0.102,
+      "step": 21375
+    },
+    {
+      "epoch": 57.013333333333335,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029942593652655853,
+      "loss": 0.4709,
+      "step": 21380
+    },
+    {
+      "epoch": 57.04,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002994253868507479,
+      "loss": 0.4956,
+      "step": 21390
+    },
+    {
+      "epoch": 57.06666666666667,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002994248369124061,
+      "loss": 0.4832,
+      "step": 21400
+    },
+    {
+      "epoch": 57.093333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029942428671153414,
+      "loss": 0.4776,
+      "step": 21410
+    },
+    {
+      "epoch": 57.12,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029942373624813293,
+      "loss": 0.4744,
+      "step": 21420
+    },
+    {
+      "epoch": 57.14666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029942318552220344,
+      "loss": 0.4837,
+      "step": 21430
+    },
+    {
+      "epoch": 57.17333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002994226345337467,
+      "loss": 0.473,
+      "step": 21440
+    },
+    {
+      "epoch": 57.2,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029942208328276364,
+      "loss": 0.472,
+      "step": 21450
+    },
+    {
+      "epoch": 57.22666666666667,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002994215317692552,
+      "loss": 0.4694,
+      "step": 21460
+    },
+    {
+      "epoch": 57.25333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029942097999322237,
+      "loss": 0.4695,
+      "step": 21470
+    },
+    {
+      "epoch": 57.28,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002994204279546661,
+      "loss": 0.4827,
+      "step": 21480
+    },
+    {
+      "epoch": 57.306666666666665,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029941987565358744,
+      "loss": 0.4643,
+      "step": 21490
+    },
+    {
+      "epoch": 57.333333333333336,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029941932308998726,
+      "loss": 0.4799,
+      "step": 21500
+    },
+    {
+      "epoch": 57.36,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002994187702638666,
+      "loss": 0.4823,
+      "step": 21510
+    },
+    {
+      "epoch": 57.38666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029941821717522634,
+      "loss": 0.4732,
+      "step": 21520
+    },
+    {
+      "epoch": 57.413333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002994176638240676,
+      "loss": 0.4783,
+      "step": 21530
+    },
+    {
+      "epoch": 57.44,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002994171102103912,
+      "loss": 0.4808,
+      "step": 21540
+    },
+    {
+      "epoch": 57.46666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002994165563341982,
+      "loss": 0.4903,
+      "step": 21550
+    },
+    {
+      "epoch": 57.49333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029941600219548955,
+      "loss": 0.4812,
+      "step": 21560
+    },
+    {
+      "epoch": 57.52,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00029941544779426627,
+      "loss": 0.476,
+      "step": 21570
+    },
+    {
+      "epoch": 57.54666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029941489313052924,
+      "loss": 0.4725,
+      "step": 21580
+    },
+    {
+      "epoch": 57.57333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029941433820427954,
+      "loss": 0.4722,
+      "step": 21590
+    },
+    {
+      "epoch": 57.6,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029941378301551804,
+      "loss": 0.4654,
+      "step": 21600
+    },
+    {
+      "epoch": 57.626666666666665,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029941322756424577,
+      "loss": 0.4532,
+      "step": 21610
+    },
+    {
+      "epoch": 57.653333333333336,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002994126718504637,
+      "loss": 0.4716,
+      "step": 21620
+    },
+    {
+      "epoch": 57.68,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0002994121158741728,
+      "loss": 0.4638,
+      "step": 21630
+    },
+    {
+      "epoch": 57.70666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002994115596353741,
+      "loss": 0.4642,
+      "step": 21640
+    },
+    {
+      "epoch": 57.733333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002994110031340685,
+      "loss": 0.4833,
+      "step": 21650
+    },
+    {
+      "epoch": 57.76,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.000299410446370257,
+      "loss": 0.485,
+      "step": 21660
+    },
+    {
+      "epoch": 57.78666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002994098893439406,
+      "loss": 0.4828,
+      "step": 21670
+    },
+    {
+      "epoch": 57.81333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002994093320551203,
+      "loss": 0.4677,
+      "step": 21680
+    },
+    {
+      "epoch": 57.84,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000299408774503797,
+      "loss": 0.4738,
+      "step": 21690
+    },
+    {
+      "epoch": 57.86666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002994082166899717,
+      "loss": 0.4738,
+      "step": 21700
+    },
+    {
+      "epoch": 57.89333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002994076586136455,
+      "loss": 0.4642,
+      "step": 21710
+    },
+    {
+      "epoch": 57.92,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002994071002748192,
+      "loss": 0.4743,
+      "step": 21720
+    },
+    {
+      "epoch": 57.946666666666665,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029940654167349393,
+      "loss": 0.467,
+      "step": 21730
+    },
+    {
+      "epoch": 57.973333333333336,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002994059828096706,
+      "loss": 0.4781,
+      "step": 21740
+    },
+    {
+      "epoch": 58.0,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029940542368335016,
+      "loss": 0.4677,
+      "step": 21750
+    },
+    {
+      "epoch": 58.0,
+      "eval_loss": 0.482833594083786,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7365,
+      "eval_samples_per_second": 1.49,
+      "eval_steps_per_second": 0.093,
+      "step": 21750
+    },
+    {
+      "epoch": 58.026666666666664,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.00029940486429453364,
+      "loss": 0.4869,
+      "step": 21760
+    },
+    {
+      "epoch": 58.053333333333335,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002994043046432221,
+      "loss": 0.4935,
+      "step": 21770
+    },
+    {
+      "epoch": 58.08,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029940374472941627,
+      "loss": 0.4806,
+      "step": 21780
+    },
+    {
+      "epoch": 58.10666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002994031845531174,
+      "loss": 0.4761,
+      "step": 21790
+    },
+    {
+      "epoch": 58.13333333333333,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0002994026241143264,
+      "loss": 0.4771,
+      "step": 21800
+    },
+    {
+      "epoch": 58.16,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029940206341304423,
+      "loss": 0.4811,
+      "step": 21810
+    },
+    {
+      "epoch": 58.18666666666667,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029940150244927183,
+      "loss": 0.4731,
+      "step": 21820
+    },
+    {
+      "epoch": 58.21333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029940094122301024,
+      "loss": 0.4678,
+      "step": 21830
+    },
+    {
+      "epoch": 58.24,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0002994003797342605,
+      "loss": 0.4721,
+      "step": 21840
+    },
+    {
+      "epoch": 58.266666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002993998179830235,
+      "loss": 0.4767,
+      "step": 21850
+    },
+    {
+      "epoch": 58.29333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029939925596930017,
+      "loss": 0.4705,
+      "step": 21860
+    },
+    {
+      "epoch": 58.32,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002993986936930917,
+      "loss": 0.4732,
+      "step": 21870
+    },
+    {
+      "epoch": 58.346666666666664,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029939813115439896,
+      "loss": 0.4823,
+      "step": 21880
+    },
+    {
+      "epoch": 58.373333333333335,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002993975683532229,
+      "loss": 0.4786,
+      "step": 21890
+    },
+    {
+      "epoch": 58.4,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002993970052895646,
+      "loss": 0.4743,
+      "step": 21900
+    },
+    {
+      "epoch": 58.42666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029939644196342495,
+      "loss": 0.4806,
+      "step": 21910
+    },
+    {
+      "epoch": 58.45333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.000299395878374805,
+      "loss": 0.4826,
+      "step": 21920
+    },
+    {
+      "epoch": 58.48,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029939531452370573,
+      "loss": 0.489,
+      "step": 21930
+    },
+    {
+      "epoch": 58.50666666666667,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002993947504101282,
+      "loss": 0.479,
+      "step": 21940
+    },
+    {
+      "epoch": 58.53333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029939418603407326,
+      "loss": 0.4727,
+      "step": 21950
+    },
+    {
+      "epoch": 58.56,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0002993936213955421,
+      "loss": 0.4739,
+      "step": 21960
+    },
+    {
+      "epoch": 58.586666666666666,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029939305649453546,
+      "loss": 0.4694,
+      "step": 21970
+    },
+    {
+      "epoch": 58.61333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002993924913310545,
+      "loss": 0.4546,
+      "step": 21980
+    },
+    {
+      "epoch": 58.64,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002993919259051002,
+      "loss": 0.4597,
+      "step": 21990
+    },
+    {
+      "epoch": 58.666666666666664,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029939136021667347,
+      "loss": 0.476,
+      "step": 22000
+    },
+    {
+      "epoch": 58.693333333333335,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0002993907942657754,
+      "loss": 0.4589,
+      "step": 22010
+    },
+    {
+      "epoch": 58.72,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002993902280524069,
+      "loss": 0.4755,
+      "step": 22020
+    },
+    {
+      "epoch": 58.74666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029938966157656903,
+      "loss": 0.4802,
+      "step": 22030
+    },
+    {
+      "epoch": 58.77333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002993890948382628,
+      "loss": 0.4877,
+      "step": 22040
+    },
+    {
+      "epoch": 58.8,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029938852783748914,
+      "loss": 0.4744,
+      "step": 22050
+    },
+    {
+      "epoch": 58.82666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002993879605742491,
+      "loss": 0.4692,
+      "step": 22060
+    },
+    {
+      "epoch": 58.85333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029938739304854366,
+      "loss": 0.4784,
+      "step": 22070
+    },
+    {
+      "epoch": 58.88,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002993868252603738,
+      "loss": 0.4663,
+      "step": 22080
+    },
+    {
+      "epoch": 58.906666666666666,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002993862572097405,
+      "loss": 0.469,
+      "step": 22090
+    },
+    {
+      "epoch": 58.93333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029938568889664486,
+      "loss": 0.4704,
+      "step": 22100
+    },
+    {
+      "epoch": 58.96,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002993851203210877,
+      "loss": 0.4678,
+      "step": 22110
+    },
+    {
+      "epoch": 58.986666666666665,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029938455148307023,
+      "loss": 0.4824,
+      "step": 22120
+    },
+    {
+      "epoch": 59.0,
+      "eval_loss": 0.48143550753593445,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9892,
+      "eval_samples_per_second": 1.456,
+      "eval_steps_per_second": 0.091,
+      "step": 22125
+    },
+    {
+      "epoch": 59.013333333333335,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029938398238259326,
+      "loss": 0.4708,
+      "step": 22130
+    },
+    {
+      "epoch": 59.04,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029938341301965793,
+      "loss": 0.4954,
+      "step": 22140
+    },
+    {
+      "epoch": 59.06666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002993828433942651,
+      "loss": 0.4836,
+      "step": 22150
+    },
+    {
+      "epoch": 59.093333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.000299382273506416,
+      "loss": 0.4775,
+      "step": 22160
+    },
+    {
+      "epoch": 59.12,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002993817033561114,
+      "loss": 0.4737,
+      "step": 22170
+    },
+    {
+      "epoch": 59.14666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002993811329433524,
+      "loss": 0.4832,
+      "step": 22180
+    },
+    {
+      "epoch": 59.17333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029938056226813995,
+      "loss": 0.4728,
+      "step": 22190
+    },
+    {
+      "epoch": 59.2,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029937999133047513,
+      "loss": 0.4715,
+      "step": 22200
+    },
+    {
+      "epoch": 59.22666666666667,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002993794201303589,
+      "loss": 0.4693,
+      "step": 22210
+    },
+    {
+      "epoch": 59.25333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002993788486677923,
+      "loss": 0.4697,
+      "step": 22220
+    },
+    {
+      "epoch": 59.28,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029937827694277627,
+      "loss": 0.4821,
+      "step": 22230
+    },
+    {
+      "epoch": 59.306666666666665,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002993777049553118,
+      "loss": 0.4642,
+      "step": 22240
+    },
+    {
+      "epoch": 59.333333333333336,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029937713270540003,
+      "loss": 0.4797,
+      "step": 22250
+    },
+    {
+      "epoch": 59.36,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002993765601930418,
+      "loss": 0.4818,
+      "step": 22260
+    },
+    {
+      "epoch": 59.38666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029937598741823826,
+      "loss": 0.4734,
+      "step": 22270
+    },
+    {
+      "epoch": 59.413333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029937541438099035,
+      "loss": 0.4778,
+      "step": 22280
+    },
+    {
+      "epoch": 59.44,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002993748410812991,
+      "loss": 0.4805,
+      "step": 22290
+    },
+    {
+      "epoch": 59.46666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029937426751916544,
+      "loss": 0.4899,
+      "step": 22300
+    },
+    {
+      "epoch": 59.49333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002993736936945904,
+      "loss": 0.4816,
+      "step": 22310
+    },
+    {
+      "epoch": 59.52,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029937311960757506,
+      "loss": 0.4761,
+      "step": 22320
+    },
+    {
+      "epoch": 59.54666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002993725452581204,
+      "loss": 0.4726,
+      "step": 22330
+    },
+    {
+      "epoch": 59.57333333333333,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0002993719706462274,
+      "loss": 0.4709,
+      "step": 22340
+    },
+    {
+      "epoch": 59.6,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002993713957718971,
+      "loss": 0.4652,
+      "step": 22350
+    },
+    {
+      "epoch": 59.626666666666665,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002993708206351305,
+      "loss": 0.4518,
+      "step": 22360
+    },
+    {
+      "epoch": 59.653333333333336,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002993702452359286,
+      "loss": 0.4718,
+      "step": 22370
+    },
+    {
+      "epoch": 59.68,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002993696695742924,
+      "loss": 0.4637,
+      "step": 22380
+    },
+    {
+      "epoch": 59.70666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029936909365022295,
+      "loss": 0.4642,
+      "step": 22390
+    },
+    {
+      "epoch": 59.733333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029936851746372123,
+      "loss": 0.4835,
+      "step": 22400
+    },
+    {
+      "epoch": 59.76,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0002993679410147883,
+      "loss": 0.4845,
+      "step": 22410
+    },
+    {
+      "epoch": 59.78666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029936736430342514,
+      "loss": 0.4832,
+      "step": 22420
+    },
+    {
+      "epoch": 59.81333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002993667873296327,
+      "loss": 0.468,
+      "step": 22430
+    },
+    {
+      "epoch": 59.84,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002993662100934121,
+      "loss": 0.474,
+      "step": 22440
+    },
+    {
+      "epoch": 59.86666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029936563259476434,
+      "loss": 0.4738,
+      "step": 22450
+    },
+    {
+      "epoch": 59.89333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002993650548336903,
+      "loss": 0.4639,
+      "step": 22460
+    },
+    {
+      "epoch": 59.92,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002993644768101912,
+      "loss": 0.475,
+      "step": 22470
+    },
+    {
+      "epoch": 59.946666666666665,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029936389852426793,
+      "loss": 0.4671,
+      "step": 22480
+    },
+    {
+      "epoch": 59.973333333333336,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002993633199759215,
+      "loss": 0.4786,
+      "step": 22490
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002993627411651529,
+      "loss": 0.4681,
+      "step": 22500
+    },
+    {
+      "epoch": 60.0,
+      "eval_loss": 0.4817136228084564,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 8.9804,
+      "eval_samples_per_second": 1.782,
+      "eval_steps_per_second": 0.111,
+      "step": 22500
+    },
+    {
+      "epoch": 60.026666666666664,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002993621620919633,
+      "loss": 0.4871,
+      "step": 22510
+    },
+    {
+      "epoch": 60.053333333333335,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029936158275635364,
+      "loss": 0.4927,
+      "step": 22520
+    },
+    {
+      "epoch": 60.08,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029936100315832483,
+      "loss": 0.4794,
+      "step": 22530
+    },
+    {
+      "epoch": 60.10666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.000299360423297878,
+      "loss": 0.475,
+      "step": 22540
+    },
+    {
+      "epoch": 60.13333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002993598431750142,
+      "loss": 0.4765,
+      "step": 22550
+    },
+    {
+      "epoch": 60.16,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029935926278973433,
+      "loss": 0.4807,
+      "step": 22560
+    },
+    {
+      "epoch": 60.18666666666667,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002993586821420395,
+      "loss": 0.4733,
+      "step": 22570
+    },
+    {
+      "epoch": 60.21333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002993581012319307,
+      "loss": 0.4679,
+      "step": 22580
+    },
+    {
+      "epoch": 60.24,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.000299357520059409,
+      "loss": 0.4718,
+      "step": 22590
+    },
+    {
+      "epoch": 60.266666666666666,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029935693862447537,
+      "loss": 0.4757,
+      "step": 22600
+    },
+    {
+      "epoch": 60.29333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002993563569271308,
+      "loss": 0.4703,
+      "step": 22610
+    },
+    {
+      "epoch": 60.32,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029935577496737636,
+      "loss": 0.4741,
+      "step": 22620
+    },
+    {
+      "epoch": 60.346666666666664,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029935519274521303,
+      "loss": 0.4817,
+      "step": 22630
+    },
+    {
+      "epoch": 60.373333333333335,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029935461026064187,
+      "loss": 0.4779,
+      "step": 22640
+    },
+    {
+      "epoch": 60.4,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.000299354027513664,
+      "loss": 0.474,
+      "step": 22650
+    },
+    {
+      "epoch": 60.42666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002993534445042802,
+      "loss": 0.4799,
+      "step": 22660
+    },
+    {
+      "epoch": 60.45333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002993528612324917,
+      "loss": 0.483,
+      "step": 22670
+    },
+    {
+      "epoch": 60.48,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002993522776982995,
+      "loss": 0.4892,
+      "step": 22680
+    },
+    {
+      "epoch": 60.50666666666667,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029935169390170454,
+      "loss": 0.4786,
+      "step": 22690
+    },
+    {
+      "epoch": 60.53333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002993511098427079,
+      "loss": 0.4722,
+      "step": 22700
+    },
+    {
+      "epoch": 60.56,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002993505255213106,
+      "loss": 0.4728,
+      "step": 22710
+    },
+    {
+      "epoch": 60.586666666666666,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002993499409375137,
+      "loss": 0.4689,
+      "step": 22720
+    },
+    {
+      "epoch": 60.61333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029934935609131813,
+      "loss": 0.4551,
+      "step": 22730
+    },
+    {
+      "epoch": 60.64,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029934877098272504,
+      "loss": 0.4596,
+      "step": 22740
+    },
+    {
+      "epoch": 60.666666666666664,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029934818561173535,
+      "loss": 0.4755,
+      "step": 22750
+    },
+    {
+      "epoch": 60.693333333333335,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029934759997835014,
+      "loss": 0.4584,
+      "step": 22760
+    },
+    {
+      "epoch": 60.72,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029934701408257044,
+      "loss": 0.4758,
+      "step": 22770
+    },
+    {
+      "epoch": 60.74666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002993464279243973,
+      "loss": 0.4804,
+      "step": 22780
+    },
+    {
+      "epoch": 60.77333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002993458415038317,
+      "loss": 0.4879,
+      "step": 22790
+    },
+    {
+      "epoch": 60.8,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002993452548208747,
+      "loss": 0.4741,
+      "step": 22800
+    },
+    {
+      "epoch": 60.82666666666667,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029934466787552734,
+      "loss": 0.4689,
+      "step": 22810
+    },
+    {
+      "epoch": 60.85333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029934408066779064,
+      "loss": 0.4778,
+      "step": 22820
+    },
+    {
+      "epoch": 60.88,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029934349319766563,
+      "loss": 0.4657,
+      "step": 22830
+    },
+    {
+      "epoch": 60.906666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002993429054651533,
+      "loss": 0.4687,
+      "step": 22840
+    },
+    {
+      "epoch": 60.93333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029934231747025475,
+      "loss": 0.4699,
+      "step": 22850
+    },
+    {
+      "epoch": 60.96,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.000299341729212971,
+      "loss": 0.4681,
+      "step": 22860
+    },
+    {
+      "epoch": 60.986666666666665,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029934114069330304,
+      "loss": 0.4823,
+      "step": 22870
+    },
+    {
+      "epoch": 61.0,
+      "eval_loss": 0.48083022236824036,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2915,
+      "eval_samples_per_second": 1.555,
+      "eval_steps_per_second": 0.097,
+      "step": 22875
+    },
+    {
+      "epoch": 61.013333333333335,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029934055191125193,
+      "loss": 0.4713,
+      "step": 22880
+    },
+    {
+      "epoch": 61.04,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029933996286681875,
+      "loss": 0.496,
+      "step": 22890
+    },
+    {
+      "epoch": 61.06666666666667,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002993393735600045,
+      "loss": 0.483,
+      "step": 22900
+    },
+    {
+      "epoch": 61.093333333333334,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029933878399081016,
+      "loss": 0.4772,
+      "step": 22910
+    },
+    {
+      "epoch": 61.12,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002993381941592369,
+      "loss": 0.4739,
+      "step": 22920
+    },
+    {
+      "epoch": 61.14666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029933760406528553,
+      "loss": 0.4822,
+      "step": 22930
+    },
+    {
+      "epoch": 61.17333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002993370137089573,
+      "loss": 0.4729,
+      "step": 22940
+    },
+    {
+      "epoch": 61.2,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002993364230902532,
+      "loss": 0.4714,
+      "step": 22950
+    },
+    {
+      "epoch": 61.22666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002993358322091742,
+      "loss": 0.4688,
+      "step": 22960
+    },
+    {
+      "epoch": 61.25333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002993352410657214,
+      "loss": 0.4691,
+      "step": 22970
+    },
+    {
+      "epoch": 61.28,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002993346496598958,
+      "loss": 0.4828,
+      "step": 22980
+    },
+    {
+      "epoch": 61.306666666666665,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002993340579916985,
+      "loss": 0.464,
+      "step": 22990
+    },
+    {
+      "epoch": 61.333333333333336,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002993334660611305,
+      "loss": 0.4792,
+      "step": 23000
+    },
+    {
+      "epoch": 61.36,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002993328738681928,
+      "loss": 0.4809,
+      "step": 23010
+    },
+    {
+      "epoch": 61.38666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002993322814128865,
+      "loss": 0.4732,
+      "step": 23020
+    },
+    {
+      "epoch": 61.413333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002993316886952126,
+      "loss": 0.4774,
+      "step": 23030
+    },
+    {
+      "epoch": 61.44,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002993310957151722,
+      "loss": 0.4801,
+      "step": 23040
+    },
+    {
+      "epoch": 61.46666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029933050247276624,
+      "loss": 0.4894,
+      "step": 23050
+    },
+    {
+      "epoch": 61.49333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029932990896799585,
+      "loss": 0.4816,
+      "step": 23060
+    },
+    {
+      "epoch": 61.52,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.000299329315200862,
+      "loss": 0.4755,
+      "step": 23070
+    },
+    {
+      "epoch": 61.54666666666667,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002993287211713659,
+      "loss": 0.4724,
+      "step": 23080
+    },
+    {
+      "epoch": 61.57333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002993281268795084,
+      "loss": 0.4713,
+      "step": 23090
+    },
+    {
+      "epoch": 61.6,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002993275323252906,
+      "loss": 0.4655,
+      "step": 23100
+    },
+    {
+      "epoch": 61.626666666666665,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002993269375087136,
+      "loss": 0.4521,
+      "step": 23110
+    },
+    {
+      "epoch": 61.653333333333336,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002993263424297784,
+      "loss": 0.472,
+      "step": 23120
+    },
+    {
+      "epoch": 61.68,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002993257470884861,
+      "loss": 0.4629,
+      "step": 23130
+    },
+    {
+      "epoch": 61.70666666666666,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002993251514848376,
+      "loss": 0.4632,
+      "step": 23140
+    },
+    {
+      "epoch": 61.733333333333334,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029932455561883415,
+      "loss": 0.4831,
+      "step": 23150
+    },
+    {
+      "epoch": 61.76,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002993239594904766,
+      "loss": 0.4841,
+      "step": 23160
+    },
+    {
+      "epoch": 61.78666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002993233630997662,
+      "loss": 0.483,
+      "step": 23170
+    },
+    {
+      "epoch": 61.81333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029932276644670377,
+      "loss": 0.4675,
+      "step": 23180
+    },
+    {
+      "epoch": 61.84,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002993221695312905,
+      "loss": 0.4741,
+      "step": 23190
+    },
+    {
+      "epoch": 61.86666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029932157235352744,
+      "loss": 0.4731,
+      "step": 23200
+    },
+    {
+      "epoch": 61.89333333333333,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002993209749134156,
+      "loss": 0.4642,
+      "step": 23210
+    },
+    {
+      "epoch": 61.92,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002993203772109561,
+      "loss": 0.4753,
+      "step": 23220
+    },
+    {
+      "epoch": 61.946666666666665,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002993197792461499,
+      "loss": 0.4674,
+      "step": 23230
+    },
+    {
+      "epoch": 61.973333333333336,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029931918101899806,
+      "loss": 0.4785,
+      "step": 23240
+    },
+    {
+      "epoch": 62.0,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029931858252950166,
+      "loss": 0.4676,
+      "step": 23250
+    },
+    {
+      "epoch": 62.0,
+      "eval_loss": 0.4819358289241791,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.6535,
+      "eval_samples_per_second": 1.657,
+      "eval_steps_per_second": 0.104,
+      "step": 23250
+    },
+    {
+      "epoch": 62.026666666666664,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029931798377766177,
+      "loss": 0.4871,
+      "step": 23260
+    },
+    {
+      "epoch": 62.053333333333335,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029931738476347944,
+      "loss": 0.4929,
+      "step": 23270
+    },
+    {
+      "epoch": 62.08,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029931678548695564,
+      "loss": 0.4796,
+      "step": 23280
+    },
+    {
+      "epoch": 62.10666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029931618594809155,
+      "loss": 0.4743,
+      "step": 23290
+    },
+    {
+      "epoch": 62.13333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002993155861468881,
+      "loss": 0.4772,
+      "step": 23300
+    },
+    {
+      "epoch": 62.16,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029931498608334645,
+      "loss": 0.4811,
+      "step": 23310
+    },
+    {
+      "epoch": 62.18666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002993143857574676,
+      "loss": 0.4729,
+      "step": 23320
+    },
+    {
+      "epoch": 62.21333333333333,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029931378516925253,
+      "loss": 0.4676,
+      "step": 23330
+    },
+    {
+      "epoch": 62.24,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029931318431870244,
+      "loss": 0.4716,
+      "step": 23340
+    },
+    {
+      "epoch": 62.266666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002993125832058183,
+      "loss": 0.4759,
+      "step": 23350
+    },
+    {
+      "epoch": 62.29333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002993119818306012,
+      "loss": 0.4701,
+      "step": 23360
+    },
+    {
+      "epoch": 62.32,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029931138019305225,
+      "loss": 0.4734,
+      "step": 23370
+    },
+    {
+      "epoch": 62.346666666666664,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002993107782931724,
+      "loss": 0.4815,
+      "step": 23380
+    },
+    {
+      "epoch": 62.373333333333335,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0002993101761309627,
+      "loss": 0.478,
+      "step": 23390
+    },
+    {
+      "epoch": 62.4,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029930957370642426,
+      "loss": 0.4735,
+      "step": 23400
+    },
+    {
+      "epoch": 62.42666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002993089710195582,
+      "loss": 0.4802,
+      "step": 23410
+    },
+    {
+      "epoch": 62.45333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029930836807036545,
+      "loss": 0.4831,
+      "step": 23420
+    },
+    {
+      "epoch": 62.48,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029930776485884717,
+      "loss": 0.4888,
+      "step": 23430
+    },
+    {
+      "epoch": 62.50666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002993071613850044,
+      "loss": 0.4782,
+      "step": 23440
+    },
+    {
+      "epoch": 62.53333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029930655764883813,
+      "loss": 0.472,
+      "step": 23450
+    },
+    {
+      "epoch": 62.56,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002993059536503495,
+      "loss": 0.4731,
+      "step": 23460
+    },
+    {
+      "epoch": 62.586666666666666,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002993053493895395,
+      "loss": 0.4696,
+      "step": 23470
+    },
+    {
+      "epoch": 62.61333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029930474486640935,
+      "loss": 0.4552,
+      "step": 23480
+    },
+    {
+      "epoch": 62.64,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002993041400809599,
+      "loss": 0.4603,
+      "step": 23490
+    },
+    {
+      "epoch": 62.666666666666664,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029930353503319234,
+      "loss": 0.4749,
+      "step": 23500
+    },
+    {
+      "epoch": 62.693333333333335,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029930292972310773,
+      "loss": 0.458,
+      "step": 23510
+    },
+    {
+      "epoch": 62.72,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002993023241507071,
+      "loss": 0.4755,
+      "step": 23520
+    },
+    {
+      "epoch": 62.74666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029930171831599146,
+      "loss": 0.4794,
+      "step": 23530
+    },
+    {
+      "epoch": 62.77333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.000299301112218962,
+      "loss": 0.4874,
+      "step": 23540
+    },
+    {
+      "epoch": 62.8,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002993005058596197,
+      "loss": 0.4747,
+      "step": 23550
+    },
+    {
+      "epoch": 62.82666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029929989923796566,
+      "loss": 0.4695,
+      "step": 23560
+    },
+    {
+      "epoch": 62.85333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000299299292354001,
+      "loss": 0.4779,
+      "step": 23570
+    },
+    {
+      "epoch": 62.88,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002992986852077266,
+      "loss": 0.4657,
+      "step": 23580
+    },
+    {
+      "epoch": 62.906666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002992980777991437,
+      "loss": 0.469,
+      "step": 23590
+    },
+    {
+      "epoch": 62.93333333333333,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002992974701282533,
+      "loss": 0.4699,
+      "step": 23600
+    },
+    {
+      "epoch": 62.96,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029929686219505645,
+      "loss": 0.4676,
+      "step": 23610
+    },
+    {
+      "epoch": 62.986666666666665,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002992962539995543,
+      "loss": 0.4818,
+      "step": 23620
+    },
+    {
+      "epoch": 63.0,
+      "eval_loss": 0.48129552602767944,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0491,
+      "eval_samples_per_second": 1.592,
+      "eval_steps_per_second": 0.1,
+      "step": 23625
+    },
+    {
+      "epoch": 63.013333333333335,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029929564554174786,
+      "loss": 0.4699,
+      "step": 23630
+    },
+    {
+      "epoch": 63.04,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029929503682163817,
+      "loss": 0.4947,
+      "step": 23640
+    },
+    {
+      "epoch": 63.06666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029929442783922637,
+      "loss": 0.4823,
+      "step": 23650
+    },
+    {
+      "epoch": 63.093333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002992938185945135,
+      "loss": 0.4777,
+      "step": 23660
+    },
+    {
+      "epoch": 63.12,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002992932090875006,
+      "loss": 0.474,
+      "step": 23670
+    },
+    {
+      "epoch": 63.14666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002992925993181888,
+      "loss": 0.4827,
+      "step": 23680
+    },
+    {
+      "epoch": 63.17333333333333,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002992919892865791,
+      "loss": 0.4732,
+      "step": 23690
+    },
+    {
+      "epoch": 63.2,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002992913789926726,
+      "loss": 0.4716,
+      "step": 23700
+    },
+    {
+      "epoch": 63.22666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029929076843647045,
+      "loss": 0.469,
+      "step": 23710
+    },
+    {
+      "epoch": 63.25333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002992901576179736,
+      "loss": 0.469,
+      "step": 23720
+    },
+    {
+      "epoch": 63.28,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002992895465371832,
+      "loss": 0.483,
+      "step": 23730
+    },
+    {
+      "epoch": 63.306666666666665,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029928893519410026,
+      "loss": 0.4638,
+      "step": 23740
+    },
+    {
+      "epoch": 63.333333333333336,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029928832358872595,
+      "loss": 0.4795,
+      "step": 23750
+    },
+    {
+      "epoch": 63.36,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029928771172106123,
+      "loss": 0.4811,
+      "step": 23760
+    },
+    {
+      "epoch": 63.38666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002992870995911073,
+      "loss": 0.4723,
+      "step": 23770
+    },
+    {
+      "epoch": 63.413333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029928648719886513,
+      "loss": 0.4768,
+      "step": 23780
+    },
+    {
+      "epoch": 63.44,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002992858745443359,
+      "loss": 0.4805,
+      "step": 23790
+    },
+    {
+      "epoch": 63.46666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002992852616275206,
+      "loss": 0.4897,
+      "step": 23800
+    },
+    {
+      "epoch": 63.49333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029928464844842026,
+      "loss": 0.4807,
+      "step": 23810
+    },
+    {
+      "epoch": 63.52,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002992840350070361,
+      "loss": 0.4756,
+      "step": 23820
+    },
+    {
+      "epoch": 63.54666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029928342130336905,
+      "loss": 0.4723,
+      "step": 23830
+    },
+    {
+      "epoch": 63.57333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029928280733742033,
+      "loss": 0.4713,
+      "step": 23840
+    },
+    {
+      "epoch": 63.6,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029928219310919096,
+      "loss": 0.4648,
+      "step": 23850
+    },
+    {
+      "epoch": 63.626666666666665,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.000299281578618682,
+      "loss": 0.4516,
+      "step": 23860
+    },
+    {
+      "epoch": 63.653333333333336,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002992809638658945,
+      "loss": 0.4717,
+      "step": 23870
+    },
+    {
+      "epoch": 63.68,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002992803488508296,
+      "loss": 0.463,
+      "step": 23880
+    },
+    {
+      "epoch": 63.70666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029927973357348836,
+      "loss": 0.4638,
+      "step": 23890
+    },
+    {
+      "epoch": 63.733333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029927911803387185,
+      "loss": 0.4824,
+      "step": 23900
+    },
+    {
+      "epoch": 63.76,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002992785022319812,
+      "loss": 0.4843,
+      "step": 23910
+    },
+    {
+      "epoch": 63.78666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029927788616781743,
+      "loss": 0.4828,
+      "step": 23920
+    },
+    {
+      "epoch": 63.81333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002992772698413817,
+      "loss": 0.4674,
+      "step": 23930
+    },
+    {
+      "epoch": 63.84,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000299276653252675,
+      "loss": 0.4734,
+      "step": 23940
+    },
+    {
+      "epoch": 63.86666666666667,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0002992760364016984,
+      "loss": 0.4723,
+      "step": 23950
+    },
+    {
+      "epoch": 63.89333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029927541928845313,
+      "loss": 0.4637,
+      "step": 23960
+    },
+    {
+      "epoch": 63.92,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002992748019129401,
+      "loss": 0.4736,
+      "step": 23970
+    },
+    {
+      "epoch": 63.946666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029927418427516054,
+      "loss": 0.4668,
+      "step": 23980
+    },
+    {
+      "epoch": 63.973333333333336,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002992735663751155,
+      "loss": 0.4781,
+      "step": 23990
+    },
+    {
+      "epoch": 64.0,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029927294821280597,
+      "loss": 0.4674,
+      "step": 24000
+    },
+    {
+      "epoch": 64.0,
+      "eval_loss": 0.48202311992645264,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2055,
+      "eval_samples_per_second": 1.568,
+      "eval_steps_per_second": 0.098,
+      "step": 24000
+    },
+    {
+      "epoch": 64.02666666666667,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.00029927232978823313,
+      "loss": 0.4864,
+      "step": 24010
+    },
+    {
+      "epoch": 64.05333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.000299271711101398,
+      "loss": 0.4927,
+      "step": 24020
+    },
+    {
+      "epoch": 64.08,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029927109215230173,
+      "loss": 0.4798,
+      "step": 24030
+    },
+    {
+      "epoch": 64.10666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029927047294094544,
+      "loss": 0.4745,
+      "step": 24040
+    },
+    {
+      "epoch": 64.13333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029926985346733013,
+      "loss": 0.4766,
+      "step": 24050
+    },
+    {
+      "epoch": 64.16,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002992692337314569,
+      "loss": 0.4812,
+      "step": 24060
+    },
+    {
+      "epoch": 64.18666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029926861373332685,
+      "loss": 0.4725,
+      "step": 24070
+    },
+    {
+      "epoch": 64.21333333333334,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002992679934729411,
+      "loss": 0.4676,
+      "step": 24080
+    },
+    {
+      "epoch": 64.24,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002992673729503007,
+      "loss": 0.4714,
+      "step": 24090
+    },
+    {
+      "epoch": 64.26666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002992667521654068,
+      "loss": 0.4752,
+      "step": 24100
+    },
+    {
+      "epoch": 64.29333333333334,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002992661311182604,
+      "loss": 0.47,
+      "step": 24110
+    },
+    {
+      "epoch": 64.32,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029926550980886266,
+      "loss": 0.4734,
+      "step": 24120
+    },
+    {
+      "epoch": 64.34666666666666,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029926488823721467,
+      "loss": 0.481,
+      "step": 24130
+    },
+    {
+      "epoch": 64.37333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002992642664033175,
+      "loss": 0.4777,
+      "step": 24140
+    },
+    {
+      "epoch": 64.4,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029926364430717226,
+      "loss": 0.4731,
+      "step": 24150
+    },
+    {
+      "epoch": 64.42666666666666,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029926302194878,
+      "loss": 0.48,
+      "step": 24160
+    },
+    {
+      "epoch": 64.45333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002992623993281419,
+      "loss": 0.4828,
+      "step": 24170
+    },
+    {
+      "epoch": 64.48,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029926177644525894,
+      "loss": 0.4887,
+      "step": 24180
+    },
+    {
+      "epoch": 64.50666666666666,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002992611533001323,
+      "loss": 0.4779,
+      "step": 24190
+    },
+    {
+      "epoch": 64.53333333333333,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.000299260529892763,
+      "loss": 0.4719,
+      "step": 24200
+    },
+    {
+      "epoch": 64.56,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029925990622315225,
+      "loss": 0.4733,
+      "step": 24210
+    },
+    {
+      "epoch": 64.58666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029925928229130103,
+      "loss": 0.4682,
+      "step": 24220
+    },
+    {
+      "epoch": 64.61333333333333,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002992586580972105,
+      "loss": 0.4552,
+      "step": 24230
+    },
+    {
+      "epoch": 64.64,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029925803364088175,
+      "loss": 0.4596,
+      "step": 24240
+    },
+    {
+      "epoch": 64.66666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029925740892231586,
+      "loss": 0.4754,
+      "step": 24250
+    },
+    {
+      "epoch": 64.69333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002992567839415139,
+      "loss": 0.4591,
+      "step": 24260
+    },
+    {
+      "epoch": 64.72,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029925615869847707,
+      "loss": 0.4752,
+      "step": 24270
+    },
+    {
+      "epoch": 64.74666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029925553319320634,
+      "loss": 0.4798,
+      "step": 24280
+    },
+    {
+      "epoch": 64.77333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002992549074257029,
+      "loss": 0.4873,
+      "step": 24290
+    },
+    {
+      "epoch": 64.8,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002992542813959678,
+      "loss": 0.4738,
+      "step": 24300
+    },
+    {
+      "epoch": 64.82666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029925365510400216,
+      "loss": 0.4687,
+      "step": 24310
+    },
+    {
+      "epoch": 64.85333333333334,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029925302854980713,
+      "loss": 0.4777,
+      "step": 24320
+    },
+    {
+      "epoch": 64.88,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.00029925240173338377,
+      "loss": 0.4653,
+      "step": 24330
+    },
+    {
+      "epoch": 64.90666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002992517746547331,
+      "loss": 0.4687,
+      "step": 24340
+    },
+    {
+      "epoch": 64.93333333333334,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002992511473138563,
+      "loss": 0.4704,
+      "step": 24350
+    },
+    {
+      "epoch": 64.96,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002992505197107545,
+      "loss": 0.4671,
+      "step": 24360
+    },
+    {
+      "epoch": 64.98666666666666,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002992498918454288,
+      "loss": 0.482,
+      "step": 24370
+    },
+    {
+      "epoch": 65.0,
+      "eval_loss": 0.4812326729297638,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9729,
+      "eval_samples_per_second": 1.458,
+      "eval_steps_per_second": 0.091,
+      "step": 24375
+    },
+    {
+      "epoch": 65.01333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029924926371788024,
+      "loss": 0.4705,
+      "step": 24380
+    },
+    {
+      "epoch": 65.04,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.00029924863532810995,
+      "loss": 0.4951,
+      "step": 24390
+    },
+    {
+      "epoch": 65.06666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000299248006676119,
+      "loss": 0.4827,
+      "step": 24400
+    },
+    {
+      "epoch": 65.09333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002992473777619086,
+      "loss": 0.4766,
+      "step": 24410
+    },
+    {
+      "epoch": 65.12,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029924674858547976,
+      "loss": 0.4738,
+      "step": 24420
+    },
+    {
+      "epoch": 65.14666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029924611914683365,
+      "loss": 0.4829,
+      "step": 24430
+    },
+    {
+      "epoch": 65.17333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002992454894459713,
+      "loss": 0.473,
+      "step": 24440
+    },
+    {
+      "epoch": 65.2,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029924485948289387,
+      "loss": 0.4713,
+      "step": 24450
+    },
+    {
+      "epoch": 65.22666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029924422925760243,
+      "loss": 0.4691,
+      "step": 24460
+    },
+    {
+      "epoch": 65.25333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029924359877009815,
+      "loss": 0.469,
+      "step": 24470
+    },
+    {
+      "epoch": 65.28,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002992429680203821,
+      "loss": 0.483,
+      "step": 24480
+    },
+    {
+      "epoch": 65.30666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029924233700845534,
+      "loss": 0.4634,
+      "step": 24490
+    },
+    {
+      "epoch": 65.33333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002992417057343191,
+      "loss": 0.4794,
+      "step": 24500
+    },
+    {
+      "epoch": 65.36,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002992410741979744,
+      "loss": 0.4812,
+      "step": 24510
+    },
+    {
+      "epoch": 65.38666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002992404423994223,
+      "loss": 0.473,
+      "step": 24520
+    },
+    {
+      "epoch": 65.41333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029923981033866403,
+      "loss": 0.4765,
+      "step": 24530
+    },
+    {
+      "epoch": 65.44,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002992391780157006,
+      "loss": 0.4798,
+      "step": 24540
+    },
+    {
+      "epoch": 65.46666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029923854543053316,
+      "loss": 0.4896,
+      "step": 24550
+    },
+    {
+      "epoch": 65.49333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002992379125831629,
+      "loss": 0.4805,
+      "step": 24560
+    },
+    {
+      "epoch": 65.52,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029923727947359086,
+      "loss": 0.4752,
+      "step": 24570
+    },
+    {
+      "epoch": 65.54666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002992366461018181,
+      "loss": 0.4723,
+      "step": 24580
+    },
+    {
+      "epoch": 65.57333333333334,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002992360124678458,
+      "loss": 0.4708,
+      "step": 24590
+    },
+    {
+      "epoch": 65.6,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029923537857167506,
+      "loss": 0.4649,
+      "step": 24600
+    },
+    {
+      "epoch": 65.62666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029923474441330693,
+      "loss": 0.4517,
+      "step": 24610
+    },
+    {
+      "epoch": 65.65333333333334,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029923410999274267,
+      "loss": 0.4713,
+      "step": 24620
+    },
+    {
+      "epoch": 65.68,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029923347530998325,
+      "loss": 0.4626,
+      "step": 24630
+    },
+    {
+      "epoch": 65.70666666666666,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002992328403650299,
+      "loss": 0.4633,
+      "step": 24640
+    },
+    {
+      "epoch": 65.73333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029923220515788365,
+      "loss": 0.4826,
+      "step": 24650
+    },
+    {
+      "epoch": 65.76,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002992315696885457,
+      "loss": 0.4841,
+      "step": 24660
+    },
+    {
+      "epoch": 65.78666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000299230933957017,
+      "loss": 0.482,
+      "step": 24670
+    },
+    {
+      "epoch": 65.81333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002992302979632989,
+      "loss": 0.4666,
+      "step": 24680
+    },
+    {
+      "epoch": 65.84,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002992296617073923,
+      "loss": 0.473,
+      "step": 24690
+    },
+    {
+      "epoch": 65.86666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002992290251892985,
+      "loss": 0.4729,
+      "step": 24700
+    },
+    {
+      "epoch": 65.89333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029922838840901845,
+      "loss": 0.4635,
+      "step": 24710
+    },
+    {
+      "epoch": 65.92,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002992277513665534,
+      "loss": 0.4739,
+      "step": 24720
+    },
+    {
+      "epoch": 65.94666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029922711406190445,
+      "loss": 0.4668,
+      "step": 24730
+    },
+    {
+      "epoch": 65.97333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029922647649507266,
+      "loss": 0.4777,
+      "step": 24740
+    },
+    {
+      "epoch": 66.0,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029922583866605915,
+      "loss": 0.4668,
+      "step": 24750
+    },
+    {
+      "epoch": 66.0,
+      "eval_loss": 0.4806465804576874,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9252,
+      "eval_samples_per_second": 1.612,
+      "eval_steps_per_second": 0.101,
+      "step": 24750
+    },
+    {
+      "epoch": 66.02666666666667,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.0002992252005748651,
+      "loss": 0.4868,
+      "step": 24760
+    },
+    {
+      "epoch": 66.05333333333333,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.00029922456222149165,
+      "loss": 0.4924,
+      "step": 24770
+    },
+    {
+      "epoch": 66.08,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0002992239236059398,
+      "loss": 0.4821,
+      "step": 24780
+    },
+    {
+      "epoch": 66.10666666666667,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0002992232847282108,
+      "loss": 0.4756,
+      "step": 24790
+    },
+    {
+      "epoch": 66.13333333333334,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029922264558830566,
+      "loss": 0.4774,
+      "step": 24800
+    },
+    {
+      "epoch": 66.16,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029922200618622566,
+      "loss": 0.4799,
+      "step": 24810
+    },
+    {
+      "epoch": 66.18666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029922136652197175,
+      "loss": 0.4734,
+      "step": 24820
+    },
+    {
+      "epoch": 66.21333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002992207265955451,
+      "loss": 0.4676,
+      "step": 24830
+    },
+    {
+      "epoch": 66.24,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029922008640694697,
+      "loss": 0.4714,
+      "step": 24840
+    },
+    {
+      "epoch": 66.26666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002992194459561783,
+      "loss": 0.4755,
+      "step": 24850
+    },
+    {
+      "epoch": 66.29333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002992188052432403,
+      "loss": 0.4696,
+      "step": 24860
+    },
+    {
+      "epoch": 66.32,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029921816426813416,
+      "loss": 0.4732,
+      "step": 24870
+    },
+    {
+      "epoch": 66.34666666666666,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029921752303086087,
+      "loss": 0.4811,
+      "step": 24880
+    },
+    {
+      "epoch": 66.37333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029921688153142163,
+      "loss": 0.4776,
+      "step": 24890
+    },
+    {
+      "epoch": 66.4,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029921623976981754,
+      "loss": 0.4733,
+      "step": 24900
+    },
+    {
+      "epoch": 66.42666666666666,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029921559774604973,
+      "loss": 0.4797,
+      "step": 24910
+    },
+    {
+      "epoch": 66.45333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029921495546011946,
+      "loss": 0.4826,
+      "step": 24920
+    },
+    {
+      "epoch": 66.48,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029921431291202763,
+      "loss": 0.4885,
+      "step": 24930
+    },
+    {
+      "epoch": 66.50666666666666,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002992136701017755,
+      "loss": 0.4776,
+      "step": 24940
+    },
+    {
+      "epoch": 66.53333333333333,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002992130270293642,
+      "loss": 0.4714,
+      "step": 24950
+    },
+    {
+      "epoch": 66.56,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002992123836947949,
+      "loss": 0.4735,
+      "step": 24960
+    },
+    {
+      "epoch": 66.58666666666667,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00029921174009806857,
+      "loss": 0.4695,
+      "step": 24970
+    },
+    {
+      "epoch": 66.61333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002992110962391865,
+      "loss": 0.4543,
+      "step": 24980
+    },
+    {
+      "epoch": 66.64,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002992104521181497,
+      "loss": 0.4594,
+      "step": 24990
+    },
+    {
+      "epoch": 66.66666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029920980773495947,
+      "loss": 0.4749,
+      "step": 25000
+    },
+    {
+      "epoch": 66.69333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002992091630896168,
+      "loss": 0.4582,
+      "step": 25010
+    },
+    {
+      "epoch": 66.72,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029920851818212283,
+      "loss": 0.4752,
+      "step": 25020
+    },
+    {
+      "epoch": 66.74666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002992078730124787,
+      "loss": 0.4795,
+      "step": 25030
+    },
+    {
+      "epoch": 66.77333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002992072275806857,
+      "loss": 0.4867,
+      "step": 25040
+    },
+    {
+      "epoch": 66.8,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0002992065818867447,
+      "loss": 0.4742,
+      "step": 25050
+    },
+    {
+      "epoch": 66.82666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.000299205935930657,
+      "loss": 0.4685,
+      "step": 25060
+    },
+    {
+      "epoch": 66.85333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002992052897124237,
+      "loss": 0.4778,
+      "step": 25070
+    },
+    {
+      "epoch": 66.88,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029920464323204593,
+      "loss": 0.4655,
+      "step": 25080
+    },
+    {
+      "epoch": 66.90666666666667,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.00029920399648952485,
+      "loss": 0.4676,
+      "step": 25090
+    },
+    {
+      "epoch": 66.93333333333334,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.00029920334948486157,
+      "loss": 0.4697,
+      "step": 25100
+    },
+    {
+      "epoch": 66.96,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029920270221805723,
+      "loss": 0.4676,
+      "step": 25110
+    },
+    {
+      "epoch": 66.98666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.000299202054689113,
+      "loss": 0.4815,
+      "step": 25120
+    },
+    {
+      "epoch": 67.0,
+      "eval_loss": 0.481969952583313,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4397,
+      "eval_samples_per_second": 1.533,
+      "eval_steps_per_second": 0.096,
+      "step": 25125
+    },
+    {
+      "epoch": 67.01333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029920140689802994,
+      "loss": 0.4708,
+      "step": 25130
+    },
+    {
+      "epoch": 67.04,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029920075884480926,
+      "loss": 0.4952,
+      "step": 25140
+    },
+    {
+      "epoch": 67.06666666666666,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029920011052945203,
+      "loss": 0.482,
+      "step": 25150
+    },
+    {
+      "epoch": 67.09333333333333,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029919946195195953,
+      "loss": 0.4772,
+      "step": 25160
+    },
+    {
+      "epoch": 67.12,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029919881311233276,
+      "loss": 0.4734,
+      "step": 25170
+    },
+    {
+      "epoch": 67.14666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002991981640105729,
+      "loss": 0.4818,
+      "step": 25180
+    },
+    {
+      "epoch": 67.17333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029919751464668114,
+      "loss": 0.4733,
+      "step": 25190
+    },
+    {
+      "epoch": 67.2,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029919686502065846,
+      "loss": 0.471,
+      "step": 25200
+    },
+    {
+      "epoch": 67.22666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002991962151325062,
+      "loss": 0.4689,
+      "step": 25210
+    },
+    {
+      "epoch": 67.25333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029919556498222543,
+      "loss": 0.4691,
+      "step": 25220
+    },
+    {
+      "epoch": 67.28,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00029919491456981725,
+      "loss": 0.4819,
+      "step": 25230
+    },
+    {
+      "epoch": 67.30666666666667,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029919426389528287,
+      "loss": 0.4635,
+      "step": 25240
+    },
+    {
+      "epoch": 67.33333333333333,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002991936129586233,
+      "loss": 0.4792,
+      "step": 25250
+    },
+    {
+      "epoch": 67.36,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002991929617598399,
+      "loss": 0.4815,
+      "step": 25260
+    },
+    {
+      "epoch": 67.38666666666667,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002991923102989337,
+      "loss": 0.473,
+      "step": 25270
+    },
+    {
+      "epoch": 67.41333333333333,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.00029919165857590575,
+      "loss": 0.4775,
+      "step": 25280
+    },
+    {
+      "epoch": 67.44,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002991910065907573,
+      "loss": 0.4801,
+      "step": 25290
+    },
+    {
+      "epoch": 67.46666666666667,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002991903543434895,
+      "loss": 0.489,
+      "step": 25300
+    },
+    {
+      "epoch": 67.49333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029918970183410347,
+      "loss": 0.4808,
+      "step": 25310
+    },
+    {
+      "epoch": 67.52,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002991890490626004,
+      "loss": 0.4752,
+      "step": 25320
+    },
+    {
+      "epoch": 67.54666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002991883960289814,
+      "loss": 0.4722,
+      "step": 25330
+    },
+    {
+      "epoch": 67.57333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029918774273324754,
+      "loss": 0.4711,
+      "step": 25340
+    },
+    {
+      "epoch": 67.6,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002991870891754001,
+      "loss": 0.4649,
+      "step": 25350
+    },
+    {
+      "epoch": 67.62666666666667,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0002991864353554402,
+      "loss": 0.4521,
+      "step": 25360
+    },
+    {
+      "epoch": 67.65333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002991857812733689,
+      "loss": 0.4719,
+      "step": 25370
+    },
+    {
+      "epoch": 67.68,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002991851269291875,
+      "loss": 0.4628,
+      "step": 25380
+    },
+    {
+      "epoch": 67.70666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029918447232289697,
+      "loss": 0.4633,
+      "step": 25390
+    },
+    {
+      "epoch": 67.73333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002991838174544986,
+      "loss": 0.4821,
+      "step": 25400
+    },
+    {
+      "epoch": 67.76,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029918316232399345,
+      "loss": 0.4835,
+      "step": 25410
+    },
+    {
+      "epoch": 67.78666666666666,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029918250693138266,
+      "loss": 0.4818,
+      "step": 25420
+    },
+    {
+      "epoch": 67.81333333333333,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002991818512766676,
+      "loss": 0.4675,
+      "step": 25430
+    },
+    {
+      "epoch": 67.84,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002991811953598491,
+      "loss": 0.4728,
+      "step": 25440
+    },
+    {
+      "epoch": 67.86666666666666,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029918053918092855,
+      "loss": 0.4729,
+      "step": 25450
+    },
+    {
+      "epoch": 67.89333333333333,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.000299179882739907,
+      "loss": 0.4636,
+      "step": 25460
+    },
+    {
+      "epoch": 67.92,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002991792260367856,
+      "loss": 0.4727,
+      "step": 25470
+    },
+    {
+      "epoch": 67.94666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002991785690715656,
+      "loss": 0.4662,
+      "step": 25480
+    },
+    {
+      "epoch": 67.97333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029917791184424803,
+      "loss": 0.4778,
+      "step": 25490
+    },
+    {
+      "epoch": 68.0,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002991772543548341,
+      "loss": 0.4673,
+      "step": 25500
+    },
+    {
+      "epoch": 68.0,
+      "eval_loss": 0.482963889837265,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.6219,
+      "eval_samples_per_second": 1.663,
+      "eval_steps_per_second": 0.104,
+      "step": 25500
+    },
+    {
+      "epoch": 68.02666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002991765966033249,
+      "loss": 0.4862,
+      "step": 25510
+    },
+    {
+      "epoch": 68.05333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002991759385897217,
+      "loss": 0.4916,
+      "step": 25520
+    },
+    {
+      "epoch": 68.08,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002991752803140257,
+      "loss": 0.4794,
+      "step": 25530
+    },
+    {
+      "epoch": 68.10666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029917462177623784,
+      "loss": 0.4744,
+      "step": 25540
+    },
+    {
+      "epoch": 68.13333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002991739629763594,
+      "loss": 0.4759,
+      "step": 25550
+    },
+    {
+      "epoch": 68.16,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002991733039143916,
+      "loss": 0.4807,
+      "step": 25560
+    },
+    {
+      "epoch": 68.18666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029917264459033554,
+      "loss": 0.4723,
+      "step": 25570
+    },
+    {
+      "epoch": 68.21333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002991719850041923,
+      "loss": 0.4672,
+      "step": 25580
+    },
+    {
+      "epoch": 68.24,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002991713251559631,
+      "loss": 0.4708,
+      "step": 25590
+    },
+    {
+      "epoch": 68.26666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029917066504564916,
+      "loss": 0.4757,
+      "step": 25600
+    },
+    {
+      "epoch": 68.29333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002991700046732516,
+      "loss": 0.469,
+      "step": 25610
+    },
+    {
+      "epoch": 68.32,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029916934403877155,
+      "loss": 0.4728,
+      "step": 25620
+    },
+    {
+      "epoch": 68.34666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029916868314221017,
+      "loss": 0.4804,
+      "step": 25630
+    },
+    {
+      "epoch": 68.37333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029916802198356866,
+      "loss": 0.4766,
+      "step": 25640
+    },
+    {
+      "epoch": 68.4,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029916736056284816,
+      "loss": 0.4737,
+      "step": 25650
+    },
+    {
+      "epoch": 68.42666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002991666988800499,
+      "loss": 0.4802,
+      "step": 25660
+    },
+    {
+      "epoch": 68.45333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002991660369351749,
+      "loss": 0.482,
+      "step": 25670
+    },
+    {
+      "epoch": 68.48,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029916537472822436,
+      "loss": 0.4874,
+      "step": 25680
+    },
+    {
+      "epoch": 68.50666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002991647122591996,
+      "loss": 0.4779,
+      "step": 25690
+    },
+    {
+      "epoch": 68.53333333333333,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002991640495281016,
+      "loss": 0.4713,
+      "step": 25700
+    },
+    {
+      "epoch": 68.56,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002991633865349316,
+      "loss": 0.4725,
+      "step": 25710
+    },
+    {
+      "epoch": 68.58666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029916272327969076,
+      "loss": 0.4688,
+      "step": 25720
+    },
+    {
+      "epoch": 68.61333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029916205976238016,
+      "loss": 0.4546,
+      "step": 25730
+    },
+    {
+      "epoch": 68.64,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002991613959830012,
+      "loss": 0.4591,
+      "step": 25740
+    },
+    {
+      "epoch": 68.66666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002991607319415548,
+      "loss": 0.4744,
+      "step": 25750
+    },
+    {
+      "epoch": 68.69333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002991600676380422,
+      "loss": 0.4578,
+      "step": 25760
+    },
+    {
+      "epoch": 68.72,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002991594030724647,
+      "loss": 0.4744,
+      "step": 25770
+    },
+    {
+      "epoch": 68.74666666666667,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029915873824482324,
+      "loss": 0.4796,
+      "step": 25780
+    },
+    {
+      "epoch": 68.77333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029915807315511917,
+      "loss": 0.4875,
+      "step": 25790
+    },
+    {
+      "epoch": 68.8,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029915740780335357,
+      "loss": 0.4733,
+      "step": 25800
+    },
+    {
+      "epoch": 68.82666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029915674218952763,
+      "loss": 0.4673,
+      "step": 25810
+    },
+    {
+      "epoch": 68.85333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002991560763136425,
+      "loss": 0.4776,
+      "step": 25820
+    },
+    {
+      "epoch": 68.88,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002991554101756994,
+      "loss": 0.4645,
+      "step": 25830
+    },
+    {
+      "epoch": 68.90666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029915474377569944,
+      "loss": 0.4676,
+      "step": 25840
+    },
+    {
+      "epoch": 68.93333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002991540771136439,
+      "loss": 0.4683,
+      "step": 25850
+    },
+    {
+      "epoch": 68.96,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002991534101895338,
+      "loss": 0.4679,
+      "step": 25860
+    },
+    {
+      "epoch": 68.98666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002991527430033704,
+      "loss": 0.4816,
+      "step": 25870
+    },
+    {
+      "epoch": 69.0,
+      "eval_loss": 0.48096296191215515,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.278,
+      "eval_samples_per_second": 1.557,
+      "eval_steps_per_second": 0.097,
+      "step": 25875
+    },
+    {
+      "epoch": 69.01333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029915207555515486,
+      "loss": 0.4706,
+      "step": 25880
+    },
+    {
+      "epoch": 69.04,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029915140784488834,
+      "loss": 0.4949,
+      "step": 25890
+    },
+    {
+      "epoch": 69.06666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029915073987257204,
+      "loss": 0.4821,
+      "step": 25900
+    },
+    {
+      "epoch": 69.09333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002991500716382071,
+      "loss": 0.4776,
+      "step": 25910
+    },
+    {
+      "epoch": 69.12,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029914940314179476,
+      "loss": 0.4729,
+      "step": 25920
+    },
+    {
+      "epoch": 69.14666666666666,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029914873438333605,
+      "loss": 0.482,
+      "step": 25930
+    },
+    {
+      "epoch": 69.17333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029914806536283234,
+      "loss": 0.4724,
+      "step": 25940
+    },
+    {
+      "epoch": 69.2,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002991473960802846,
+      "loss": 0.4708,
+      "step": 25950
+    },
+    {
+      "epoch": 69.22666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002991467265356942,
+      "loss": 0.4684,
+      "step": 25960
+    },
+    {
+      "epoch": 69.25333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029914605672906223,
+      "loss": 0.4684,
+      "step": 25970
+    },
+    {
+      "epoch": 69.28,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002991453866603898,
+      "loss": 0.4817,
+      "step": 25980
+    },
+    {
+      "epoch": 69.30666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002991447163296782,
+      "loss": 0.4634,
+      "step": 25990
+    },
+    {
+      "epoch": 69.33333333333333,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.00029914404573692856,
+      "loss": 0.4795,
+      "step": 26000
+    },
+    {
+      "epoch": 69.36,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000299143374882142,
+      "loss": 0.4811,
+      "step": 26010
+    },
+    {
+      "epoch": 69.38666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029914270376531985,
+      "loss": 0.4728,
+      "step": 26020
+    },
+    {
+      "epoch": 69.41333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002991420323864631,
+      "loss": 0.477,
+      "step": 26030
+    },
+    {
+      "epoch": 69.44,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029914136074557306,
+      "loss": 0.4799,
+      "step": 26040
+    },
+    {
+      "epoch": 69.46666666666667,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.00029914068884265093,
+      "loss": 0.4892,
+      "step": 26050
+    },
+    {
+      "epoch": 69.49333333333334,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002991400166776978,
+      "loss": 0.4803,
+      "step": 26060
+    },
+    {
+      "epoch": 69.52,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002991393442507148,
+      "loss": 0.4747,
+      "step": 26070
+    },
+    {
+      "epoch": 69.54666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002991386715617033,
+      "loss": 0.4715,
+      "step": 26080
+    },
+    {
+      "epoch": 69.57333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029913799861066437,
+      "loss": 0.4718,
+      "step": 26090
+    },
+    {
+      "epoch": 69.6,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029913732539759916,
+      "loss": 0.4646,
+      "step": 26100
+    },
+    {
+      "epoch": 69.62666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029913665192250895,
+      "loss": 0.4517,
+      "step": 26110
+    },
+    {
+      "epoch": 69.65333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029913597818539484,
+      "loss": 0.4709,
+      "step": 26120
+    },
+    {
+      "epoch": 69.68,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.000299135304186258,
+      "loss": 0.4626,
+      "step": 26130
+    },
+    {
+      "epoch": 69.70666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002991346299250997,
+      "loss": 0.4634,
+      "step": 26140
+    },
+    {
+      "epoch": 69.73333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002991339554019211,
+      "loss": 0.4822,
+      "step": 26150
+    },
+    {
+      "epoch": 69.76,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029913328061672336,
+      "loss": 0.4838,
+      "step": 26160
+    },
+    {
+      "epoch": 69.78666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029913260556950765,
+      "loss": 0.4814,
+      "step": 26170
+    },
+    {
+      "epoch": 69.81333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002991319302602752,
+      "loss": 0.4673,
+      "step": 26180
+    },
+    {
+      "epoch": 69.84,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002991312546890271,
+      "loss": 0.4729,
+      "step": 26190
+    },
+    {
+      "epoch": 69.86666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029913057885576464,
+      "loss": 0.4724,
+      "step": 26200
+    },
+    {
+      "epoch": 69.89333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.000299129902760489,
+      "loss": 0.4635,
+      "step": 26210
+    },
+    {
+      "epoch": 69.92,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002991292264032014,
+      "loss": 0.4729,
+      "step": 26220
+    },
+    {
+      "epoch": 69.94666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002991285497839029,
+      "loss": 0.4661,
+      "step": 26230
+    },
+    {
+      "epoch": 69.97333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029912787290259477,
+      "loss": 0.4779,
+      "step": 26240
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029912719575927825,
+      "loss": 0.4661,
+      "step": 26250
+    },
+    {
+      "epoch": 70.0,
+      "eval_loss": 0.4806675612926483,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9085,
+      "eval_samples_per_second": 1.615,
+      "eval_steps_per_second": 0.101,
+      "step": 26250
+    },
+    {
+      "epoch": 70.02666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002991265183539544,
+      "loss": 0.4862,
+      "step": 26260
+    },
+    {
+      "epoch": 70.05333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002991258406866245,
+      "loss": 0.4925,
+      "step": 26270
+    },
+    {
+      "epoch": 70.08,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029912516275728976,
+      "loss": 0.4794,
+      "step": 26280
+    },
+    {
+      "epoch": 70.10666666666667,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0002991244845659513,
+      "loss": 0.4741,
+      "step": 26290
+    },
+    {
+      "epoch": 70.13333333333334,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00029912380611261035,
+      "loss": 0.4759,
+      "step": 26300
+    },
+    {
+      "epoch": 70.16,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029912312739726807,
+      "loss": 0.4789,
+      "step": 26310
+    },
+    {
+      "epoch": 70.18666666666667,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029912244841992573,
+      "loss": 0.472,
+      "step": 26320
+    },
+    {
+      "epoch": 70.21333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002991217691805844,
+      "loss": 0.467,
+      "step": 26330
+    },
+    {
+      "epoch": 70.24,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029912108967924543,
+      "loss": 0.4717,
+      "step": 26340
+    },
+    {
+      "epoch": 70.26666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029912040991590986,
+      "loss": 0.4747,
+      "step": 26350
+    },
+    {
+      "epoch": 70.29333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029911972989057905,
+      "loss": 0.4687,
+      "step": 26360
+    },
+    {
+      "epoch": 70.32,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029911904960325405,
+      "loss": 0.4731,
+      "step": 26370
+    },
+    {
+      "epoch": 70.34666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002991183690539361,
+      "loss": 0.4811,
+      "step": 26380
+    },
+    {
+      "epoch": 70.37333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029911768824262636,
+      "loss": 0.478,
+      "step": 26390
+    },
+    {
+      "epoch": 70.4,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002991170071693261,
+      "loss": 0.4728,
+      "step": 26400
+    },
+    {
+      "epoch": 70.42666666666666,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002991163258340365,
+      "loss": 0.4789,
+      "step": 26410
+    },
+    {
+      "epoch": 70.45333333333333,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002991156442367587,
+      "loss": 0.4821,
+      "step": 26420
+    },
+    {
+      "epoch": 70.48,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.000299114962377494,
+      "loss": 0.4879,
+      "step": 26430
+    },
+    {
+      "epoch": 70.50666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029911428025624346,
+      "loss": 0.4776,
+      "step": 26440
+    },
+    {
+      "epoch": 70.53333333333333,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002991135978730084,
+      "loss": 0.4714,
+      "step": 26450
+    },
+    {
+      "epoch": 70.56,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029911291522778996,
+      "loss": 0.473,
+      "step": 26460
+    },
+    {
+      "epoch": 70.58666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002991122323205894,
+      "loss": 0.4685,
+      "step": 26470
+    },
+    {
+      "epoch": 70.61333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002991115491514078,
+      "loss": 0.4539,
+      "step": 26480
+    },
+    {
+      "epoch": 70.64,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002991108657202464,
+      "loss": 0.459,
+      "step": 26490
+    },
+    {
+      "epoch": 70.66666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029911018202710655,
+      "loss": 0.4748,
+      "step": 26500
+    },
+    {
+      "epoch": 70.69333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029910949807198924,
+      "loss": 0.4578,
+      "step": 26510
+    },
+    {
+      "epoch": 70.72,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002991088138548958,
+      "loss": 0.4755,
+      "step": 26520
+    },
+    {
+      "epoch": 70.74666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002991081293758274,
+      "loss": 0.4788,
+      "step": 26530
+    },
+    {
+      "epoch": 70.77333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002991074446347852,
+      "loss": 0.486,
+      "step": 26540
+    },
+    {
+      "epoch": 70.8,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002991067596317705,
+      "loss": 0.473,
+      "step": 26550
+    },
+    {
+      "epoch": 70.82666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002991060743667844,
+      "loss": 0.4679,
+      "step": 26560
+    },
+    {
+      "epoch": 70.85333333333334,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029910538883982826,
+      "loss": 0.4769,
+      "step": 26570
+    },
+    {
+      "epoch": 70.88,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029910470305090305,
+      "loss": 0.4647,
+      "step": 26580
+    },
+    {
+      "epoch": 70.90666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029910401700001014,
+      "loss": 0.4675,
+      "step": 26590
+    },
+    {
+      "epoch": 70.93333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029910333068715073,
+      "loss": 0.4693,
+      "step": 26600
+    },
+    {
+      "epoch": 70.96,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029910264411232596,
+      "loss": 0.4672,
+      "step": 26610
+    },
+    {
+      "epoch": 70.98666666666666,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029910195727553707,
+      "loss": 0.481,
+      "step": 26620
+    },
+    {
+      "epoch": 71.0,
+      "eval_loss": 0.4816884398460388,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.311,
+      "eval_samples_per_second": 1.552,
+      "eval_steps_per_second": 0.097,
+      "step": 26625
+    },
+    {
+      "epoch": 71.01333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002991012701767853,
+      "loss": 0.4697,
+      "step": 26630
+    },
+    {
+      "epoch": 71.04,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002991005828160717,
+      "loss": 0.4945,
+      "step": 26640
+    },
+    {
+      "epoch": 71.06666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029909989519339775,
+      "loss": 0.4816,
+      "step": 26650
+    },
+    {
+      "epoch": 71.09333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029909920730876444,
+      "loss": 0.4764,
+      "step": 26660
+    },
+    {
+      "epoch": 71.12,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.000299098519162173,
+      "loss": 0.4732,
+      "step": 26670
+    },
+    {
+      "epoch": 71.14666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029909783075362476,
+      "loss": 0.4819,
+      "step": 26680
+    },
+    {
+      "epoch": 71.17333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029909714208312084,
+      "loss": 0.4728,
+      "step": 26690
+    },
+    {
+      "epoch": 71.2,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029909645315066244,
+      "loss": 0.4705,
+      "step": 26700
+    },
+    {
+      "epoch": 71.22666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029909576395625083,
+      "loss": 0.4684,
+      "step": 26710
+    },
+    {
+      "epoch": 71.25333333333333,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029909507449988714,
+      "loss": 0.4688,
+      "step": 26720
+    },
+    {
+      "epoch": 71.28,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002990943847815726,
+      "loss": 0.4814,
+      "step": 26730
+    },
+    {
+      "epoch": 71.30666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029909369480130856,
+      "loss": 0.4633,
+      "step": 26740
+    },
+    {
+      "epoch": 71.33333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029909300455909605,
+      "loss": 0.4792,
+      "step": 26750
+    },
+    {
+      "epoch": 71.36,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029909231405493634,
+      "loss": 0.4807,
+      "step": 26760
+    },
+    {
+      "epoch": 71.38666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002990916232888307,
+      "loss": 0.4726,
+      "step": 26770
+    },
+    {
+      "epoch": 71.41333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002990909322607803,
+      "loss": 0.4771,
+      "step": 26780
+    },
+    {
+      "epoch": 71.44,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029909024097078636,
+      "loss": 0.4799,
+      "step": 26790
+    },
+    {
+      "epoch": 71.46666666666667,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029908954941885004,
+      "loss": 0.4892,
+      "step": 26800
+    },
+    {
+      "epoch": 71.49333333333334,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029908885760497264,
+      "loss": 0.4797,
+      "step": 26810
+    },
+    {
+      "epoch": 71.52,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002990881655291553,
+      "loss": 0.4744,
+      "step": 26820
+    },
+    {
+      "epoch": 71.54666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029908747319139933,
+      "loss": 0.4717,
+      "step": 26830
+    },
+    {
+      "epoch": 71.57333333333334,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029908678059170586,
+      "loss": 0.4707,
+      "step": 26840
+    },
+    {
+      "epoch": 71.6,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029908608773007614,
+      "loss": 0.4648,
+      "step": 26850
+    },
+    {
+      "epoch": 71.62666666666667,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029908539460651135,
+      "loss": 0.4512,
+      "step": 26860
+    },
+    {
+      "epoch": 71.65333333333334,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002990847012210128,
+      "loss": 0.4715,
+      "step": 26870
+    },
+    {
+      "epoch": 71.68,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029908400757358165,
+      "loss": 0.4622,
+      "step": 26880
+    },
+    {
+      "epoch": 71.70666666666666,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002990833136642191,
+      "loss": 0.4627,
+      "step": 26890
+    },
+    {
+      "epoch": 71.73333333333333,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029908261949292635,
+      "loss": 0.4823,
+      "step": 26900
+    },
+    {
+      "epoch": 71.76,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029908192505970476,
+      "loss": 0.4837,
+      "step": 26910
+    },
+    {
+      "epoch": 71.78666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029908123036455537,
+      "loss": 0.4826,
+      "step": 26920
+    },
+    {
+      "epoch": 71.81333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029908053540747954,
+      "loss": 0.4665,
+      "step": 26930
+    },
+    {
+      "epoch": 71.84,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002990798401884784,
+      "loss": 0.4729,
+      "step": 26940
+    },
+    {
+      "epoch": 71.86666666666666,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002990791447075532,
+      "loss": 0.4722,
+      "step": 26950
+    },
+    {
+      "epoch": 71.89333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029907844896470513,
+      "loss": 0.4633,
+      "step": 26960
+    },
+    {
+      "epoch": 71.92,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002990777529599355,
+      "loss": 0.4734,
+      "step": 26970
+    },
+    {
+      "epoch": 71.94666666666667,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.00029907705669324544,
+      "loss": 0.4648,
+      "step": 26980
+    },
+    {
+      "epoch": 71.97333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029907636016463624,
+      "loss": 0.4775,
+      "step": 26990
+    },
+    {
+      "epoch": 72.0,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002990756633741091,
+      "loss": 0.4662,
+      "step": 27000
+    },
+    {
+      "epoch": 72.0,
+      "eval_loss": 0.4817294776439667,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7689,
+      "eval_samples_per_second": 1.638,
+      "eval_steps_per_second": 0.102,
+      "step": 27000
+    },
+    {
+      "epoch": 72.02666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029907496632166524,
+      "loss": 0.4857,
+      "step": 27010
+    },
+    {
+      "epoch": 72.05333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002990742690073059,
+      "loss": 0.4925,
+      "step": 27020
+    },
+    {
+      "epoch": 72.08,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002990735714310323,
+      "loss": 0.4785,
+      "step": 27030
+    },
+    {
+      "epoch": 72.10666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002990728735928456,
+      "loss": 0.4741,
+      "step": 27040
+    },
+    {
+      "epoch": 72.13333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029907217549274707,
+      "loss": 0.4752,
+      "step": 27050
+    },
+    {
+      "epoch": 72.16,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.000299071477130738,
+      "loss": 0.4802,
+      "step": 27060
+    },
+    {
+      "epoch": 72.18666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002990707785068196,
+      "loss": 0.4719,
+      "step": 27070
+    },
+    {
+      "epoch": 72.21333333333334,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.000299070079620993,
+      "loss": 0.4667,
+      "step": 27080
+    },
+    {
+      "epoch": 72.24,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002990693804732595,
+      "loss": 0.4707,
+      "step": 27090
+    },
+    {
+      "epoch": 72.26666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002990686810636203,
+      "loss": 0.4752,
+      "step": 27100
+    },
+    {
+      "epoch": 72.29333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002990679813920767,
+      "loss": 0.4687,
+      "step": 27110
+    },
+    {
+      "epoch": 72.32,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029906728145862987,
+      "loss": 0.473,
+      "step": 27120
+    },
+    {
+      "epoch": 72.34666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000299066581263281,
+      "loss": 0.4809,
+      "step": 27130
+    },
+    {
+      "epoch": 72.37333333333333,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002990658808060314,
+      "loss": 0.4769,
+      "step": 27140
+    },
+    {
+      "epoch": 72.4,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002990651800868823,
+      "loss": 0.4735,
+      "step": 27150
+    },
+    {
+      "epoch": 72.42666666666666,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029906447910583485,
+      "loss": 0.4787,
+      "step": 27160
+    },
+    {
+      "epoch": 72.45333333333333,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029906377786289033,
+      "loss": 0.4816,
+      "step": 27170
+    },
+    {
+      "epoch": 72.48,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029906307635805,
+      "loss": 0.488,
+      "step": 27180
+    },
+    {
+      "epoch": 72.50666666666666,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029906237459131506,
+      "loss": 0.4774,
+      "step": 27190
+    },
+    {
+      "epoch": 72.53333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029906167256268674,
+      "loss": 0.4711,
+      "step": 27200
+    },
+    {
+      "epoch": 72.56,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0002990609702721663,
+      "loss": 0.4729,
+      "step": 27210
+    },
+    {
+      "epoch": 72.58666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.000299060267719755,
+      "loss": 0.4688,
+      "step": 27220
+    },
+    {
+      "epoch": 72.61333333333333,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029905956490545397,
+      "loss": 0.454,
+      "step": 27230
+    },
+    {
+      "epoch": 72.64,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029905886182926447,
+      "loss": 0.4591,
+      "step": 27240
+    },
+    {
+      "epoch": 72.66666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029905815849118783,
+      "loss": 0.4752,
+      "step": 27250
+    },
+    {
+      "epoch": 72.69333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002990574548912252,
+      "loss": 0.4575,
+      "step": 27260
+    },
+    {
+      "epoch": 72.72,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029905675102937787,
+      "loss": 0.4747,
+      "step": 27270
+    },
+    {
+      "epoch": 72.74666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029905604690564703,
+      "loss": 0.4787,
+      "step": 27280
+    },
+    {
+      "epoch": 72.77333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029905534252003393,
+      "loss": 0.4863,
+      "step": 27290
+    },
+    {
+      "epoch": 72.8,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029905463787253977,
+      "loss": 0.4729,
+      "step": 27300
+    },
+    {
+      "epoch": 72.82666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002990539329631659,
+      "loss": 0.4682,
+      "step": 27310
+    },
+    {
+      "epoch": 72.85333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002990532277919135,
+      "loss": 0.4768,
+      "step": 27320
+    },
+    {
+      "epoch": 72.88,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00029905252235878373,
+      "loss": 0.4648,
+      "step": 27330
+    },
+    {
+      "epoch": 72.90666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029905181666377794,
+      "loss": 0.4674,
+      "step": 27340
+    },
+    {
+      "epoch": 72.93333333333334,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002990511107068973,
+      "loss": 0.469,
+      "step": 27350
+    },
+    {
+      "epoch": 72.96,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002990504044881431,
+      "loss": 0.4673,
+      "step": 27360
+    },
+    {
+      "epoch": 72.98666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002990496980075166,
+      "loss": 0.481,
+      "step": 27370
+    },
+    {
+      "epoch": 73.0,
+      "eval_loss": 0.4834159314632416,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.116,
+      "eval_samples_per_second": 1.582,
+      "eval_steps_per_second": 0.099,
+      "step": 27375
+    },
+    {
+      "epoch": 73.01333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000299048991265019,
+      "loss": 0.4702,
+      "step": 27380
+    },
+    {
+      "epoch": 73.04,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002990482842606515,
+      "loss": 0.4945,
+      "step": 27390
+    },
+    {
+      "epoch": 73.06666666666666,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002990475769944154,
+      "loss": 0.4819,
+      "step": 27400
+    },
+    {
+      "epoch": 73.09333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002990468694663119,
+      "loss": 0.4774,
+      "step": 27410
+    },
+    {
+      "epoch": 73.12,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002990461616763423,
+      "loss": 0.4726,
+      "step": 27420
+    },
+    {
+      "epoch": 73.14666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002990454536245078,
+      "loss": 0.4819,
+      "step": 27430
+    },
+    {
+      "epoch": 73.17333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029904474531080964,
+      "loss": 0.4725,
+      "step": 27440
+    },
+    {
+      "epoch": 73.2,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0002990440367352491,
+      "loss": 0.4706,
+      "step": 27450
+    },
+    {
+      "epoch": 73.22666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002990433278978274,
+      "loss": 0.4688,
+      "step": 27460
+    },
+    {
+      "epoch": 73.25333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029904261879854584,
+      "loss": 0.4686,
+      "step": 27470
+    },
+    {
+      "epoch": 73.28,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002990419094374056,
+      "loss": 0.4815,
+      "step": 27480
+    },
+    {
+      "epoch": 73.30666666666667,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002990411998144079,
+      "loss": 0.4627,
+      "step": 27490
+    },
+    {
+      "epoch": 73.33333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029904048992955405,
+      "loss": 0.4788,
+      "step": 27500
+    },
+    {
+      "epoch": 73.36,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002990397797828453,
+      "loss": 0.4812,
+      "step": 27510
+    },
+    {
+      "epoch": 73.38666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029903906937428287,
+      "loss": 0.4716,
+      "step": 27520
+    },
+    {
+      "epoch": 73.41333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.000299038358703868,
+      "loss": 0.4766,
+      "step": 27530
+    },
+    {
+      "epoch": 73.44,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.000299037647771602,
+      "loss": 0.4796,
+      "step": 27540
+    },
+    {
+      "epoch": 73.46666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029903693657748595,
+      "loss": 0.4892,
+      "step": 27550
+    },
+    {
+      "epoch": 73.49333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029903622512152133,
+      "loss": 0.4803,
+      "step": 27560
+    },
+    {
+      "epoch": 73.52,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029903551340370926,
+      "loss": 0.4744,
+      "step": 27570
+    },
+    {
+      "epoch": 73.54666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000299034801424051,
+      "loss": 0.4716,
+      "step": 27580
+    },
+    {
+      "epoch": 73.57333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029903408918254785,
+      "loss": 0.471,
+      "step": 27590
+    },
+    {
+      "epoch": 73.6,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.00029903337667920094,
+      "loss": 0.4639,
+      "step": 27600
+    },
+    {
+      "epoch": 73.62666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002990326639140117,
+      "loss": 0.4505,
+      "step": 27610
+    },
+    {
+      "epoch": 73.65333333333334,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.00029903195088698125,
+      "loss": 0.4709,
+      "step": 27620
+    },
+    {
+      "epoch": 73.68,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002990312375981109,
+      "loss": 0.4624,
+      "step": 27630
+    },
+    {
+      "epoch": 73.70666666666666,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002990305240474018,
+      "loss": 0.4629,
+      "step": 27640
+    },
+    {
+      "epoch": 73.73333333333333,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002990298102348554,
+      "loss": 0.482,
+      "step": 27650
+    },
+    {
+      "epoch": 73.76,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002990290961604728,
+      "loss": 0.4825,
+      "step": 27660
+    },
+    {
+      "epoch": 73.78666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002990283818242553,
+      "loss": 0.4814,
+      "step": 27670
+    },
+    {
+      "epoch": 73.81333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029902766722620413,
+      "loss": 0.4666,
+      "step": 27680
+    },
+    {
+      "epoch": 73.84,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002990269523663206,
+      "loss": 0.4731,
+      "step": 27690
+    },
+    {
+      "epoch": 73.86666666666666,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002990262372446059,
+      "loss": 0.4719,
+      "step": 27700
+    },
+    {
+      "epoch": 73.89333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002990255218610613,
+      "loss": 0.463,
+      "step": 27710
+    },
+    {
+      "epoch": 73.92,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002990248062156881,
+      "loss": 0.4731,
+      "step": 27720
+    },
+    {
+      "epoch": 73.94666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002990240903084876,
+      "loss": 0.4665,
+      "step": 27730
+    },
+    {
+      "epoch": 73.97333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029902337413946094,
+      "loss": 0.4769,
+      "step": 27740
+    },
+    {
+      "epoch": 74.0,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002990226577086094,
+      "loss": 0.4662,
+      "step": 27750
+    },
+    {
+      "epoch": 74.0,
+      "eval_loss": 0.48103514313697815,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8958,
+      "eval_samples_per_second": 1.617,
+      "eval_steps_per_second": 0.101,
+      "step": 27750
+    },
+    {
+      "epoch": 74.02666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029902194101593435,
+      "loss": 0.486,
+      "step": 27760
+    },
+    {
+      "epoch": 74.05333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002990212240614369,
+      "loss": 0.4917,
+      "step": 27770
+    },
+    {
+      "epoch": 74.08,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029902050684511833,
+      "loss": 0.4784,
+      "step": 27780
+    },
+    {
+      "epoch": 74.10666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029901978936698007,
+      "loss": 0.4742,
+      "step": 27790
+    },
+    {
+      "epoch": 74.13333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002990190716270232,
+      "loss": 0.4755,
+      "step": 27800
+    },
+    {
+      "epoch": 74.16,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.000299018353625249,
+      "loss": 0.4799,
+      "step": 27810
+    },
+    {
+      "epoch": 74.18666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002990176353616588,
+      "loss": 0.4715,
+      "step": 27820
+    },
+    {
+      "epoch": 74.21333333333334,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002990169168362538,
+      "loss": 0.4669,
+      "step": 27830
+    },
+    {
+      "epoch": 74.24,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0002990161980490354,
+      "loss": 0.4707,
+      "step": 27840
+    },
+    {
+      "epoch": 74.26666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002990154790000047,
+      "loss": 0.4745,
+      "step": 27850
+    },
+    {
+      "epoch": 74.29333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029901475968916304,
+      "loss": 0.4691,
+      "step": 27860
+    },
+    {
+      "epoch": 74.32,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029901404011651163,
+      "loss": 0.4726,
+      "step": 27870
+    },
+    {
+      "epoch": 74.34666666666666,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.00029901332028205174,
+      "loss": 0.4807,
+      "step": 27880
+    },
+    {
+      "epoch": 74.37333333333333,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029901260018578473,
+      "loss": 0.4775,
+      "step": 27890
+    },
+    {
+      "epoch": 74.4,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002990118798277118,
+      "loss": 0.4734,
+      "step": 27900
+    },
+    {
+      "epoch": 74.42666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029901115920783416,
+      "loss": 0.4788,
+      "step": 27910
+    },
+    {
+      "epoch": 74.45333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029901043832615315,
+      "loss": 0.4817,
+      "step": 27920
+    },
+    {
+      "epoch": 74.48,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029900971718267,
+      "loss": 0.4876,
+      "step": 27930
+    },
+    {
+      "epoch": 74.50666666666666,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.000299008995777386,
+      "loss": 0.4771,
+      "step": 27940
+    },
+    {
+      "epoch": 74.53333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029900827411030246,
+      "loss": 0.4716,
+      "step": 27950
+    },
+    {
+      "epoch": 74.56,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002990075521814206,
+      "loss": 0.4721,
+      "step": 27960
+    },
+    {
+      "epoch": 74.58666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029900682999074165,
+      "loss": 0.4686,
+      "step": 27970
+    },
+    {
+      "epoch": 74.61333333333333,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.00029900610753826696,
+      "loss": 0.4546,
+      "step": 27980
+    },
+    {
+      "epoch": 74.64,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029900538482399773,
+      "loss": 0.4586,
+      "step": 27990
+    },
+    {
+      "epoch": 74.66666666666667,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.00029900466184793525,
+      "loss": 0.4742,
+      "step": 28000
+    },
+    {
+      "epoch": 74.69333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029900393861008077,
+      "loss": 0.4574,
+      "step": 28010
+    },
+    {
+      "epoch": 74.72,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029900321511043565,
+      "loss": 0.4757,
+      "step": 28020
+    },
+    {
+      "epoch": 74.74666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.000299002491349001,
+      "loss": 0.4787,
+      "step": 28030
+    },
+    {
+      "epoch": 74.77333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002990017673257783,
+      "loss": 0.4862,
+      "step": 28040
+    },
+    {
+      "epoch": 74.8,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002990010430407686,
+      "loss": 0.4734,
+      "step": 28050
+    },
+    {
+      "epoch": 74.82666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002990003184939734,
+      "loss": 0.4681,
+      "step": 28060
+    },
+    {
+      "epoch": 74.85333333333334,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002989995936853938,
+      "loss": 0.4771,
+      "step": 28070
+    },
+    {
+      "epoch": 74.88,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029899886861503117,
+      "loss": 0.4646,
+      "step": 28080
+    },
+    {
+      "epoch": 74.90666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029899814328288667,
+      "loss": 0.467,
+      "step": 28090
+    },
+    {
+      "epoch": 74.93333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029899741768896166,
+      "loss": 0.4691,
+      "step": 28100
+    },
+    {
+      "epoch": 74.96,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029899669183325745,
+      "loss": 0.4658,
+      "step": 28110
+    },
+    {
+      "epoch": 74.98666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002989959657157753,
+      "loss": 0.4805,
+      "step": 28120
+    },
+    {
+      "epoch": 75.0,
+      "eval_loss": 0.48107996582984924,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6286,
+      "eval_samples_per_second": 1.505,
+      "eval_steps_per_second": 0.094,
+      "step": 28125
+    },
+    {
+      "epoch": 75.01333333333334,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002989952393365164,
+      "loss": 0.47,
+      "step": 28130
+    },
+    {
+      "epoch": 75.04,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029899451269548206,
+      "loss": 0.4943,
+      "step": 28140
+    },
+    {
+      "epoch": 75.06666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029899378579267366,
+      "loss": 0.4818,
+      "step": 28150
+    },
+    {
+      "epoch": 75.09333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002989930586280923,
+      "loss": 0.4762,
+      "step": 28160
+    },
+    {
+      "epoch": 75.12,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002989923312017394,
+      "loss": 0.4724,
+      "step": 28170
+    },
+    {
+      "epoch": 75.14666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029899160351361617,
+      "loss": 0.4821,
+      "step": 28180
+    },
+    {
+      "epoch": 75.17333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029899087556372396,
+      "loss": 0.4723,
+      "step": 28190
+    },
+    {
+      "epoch": 75.2,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.000298990147352064,
+      "loss": 0.4707,
+      "step": 28200
+    },
+    {
+      "epoch": 75.22666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029898941887863744,
+      "loss": 0.4678,
+      "step": 28210
+    },
+    {
+      "epoch": 75.25333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002989886901434458,
+      "loss": 0.4683,
+      "step": 28220
+    },
+    {
+      "epoch": 75.28,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029898796114649027,
+      "loss": 0.4817,
+      "step": 28230
+    },
+    {
+      "epoch": 75.30666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002989872318877721,
+      "loss": 0.4631,
+      "step": 28240
+    },
+    {
+      "epoch": 75.33333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002989865023672925,
+      "loss": 0.4781,
+      "step": 28250
+    },
+    {
+      "epoch": 75.36,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029898577258505283,
+      "loss": 0.4802,
+      "step": 28260
+    },
+    {
+      "epoch": 75.38666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002989850425410545,
+      "loss": 0.4718,
+      "step": 28270
+    },
+    {
+      "epoch": 75.41333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002989843122352985,
+      "loss": 0.4762,
+      "step": 28280
+    },
+    {
+      "epoch": 75.44,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029898358166778637,
+      "loss": 0.4796,
+      "step": 28290
+    },
+    {
+      "epoch": 75.46666666666667,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002989828508385193,
+      "loss": 0.4885,
+      "step": 28300
+    },
+    {
+      "epoch": 75.49333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029898211974749864,
+      "loss": 0.4798,
+      "step": 28310
+    },
+    {
+      "epoch": 75.52,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002989813883947255,
+      "loss": 0.4743,
+      "step": 28320
+    },
+    {
+      "epoch": 75.54666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002989806567802013,
+      "loss": 0.4714,
+      "step": 28330
+    },
+    {
+      "epoch": 75.57333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029897992490392737,
+      "loss": 0.4708,
+      "step": 28340
+    },
+    {
+      "epoch": 75.6,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029897919276590485,
+      "loss": 0.4642,
+      "step": 28350
+    },
+    {
+      "epoch": 75.62666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029897846036613517,
+      "loss": 0.4508,
+      "step": 28360
+    },
+    {
+      "epoch": 75.65333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002989777277046195,
+      "loss": 0.4708,
+      "step": 28370
+    },
+    {
+      "epoch": 75.68,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029897699478135923,
+      "loss": 0.4623,
+      "step": 28380
+    },
+    {
+      "epoch": 75.70666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029897626159635553,
+      "loss": 0.4626,
+      "step": 28390
+    },
+    {
+      "epoch": 75.73333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029897552814960976,
+      "loss": 0.4814,
+      "step": 28400
+    },
+    {
+      "epoch": 75.76,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029897479444112327,
+      "loss": 0.4826,
+      "step": 28410
+    },
+    {
+      "epoch": 75.78666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002989740604708972,
+      "loss": 0.481,
+      "step": 28420
+    },
+    {
+      "epoch": 75.81333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029897332623893296,
+      "loss": 0.467,
+      "step": 28430
+    },
+    {
+      "epoch": 75.84,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002989725917452318,
+      "loss": 0.4723,
+      "step": 28440
+    },
+    {
+      "epoch": 75.86666666666666,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.000298971856989795,
+      "loss": 0.4717,
+      "step": 28450
+    },
+    {
+      "epoch": 75.89333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002989711219726239,
+      "loss": 0.4633,
+      "step": 28460
+    },
+    {
+      "epoch": 75.92,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002989703866937197,
+      "loss": 0.4723,
+      "step": 28470
+    },
+    {
+      "epoch": 75.94666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002989696511530838,
+      "loss": 0.4656,
+      "step": 28480
+    },
+    {
+      "epoch": 75.97333333333333,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002989689153507174,
+      "loss": 0.4776,
+      "step": 28490
+    },
+    {
+      "epoch": 76.0,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002989681792866219,
+      "loss": 0.4662,
+      "step": 28500
+    },
+    {
+      "epoch": 76.0,
+      "eval_loss": 0.4805218577384949,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9242,
+      "eval_samples_per_second": 1.465,
+      "eval_steps_per_second": 0.092,
+      "step": 28500
+    },
+    {
+      "epoch": 76.02666666666667,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029896744296079844,
+      "loss": 0.4858,
+      "step": 28510
+    },
+    {
+      "epoch": 76.05333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002989667063732484,
+      "loss": 0.4921,
+      "step": 28520
+    },
+    {
+      "epoch": 76.08,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029896596952397314,
+      "loss": 0.4785,
+      "step": 28530
+    },
+    {
+      "epoch": 76.10666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029896523241297384,
+      "loss": 0.4735,
+      "step": 28540
+    },
+    {
+      "epoch": 76.13333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002989644950402518,
+      "loss": 0.4754,
+      "step": 28550
+    },
+    {
+      "epoch": 76.16,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029896375740580844,
+      "loss": 0.4797,
+      "step": 28560
+    },
+    {
+      "epoch": 76.18666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029896301950964495,
+      "loss": 0.4723,
+      "step": 28570
+    },
+    {
+      "epoch": 76.21333333333334,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0002989622813517626,
+      "loss": 0.4666,
+      "step": 28580
+    },
+    {
+      "epoch": 76.24,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002989615429321628,
+      "loss": 0.47,
+      "step": 28590
+    },
+    {
+      "epoch": 76.26666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002989608042508467,
+      "loss": 0.4749,
+      "step": 28600
+    },
+    {
+      "epoch": 76.29333333333334,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029896006530781577,
+      "loss": 0.4691,
+      "step": 28610
+    },
+    {
+      "epoch": 76.32,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002989593261030712,
+      "loss": 0.4725,
+      "step": 28620
+    },
+    {
+      "epoch": 76.34666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002989585866366143,
+      "loss": 0.4805,
+      "step": 28630
+    },
+    {
+      "epoch": 76.37333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002989578469084464,
+      "loss": 0.4767,
+      "step": 28640
+    },
+    {
+      "epoch": 76.4,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002989571069185687,
+      "loss": 0.4728,
+      "step": 28650
+    },
+    {
+      "epoch": 76.42666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029895636666698265,
+      "loss": 0.4792,
+      "step": 28660
+    },
+    {
+      "epoch": 76.45333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029895562615368944,
+      "loss": 0.4813,
+      "step": 28670
+    },
+    {
+      "epoch": 76.48,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029895488537869043,
+      "loss": 0.4874,
+      "step": 28680
+    },
+    {
+      "epoch": 76.50666666666666,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029895414434198686,
+      "loss": 0.477,
+      "step": 28690
+    },
+    {
+      "epoch": 76.53333333333333,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00029895340304358014,
+      "loss": 0.4713,
+      "step": 28700
+    },
+    {
+      "epoch": 76.56,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029895266148347147,
+      "loss": 0.4719,
+      "step": 28710
+    },
+    {
+      "epoch": 76.58666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002989519196616622,
+      "loss": 0.4679,
+      "step": 28720
+    },
+    {
+      "epoch": 76.61333333333333,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0002989511775781536,
+      "loss": 0.4542,
+      "step": 28730
+    },
+    {
+      "epoch": 76.64,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000298950435232947,
+      "loss": 0.4587,
+      "step": 28740
+    },
+    {
+      "epoch": 76.66666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002989496926260437,
+      "loss": 0.4748,
+      "step": 28750
+    },
+    {
+      "epoch": 76.69333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029894894975744505,
+      "loss": 0.4576,
+      "step": 28760
+    },
+    {
+      "epoch": 76.72,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029894820662715224,
+      "loss": 0.4743,
+      "step": 28770
+    },
+    {
+      "epoch": 76.74666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002989474632351667,
+      "loss": 0.4789,
+      "step": 28780
+    },
+    {
+      "epoch": 76.77333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029894671958148963,
+      "loss": 0.486,
+      "step": 28790
+    },
+    {
+      "epoch": 76.8,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029894597566612244,
+      "loss": 0.473,
+      "step": 28800
+    },
+    {
+      "epoch": 76.82666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002989452314890663,
+      "loss": 0.4678,
+      "step": 28810
+    },
+    {
+      "epoch": 76.85333333333334,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00029894448705032264,
+      "loss": 0.4765,
+      "step": 28820
+    },
+    {
+      "epoch": 76.88,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002989437423498928,
+      "loss": 0.4644,
+      "step": 28830
+    },
+    {
+      "epoch": 76.90666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029894299738777796,
+      "loss": 0.4671,
+      "step": 28840
+    },
+    {
+      "epoch": 76.93333333333334,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002989422521639795,
+      "loss": 0.4691,
+      "step": 28850
+    },
+    {
+      "epoch": 76.96,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002989415066784987,
+      "loss": 0.4668,
+      "step": 28860
+    },
+    {
+      "epoch": 76.98666666666666,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002989407609313369,
+      "loss": 0.4813,
+      "step": 28870
+    },
+    {
+      "epoch": 77.0,
+      "eval_loss": 0.4802263081073761,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9162,
+      "eval_samples_per_second": 1.614,
+      "eval_steps_per_second": 0.101,
+      "step": 28875
+    },
+    {
+      "epoch": 77.01333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002989400149224954,
+      "loss": 0.4696,
+      "step": 28880
+    },
+    {
+      "epoch": 77.04,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029893926865197547,
+      "loss": 0.4937,
+      "step": 28890
+    },
+    {
+      "epoch": 77.06666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002989385221197785,
+      "loss": 0.4809,
+      "step": 28900
+    },
+    {
+      "epoch": 77.09333333333333,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029893777532590573,
+      "loss": 0.476,
+      "step": 28910
+    },
+    {
+      "epoch": 77.12,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029893702827035854,
+      "loss": 0.4722,
+      "step": 28920
+    },
+    {
+      "epoch": 77.14666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029893628095313817,
+      "loss": 0.482,
+      "step": 28930
+    },
+    {
+      "epoch": 77.17333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.000298935533374246,
+      "loss": 0.4718,
+      "step": 28940
+    },
+    {
+      "epoch": 77.2,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002989347855336833,
+      "loss": 0.4707,
+      "step": 28950
+    },
+    {
+      "epoch": 77.22666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002989340374314514,
+      "loss": 0.4683,
+      "step": 28960
+    },
+    {
+      "epoch": 77.25333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029893328906755156,
+      "loss": 0.4678,
+      "step": 28970
+    },
+    {
+      "epoch": 77.28,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002989325404419852,
+      "loss": 0.4817,
+      "step": 28980
+    },
+    {
+      "epoch": 77.30666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002989317915547535,
+      "loss": 0.4625,
+      "step": 28990
+    },
+    {
+      "epoch": 77.33333333333333,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002989310424058579,
+      "loss": 0.4777,
+      "step": 29000
+    },
+    {
+      "epoch": 77.36,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002989302929952997,
+      "loss": 0.4806,
+      "step": 29010
+    },
+    {
+      "epoch": 77.38666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029892954332308024,
+      "loss": 0.4721,
+      "step": 29020
+    },
+    {
+      "epoch": 77.41333333333333,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002989287933892007,
+      "loss": 0.4769,
+      "step": 29030
+    },
+    {
+      "epoch": 77.44,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029892804319366247,
+      "loss": 0.4787,
+      "step": 29040
+    },
+    {
+      "epoch": 77.46666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029892729273646695,
+      "loss": 0.4891,
+      "step": 29050
+    },
+    {
+      "epoch": 77.49333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029892654201761534,
+      "loss": 0.4801,
+      "step": 29060
+    },
+    {
+      "epoch": 77.52,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.000298925791037109,
+      "loss": 0.4739,
+      "step": 29070
+    },
+    {
+      "epoch": 77.54666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029892503979494925,
+      "loss": 0.4704,
+      "step": 29080
+    },
+    {
+      "epoch": 77.57333333333334,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0002989242882911375,
+      "loss": 0.4702,
+      "step": 29090
+    },
+    {
+      "epoch": 77.6,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002989235365256749,
+      "loss": 0.4638,
+      "step": 29100
+    },
+    {
+      "epoch": 77.62666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029892278449856287,
+      "loss": 0.4503,
+      "step": 29110
+    },
+    {
+      "epoch": 77.65333333333334,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002989220322098027,
+      "loss": 0.4705,
+      "step": 29120
+    },
+    {
+      "epoch": 77.68,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002989212796593958,
+      "loss": 0.4616,
+      "step": 29130
+    },
+    {
+      "epoch": 77.70666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029892052684734347,
+      "loss": 0.462,
+      "step": 29140
+    },
+    {
+      "epoch": 77.73333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029891977377364686,
+      "loss": 0.482,
+      "step": 29150
+    },
+    {
+      "epoch": 77.76,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002989190204383075,
+      "loss": 0.483,
+      "step": 29160
+    },
+    {
+      "epoch": 77.78666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029891826684132656,
+      "loss": 0.4815,
+      "step": 29170
+    },
+    {
+      "epoch": 77.81333333333333,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.00029891751298270547,
+      "loss": 0.4664,
+      "step": 29180
+    },
+    {
+      "epoch": 77.84,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002989167588624456,
+      "loss": 0.4723,
+      "step": 29190
+    },
+    {
+      "epoch": 77.86666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002989160044805481,
+      "loss": 0.4722,
+      "step": 29200
+    },
+    {
+      "epoch": 77.89333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002989152498370144,
+      "loss": 0.4622,
+      "step": 29210
+    },
+    {
+      "epoch": 77.92,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029891449493184584,
+      "loss": 0.4729,
+      "step": 29220
+    },
+    {
+      "epoch": 77.94666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029891373976504375,
+      "loss": 0.4653,
+      "step": 29230
+    },
+    {
+      "epoch": 77.97333333333333,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0002989129843366094,
+      "loss": 0.4763,
+      "step": 29240
+    },
+    {
+      "epoch": 78.0,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029891222864654416,
+      "loss": 0.4653,
+      "step": 29250
+    },
+    {
+      "epoch": 78.0,
+      "eval_loss": 0.4812386631965637,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6074,
+      "eval_samples_per_second": 1.508,
+      "eval_steps_per_second": 0.094,
+      "step": 29250
+    },
+    {
+      "epoch": 78.02666666666667,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002989114726948493,
+      "loss": 0.4855,
+      "step": 29260
+    },
+    {
+      "epoch": 78.05333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029891071648152623,
+      "loss": 0.4916,
+      "step": 29270
+    },
+    {
+      "epoch": 78.08,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002989099600065763,
+      "loss": 0.4788,
+      "step": 29280
+    },
+    {
+      "epoch": 78.10666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002989092032700007,
+      "loss": 0.4733,
+      "step": 29290
+    },
+    {
+      "epoch": 78.13333333333334,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.00029890844627180084,
+      "loss": 0.4754,
+      "step": 29300
+    },
+    {
+      "epoch": 78.16,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00029890768901197806,
+      "loss": 0.4797,
+      "step": 29310
+    },
+    {
+      "epoch": 78.18666666666667,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029890693149053373,
+      "loss": 0.4719,
+      "step": 29320
+    },
+    {
+      "epoch": 78.21333333333334,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.0002989061737074691,
+      "loss": 0.467,
+      "step": 29330
+    },
+    {
+      "epoch": 78.24,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002989054156627855,
+      "loss": 0.4708,
+      "step": 29340
+    },
+    {
+      "epoch": 78.26666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029890465735648436,
+      "loss": 0.475,
+      "step": 29350
+    },
+    {
+      "epoch": 78.29333333333334,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002989038987885669,
+      "loss": 0.4686,
+      "step": 29360
+    },
+    {
+      "epoch": 78.32,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029890313995903454,
+      "loss": 0.4722,
+      "step": 29370
+    },
+    {
+      "epoch": 78.34666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029890238086788853,
+      "loss": 0.4811,
+      "step": 29380
+    },
+    {
+      "epoch": 78.37333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002989016215151303,
+      "loss": 0.477,
+      "step": 29390
+    },
+    {
+      "epoch": 78.4,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002989008619007611,
+      "loss": 0.4726,
+      "step": 29400
+    },
+    {
+      "epoch": 78.42666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029890010202478234,
+      "loss": 0.479,
+      "step": 29410
+    },
+    {
+      "epoch": 78.45333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029889934188719524,
+      "loss": 0.4813,
+      "step": 29420
+    },
+    {
+      "epoch": 78.48,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002988985814880012,
+      "loss": 0.4875,
+      "step": 29430
+    },
+    {
+      "epoch": 78.50666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002988978208272016,
+      "loss": 0.4772,
+      "step": 29440
+    },
+    {
+      "epoch": 78.53333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002988970599047978,
+      "loss": 0.4712,
+      "step": 29450
+    },
+    {
+      "epoch": 78.56,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.000298896298720791,
+      "loss": 0.4723,
+      "step": 29460
+    },
+    {
+      "epoch": 78.58666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029889553727518265,
+      "loss": 0.4682,
+      "step": 29470
+    },
+    {
+      "epoch": 78.61333333333333,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.000298894775567974,
+      "loss": 0.4543,
+      "step": 29480
+    },
+    {
+      "epoch": 78.64,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002988940135991665,
+      "loss": 0.4586,
+      "step": 29490
+    },
+    {
+      "epoch": 78.66666666666667,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029889325136876136,
+      "loss": 0.4745,
+      "step": 29500
+    },
+    {
+      "epoch": 78.69333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002988924888767601,
+      "loss": 0.4574,
+      "step": 29510
+    },
+    {
+      "epoch": 78.72,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029889172612316384,
+      "loss": 0.4733,
+      "step": 29520
+    },
+    {
+      "epoch": 78.74666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002988909631079741,
+      "loss": 0.4785,
+      "step": 29530
+    },
+    {
+      "epoch": 78.77333333333333,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002988901998311921,
+      "loss": 0.4861,
+      "step": 29540
+    },
+    {
+      "epoch": 78.8,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029888943629281925,
+      "loss": 0.4731,
+      "step": 29550
+    },
+    {
+      "epoch": 78.82666666666667,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029888867249285684,
+      "loss": 0.4675,
+      "step": 29560
+    },
+    {
+      "epoch": 78.85333333333334,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029888790843130627,
+      "loss": 0.4764,
+      "step": 29570
+    },
+    {
+      "epoch": 78.88,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002988871441081688,
+      "loss": 0.4647,
+      "step": 29580
+    },
+    {
+      "epoch": 78.90666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002988863795234459,
+      "loss": 0.4674,
+      "step": 29590
+    },
+    {
+      "epoch": 78.93333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029888561467713887,
+      "loss": 0.469,
+      "step": 29600
+    },
+    {
+      "epoch": 78.96,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029888484956924896,
+      "loss": 0.4669,
+      "step": 29610
+    },
+    {
+      "epoch": 78.98666666666666,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002988840841997776,
+      "loss": 0.4804,
+      "step": 29620
+    },
+    {
+      "epoch": 79.0,
+      "eval_loss": 0.48059168457984924,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1909,
+      "eval_samples_per_second": 1.57,
+      "eval_steps_per_second": 0.098,
+      "step": 29625
+    },
+    {
+      "epoch": 79.01333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002988833185687261,
+      "loss": 0.4694,
+      "step": 29630
+    },
+    {
+      "epoch": 79.04,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002988825526760958,
+      "loss": 0.4937,
+      "step": 29640
+    },
+    {
+      "epoch": 79.06666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002988817865218881,
+      "loss": 0.4809,
+      "step": 29650
+    },
+    {
+      "epoch": 79.09333333333333,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002988810201061043,
+      "loss": 0.4764,
+      "step": 29660
+    },
+    {
+      "epoch": 79.12,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002988802534287458,
+      "loss": 0.4723,
+      "step": 29670
+    },
+    {
+      "epoch": 79.14666666666666,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002988794864898138,
+      "loss": 0.4817,
+      "step": 29680
+    },
+    {
+      "epoch": 79.17333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029887871928930984,
+      "loss": 0.4719,
+      "step": 29690
+    },
+    {
+      "epoch": 79.2,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029887795182723516,
+      "loss": 0.4698,
+      "step": 29700
+    },
+    {
+      "epoch": 79.22666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002988771841035911,
+      "loss": 0.4685,
+      "step": 29710
+    },
+    {
+      "epoch": 79.25333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029887641611837907,
+      "loss": 0.4675,
+      "step": 29720
+    },
+    {
+      "epoch": 79.28,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002988756478716004,
+      "loss": 0.4813,
+      "step": 29730
+    },
+    {
+      "epoch": 79.30666666666667,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002988748793632564,
+      "loss": 0.4627,
+      "step": 29740
+    },
+    {
+      "epoch": 79.33333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002988741105933484,
+      "loss": 0.4781,
+      "step": 29750
+    },
+    {
+      "epoch": 79.36,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029887334156187786,
+      "loss": 0.4806,
+      "step": 29760
+    },
+    {
+      "epoch": 79.38666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002988725722688461,
+      "loss": 0.4716,
+      "step": 29770
+    },
+    {
+      "epoch": 79.41333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029887180271425437,
+      "loss": 0.4762,
+      "step": 29780
+    },
+    {
+      "epoch": 79.44,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002988710328981041,
+      "loss": 0.4789,
+      "step": 29790
+    },
+    {
+      "epoch": 79.46666666666667,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029887026282039665,
+      "loss": 0.4893,
+      "step": 29800
+    },
+    {
+      "epoch": 79.49333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029886949248113335,
+      "loss": 0.4795,
+      "step": 29810
+    },
+    {
+      "epoch": 79.52,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002988687218803156,
+      "loss": 0.4743,
+      "step": 29820
+    },
+    {
+      "epoch": 79.54666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029886795101794465,
+      "loss": 0.471,
+      "step": 29830
+    },
+    {
+      "epoch": 79.57333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029886717989402196,
+      "loss": 0.4703,
+      "step": 29840
+    },
+    {
+      "epoch": 79.6,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029886640850854883,
+      "loss": 0.464,
+      "step": 29850
+    },
+    {
+      "epoch": 79.62666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029886563686152667,
+      "loss": 0.4506,
+      "step": 29860
+    },
+    {
+      "epoch": 79.65333333333334,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029886486495295673,
+      "loss": 0.4702,
+      "step": 29870
+    },
+    {
+      "epoch": 79.68,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002988640927828404,
+      "loss": 0.4616,
+      "step": 29880
+    },
+    {
+      "epoch": 79.70666666666666,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002988633203511792,
+      "loss": 0.4624,
+      "step": 29890
+    },
+    {
+      "epoch": 79.73333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029886254765797425,
+      "loss": 0.4812,
+      "step": 29900
+    },
+    {
+      "epoch": 79.76,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029886177470322706,
+      "loss": 0.4826,
+      "step": 29910
+    },
+    {
+      "epoch": 79.78666666666666,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029886100148693886,
+      "loss": 0.4811,
+      "step": 29920
+    },
+    {
+      "epoch": 79.81333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002988602280091112,
+      "loss": 0.4666,
+      "step": 29930
+    },
+    {
+      "epoch": 79.84,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029885945426974525,
+      "loss": 0.4721,
+      "step": 29940
+    },
+    {
+      "epoch": 79.86666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002988586802688425,
+      "loss": 0.4713,
+      "step": 29950
+    },
+    {
+      "epoch": 79.89333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029885790600640426,
+      "loss": 0.4634,
+      "step": 29960
+    },
+    {
+      "epoch": 79.92,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0002988571314824318,
+      "loss": 0.4728,
+      "step": 29970
+    },
+    {
+      "epoch": 79.94666666666667,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0002988563566969267,
+      "loss": 0.4658,
+      "step": 29980
+    },
+    {
+      "epoch": 79.97333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002988555816498901,
+      "loss": 0.4765,
+      "step": 29990
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029885480634132345,
+      "loss": 0.4661,
+      "step": 30000
+    },
+    {
+      "epoch": 80.0,
+      "eval_loss": 0.4805583655834198,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3038,
+      "eval_samples_per_second": 1.553,
+      "eval_steps_per_second": 0.097,
+      "step": 30000
+    },
+    {
+      "epoch": 80.02666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002988540307712281,
+      "loss": 0.4855,
+      "step": 30010
+    },
+    {
+      "epoch": 80.05333333333333,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029885325493960547,
+      "loss": 0.4912,
+      "step": 30020
+    },
+    {
+      "epoch": 80.08,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029885247884645685,
+      "loss": 0.4778,
+      "step": 30030
+    },
+    {
+      "epoch": 80.10666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002988517024917837,
+      "loss": 0.4733,
+      "step": 30040
+    },
+    {
+      "epoch": 80.13333333333334,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002988509258755872,
+      "loss": 0.4748,
+      "step": 30050
+    },
+    {
+      "epoch": 80.16,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002988501489978689,
+      "loss": 0.4796,
+      "step": 30060
+    },
+    {
+      "epoch": 80.18666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029884937185863016,
+      "loss": 0.4716,
+      "step": 30070
+    },
+    {
+      "epoch": 80.21333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002988485944578722,
+      "loss": 0.4665,
+      "step": 30080
+    },
+    {
+      "epoch": 80.24,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029884781679559647,
+      "loss": 0.4706,
+      "step": 30090
+    },
+    {
+      "epoch": 80.26666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029884703887180436,
+      "loss": 0.4743,
+      "step": 30100
+    },
+    {
+      "epoch": 80.29333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002988462606864972,
+      "loss": 0.4684,
+      "step": 30110
+    },
+    {
+      "epoch": 80.32,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029884548223967636,
+      "loss": 0.4722,
+      "step": 30120
+    },
+    {
+      "epoch": 80.34666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002988447035313432,
+      "loss": 0.4802,
+      "step": 30130
+    },
+    {
+      "epoch": 80.37333333333333,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029884392456149916,
+      "loss": 0.4759,
+      "step": 30140
+    },
+    {
+      "epoch": 80.4,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002988431453301455,
+      "loss": 0.4732,
+      "step": 30150
+    },
+    {
+      "epoch": 80.42666666666666,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.00029884236583728365,
+      "loss": 0.4792,
+      "step": 30160
+    },
+    {
+      "epoch": 80.45333333333333,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000298841586082915,
+      "loss": 0.485,
+      "step": 30170
+    },
+    {
+      "epoch": 80.48,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.00029884080606704084,
+      "loss": 0.4903,
+      "step": 30180
+    },
+    {
+      "epoch": 80.50666666666666,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0002988400257896626,
+      "loss": 0.4783,
+      "step": 30190
+    },
+    {
+      "epoch": 80.53333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029883924525078167,
+      "loss": 0.4707,
+      "step": 30200
+    },
+    {
+      "epoch": 80.56,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029883846445039936,
+      "loss": 0.4717,
+      "step": 30210
+    },
+    {
+      "epoch": 80.58666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029883768338851715,
+      "loss": 0.4685,
+      "step": 30220
+    },
+    {
+      "epoch": 80.61333333333333,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.00029883690206513626,
+      "loss": 0.4535,
+      "step": 30230
+    },
+    {
+      "epoch": 80.64,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029883612048025817,
+      "loss": 0.4585,
+      "step": 30240
+    },
+    {
+      "epoch": 80.66666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002988353386338842,
+      "loss": 0.4739,
+      "step": 30250
+    },
+    {
+      "epoch": 80.69333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029883455652601575,
+      "loss": 0.4573,
+      "step": 30260
+    },
+    {
+      "epoch": 80.72,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029883377415665423,
+      "loss": 0.474,
+      "step": 30270
+    },
+    {
+      "epoch": 80.74666666666667,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.00029883299152580093,
+      "loss": 0.4784,
+      "step": 30280
+    },
+    {
+      "epoch": 80.77333333333333,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002988322086334573,
+      "loss": 0.4862,
+      "step": 30290
+    },
+    {
+      "epoch": 80.8,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002988314254796246,
+      "loss": 0.4731,
+      "step": 30300
+    },
+    {
+      "epoch": 80.82666666666667,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.00029883064206430443,
+      "loss": 0.4674,
+      "step": 30310
+    },
+    {
+      "epoch": 80.85333333333334,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.000298829858387498,
+      "loss": 0.4767,
+      "step": 30320
+    },
+    {
+      "epoch": 80.88,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029882907444920664,
+      "loss": 0.4645,
+      "step": 30330
+    },
+    {
+      "epoch": 80.90666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029882829024943186,
+      "loss": 0.4676,
+      "step": 30340
+    },
+    {
+      "epoch": 80.93333333333334,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.00029882750578817495,
+      "loss": 0.4686,
+      "step": 30350
+    },
+    {
+      "epoch": 80.96,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002988267210654373,
+      "loss": 0.4661,
+      "step": 30360
+    },
+    {
+      "epoch": 80.98666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029882593608122036,
+      "loss": 0.4798,
+      "step": 30370
+    },
+    {
+      "epoch": 81.0,
+      "eval_loss": 0.4806881844997406,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3578,
+      "eval_samples_per_second": 1.545,
+      "eval_steps_per_second": 0.097,
+      "step": 30375
+    },
+    {
+      "epoch": 81.01333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029882515083552545,
+      "loss": 0.47,
+      "step": 30380
+    },
+    {
+      "epoch": 81.04,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.000298824365328354,
+      "loss": 0.4937,
+      "step": 30390
+    },
+    {
+      "epoch": 81.06666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029882357955970723,
+      "loss": 0.4813,
+      "step": 30400
+    },
+    {
+      "epoch": 81.09333333333333,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002988227935295867,
+      "loss": 0.4761,
+      "step": 30410
+    },
+    {
+      "epoch": 81.12,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0002988220072379937,
+      "loss": 0.4725,
+      "step": 30420
+    },
+    {
+      "epoch": 81.14666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002988212206849297,
+      "loss": 0.4818,
+      "step": 30430
+    },
+    {
+      "epoch": 81.17333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000298820433870396,
+      "loss": 0.4717,
+      "step": 30440
+    },
+    {
+      "epoch": 81.2,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029881964679439395,
+      "loss": 0.47,
+      "step": 30450
+    },
+    {
+      "epoch": 81.22666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002988188594569251,
+      "loss": 0.4676,
+      "step": 30460
+    },
+    {
+      "epoch": 81.25333333333333,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002988180718579906,
+      "loss": 0.4679,
+      "step": 30470
+    },
+    {
+      "epoch": 81.28,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.000298817283997592,
+      "loss": 0.4817,
+      "step": 30480
+    },
+    {
+      "epoch": 81.30666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029881649587573065,
+      "loss": 0.463,
+      "step": 30490
+    },
+    {
+      "epoch": 81.33333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002988157074924079,
+      "loss": 0.478,
+      "step": 30500
+    },
+    {
+      "epoch": 81.36,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002988149188476251,
+      "loss": 0.4796,
+      "step": 30510
+    },
+    {
+      "epoch": 81.38666666666667,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002988141299413838,
+      "loss": 0.4716,
+      "step": 30520
+    },
+    {
+      "epoch": 81.41333333333333,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.00029881334077368524,
+      "loss": 0.4765,
+      "step": 30530
+    },
+    {
+      "epoch": 81.44,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002988125513445309,
+      "loss": 0.4791,
+      "step": 30540
+    },
+    {
+      "epoch": 81.46666666666667,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029881176165392196,
+      "loss": 0.4884,
+      "step": 30550
+    },
+    {
+      "epoch": 81.49333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002988109717018601,
+      "loss": 0.4794,
+      "step": 30560
+    },
+    {
+      "epoch": 81.52,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029881018148834653,
+      "loss": 0.4741,
+      "step": 30570
+    },
+    {
+      "epoch": 81.54666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002988093910133826,
+      "loss": 0.4711,
+      "step": 30580
+    },
+    {
+      "epoch": 81.57333333333334,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002988086002769699,
+      "loss": 0.4702,
+      "step": 30590
+    },
+    {
+      "epoch": 81.6,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029880780927910966,
+      "loss": 0.4643,
+      "step": 30600
+    },
+    {
+      "epoch": 81.62666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029880701801980326,
+      "loss": 0.4513,
+      "step": 30610
+    },
+    {
+      "epoch": 81.65333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029880622649905214,
+      "loss": 0.4706,
+      "step": 30620
+    },
+    {
+      "epoch": 81.68,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002988054347168577,
+      "loss": 0.4626,
+      "step": 30630
+    },
+    {
+      "epoch": 81.70666666666666,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002988046426732214,
+      "loss": 0.4623,
+      "step": 30640
+    },
+    {
+      "epoch": 81.73333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029880385036814444,
+      "loss": 0.4812,
+      "step": 30650
+    },
+    {
+      "epoch": 81.76,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029880305780162833,
+      "loss": 0.4827,
+      "step": 30660
+    },
+    {
+      "epoch": 81.78666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029880226497367443,
+      "loss": 0.4811,
+      "step": 30670
+    },
+    {
+      "epoch": 81.81333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002988014718842842,
+      "loss": 0.4662,
+      "step": 30680
+    },
+    {
+      "epoch": 81.84,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000298800678533459,
+      "loss": 0.4723,
+      "step": 30690
+    },
+    {
+      "epoch": 81.86666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002987998849212002,
+      "loss": 0.4713,
+      "step": 30700
+    },
+    {
+      "epoch": 81.89333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029879909104750925,
+      "loss": 0.4626,
+      "step": 30710
+    },
+    {
+      "epoch": 81.92,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029879829691238744,
+      "loss": 0.4729,
+      "step": 30720
+    },
+    {
+      "epoch": 81.94666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029879750251583624,
+      "loss": 0.4649,
+      "step": 30730
+    },
+    {
+      "epoch": 81.97333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.000298796707857857,
+      "loss": 0.4773,
+      "step": 30740
+    },
+    {
+      "epoch": 82.0,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029879591293845116,
+      "loss": 0.4664,
+      "step": 30750
+    },
+    {
+      "epoch": 82.0,
+      "eval_loss": 0.4804117679595947,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1678,
+      "eval_samples_per_second": 1.574,
+      "eval_steps_per_second": 0.098,
+      "step": 30750
+    },
+    {
+      "epoch": 82.02666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029879511775762014,
+      "loss": 0.4853,
+      "step": 30760
+    },
+    {
+      "epoch": 82.05333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029879432231536533,
+      "loss": 0.4913,
+      "step": 30770
+    },
+    {
+      "epoch": 82.08,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029879352661168806,
+      "loss": 0.4789,
+      "step": 30780
+    },
+    {
+      "epoch": 82.10666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029879273064658976,
+      "loss": 0.4736,
+      "step": 30790
+    },
+    {
+      "epoch": 82.13333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029879193442007183,
+      "loss": 0.4739,
+      "step": 30800
+    },
+    {
+      "epoch": 82.16,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002987911379321357,
+      "loss": 0.4794,
+      "step": 30810
+    },
+    {
+      "epoch": 82.18666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002987903411827827,
+      "loss": 0.4711,
+      "step": 30820
+    },
+    {
+      "epoch": 82.21333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029878954417201435,
+      "loss": 0.4663,
+      "step": 30830
+    },
+    {
+      "epoch": 82.24,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029878874689983194,
+      "loss": 0.4707,
+      "step": 30840
+    },
+    {
+      "epoch": 82.26666666666667,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002987879493662369,
+      "loss": 0.4751,
+      "step": 30850
+    },
+    {
+      "epoch": 82.29333333333334,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002987871515712306,
+      "loss": 0.4685,
+      "step": 30860
+    },
+    {
+      "epoch": 82.32,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029878635351481453,
+      "loss": 0.4719,
+      "step": 30870
+    },
+    {
+      "epoch": 82.34666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029878555519699003,
+      "loss": 0.4801,
+      "step": 30880
+    },
+    {
+      "epoch": 82.37333333333333,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029878475661775854,
+      "loss": 0.4753,
+      "step": 30890
+    },
+    {
+      "epoch": 82.4,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002987839577771214,
+      "loss": 0.4728,
+      "step": 30900
+    },
+    {
+      "epoch": 82.42666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029878315867508,
+      "loss": 0.4793,
+      "step": 30910
+    },
+    {
+      "epoch": 82.45333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002987823593116359,
+      "loss": 0.4804,
+      "step": 30920
+    },
+    {
+      "epoch": 82.48,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002987815596867903,
+      "loss": 0.4876,
+      "step": 30930
+    },
+    {
+      "epoch": 82.50666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002987807598005448,
+      "loss": 0.4765,
+      "step": 30940
+    },
+    {
+      "epoch": 82.53333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029877995965290065,
+      "loss": 0.4707,
+      "step": 30950
+    },
+    {
+      "epoch": 82.56,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002987791592438593,
+      "loss": 0.4719,
+      "step": 30960
+    },
+    {
+      "epoch": 82.58666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002987783585734222,
+      "loss": 0.4677,
+      "step": 30970
+    },
+    {
+      "epoch": 82.61333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029877755764159074,
+      "loss": 0.4539,
+      "step": 30980
+    },
+    {
+      "epoch": 82.64,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029877675644836624,
+      "loss": 0.4587,
+      "step": 30990
+    },
+    {
+      "epoch": 82.66666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002987759549937503,
+      "loss": 0.4742,
+      "step": 31000
+    },
+    {
+      "epoch": 82.69333333333333,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002987751532777441,
+      "loss": 0.457,
+      "step": 31010
+    },
+    {
+      "epoch": 82.72,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029877435130034924,
+      "loss": 0.4747,
+      "step": 31020
+    },
+    {
+      "epoch": 82.74666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.000298773549061567,
+      "loss": 0.4779,
+      "step": 31030
+    },
+    {
+      "epoch": 82.77333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029877274656139885,
+      "loss": 0.4861,
+      "step": 31040
+    },
+    {
+      "epoch": 82.8,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002987719437998462,
+      "loss": 0.4733,
+      "step": 31050
+    },
+    {
+      "epoch": 82.82666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002987711407769104,
+      "loss": 0.4674,
+      "step": 31060
+    },
+    {
+      "epoch": 82.85333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029877033749259297,
+      "loss": 0.4761,
+      "step": 31070
+    },
+    {
+      "epoch": 82.88,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002987695339468952,
+      "loss": 0.4643,
+      "step": 31080
+    },
+    {
+      "epoch": 82.90666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029876873013981865,
+      "loss": 0.4669,
+      "step": 31090
+    },
+    {
+      "epoch": 82.93333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002987679260713645,
+      "loss": 0.4685,
+      "step": 31100
+    },
+    {
+      "epoch": 82.96,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029876712174153446,
+      "loss": 0.4668,
+      "step": 31110
+    },
+    {
+      "epoch": 82.98666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002987663171503297,
+      "loss": 0.4802,
+      "step": 31120
+    },
+    {
+      "epoch": 83.0,
+      "eval_loss": 0.48024803400039673,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8308,
+      "eval_samples_per_second": 1.628,
+      "eval_steps_per_second": 0.102,
+      "step": 31125
+    },
+    {
+      "epoch": 83.01333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002987655122977518,
+      "loss": 0.4696,
+      "step": 31130
+    },
+    {
+      "epoch": 83.04,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.000298764707183802,
+      "loss": 0.4939,
+      "step": 31140
+    },
+    {
+      "epoch": 83.06666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002987639018084819,
+      "loss": 0.4816,
+      "step": 31150
+    },
+    {
+      "epoch": 83.09333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029876309617179276,
+      "loss": 0.4755,
+      "step": 31160
+    },
+    {
+      "epoch": 83.12,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002987622902737361,
+      "loss": 0.4729,
+      "step": 31170
+    },
+    {
+      "epoch": 83.14666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002987614841143133,
+      "loss": 0.4816,
+      "step": 31180
+    },
+    {
+      "epoch": 83.17333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002987606776935257,
+      "loss": 0.472,
+      "step": 31190
+    },
+    {
+      "epoch": 83.2,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029875987101137485,
+      "loss": 0.47,
+      "step": 31200
+    },
+    {
+      "epoch": 83.22666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029875906406786213,
+      "loss": 0.4682,
+      "step": 31210
+    },
+    {
+      "epoch": 83.25333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029875825686298893,
+      "loss": 0.4684,
+      "step": 31220
+    },
+    {
+      "epoch": 83.28,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029875744939675665,
+      "loss": 0.4813,
+      "step": 31230
+    },
+    {
+      "epoch": 83.30666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029875664166916676,
+      "loss": 0.4621,
+      "step": 31240
+    },
+    {
+      "epoch": 83.33333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002987558336802206,
+      "loss": 0.4775,
+      "step": 31250
+    },
+    {
+      "epoch": 83.36,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002987550254299197,
+      "loss": 0.4795,
+      "step": 31260
+    },
+    {
+      "epoch": 83.38666666666667,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029875421691826545,
+      "loss": 0.4714,
+      "step": 31270
+    },
+    {
+      "epoch": 83.41333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002987534081452592,
+      "loss": 0.4755,
+      "step": 31280
+    },
+    {
+      "epoch": 83.44,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002987525991109024,
+      "loss": 0.4789,
+      "step": 31290
+    },
+    {
+      "epoch": 83.46666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029875178981519647,
+      "loss": 0.4879,
+      "step": 31300
+    },
+    {
+      "epoch": 83.49333333333334,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.00029875098025814286,
+      "loss": 0.4793,
+      "step": 31310
+    },
+    {
+      "epoch": 83.52,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000298750170439743,
+      "loss": 0.4736,
+      "step": 31320
+    },
+    {
+      "epoch": 83.54666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002987493603599983,
+      "loss": 0.4708,
+      "step": 31330
+    },
+    {
+      "epoch": 83.57333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029874855001891013,
+      "loss": 0.4702,
+      "step": 31340
+    },
+    {
+      "epoch": 83.6,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029874773941648,
+      "loss": 0.4641,
+      "step": 31350
+    },
+    {
+      "epoch": 83.62666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002987469285527093,
+      "loss": 0.4512,
+      "step": 31360
+    },
+    {
+      "epoch": 83.65333333333334,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002987461174275994,
+      "loss": 0.4703,
+      "step": 31370
+    },
+    {
+      "epoch": 83.68,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002987453060411518,
+      "loss": 0.4619,
+      "step": 31380
+    },
+    {
+      "epoch": 83.70666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002987444943933679,
+      "loss": 0.4629,
+      "step": 31390
+    },
+    {
+      "epoch": 83.73333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002987436824842491,
+      "loss": 0.4817,
+      "step": 31400
+    },
+    {
+      "epoch": 83.76,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002987428703137969,
+      "loss": 0.4827,
+      "step": 31410
+    },
+    {
+      "epoch": 83.78666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029874205788201267,
+      "loss": 0.4816,
+      "step": 31420
+    },
+    {
+      "epoch": 83.81333333333333,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002987412451888978,
+      "loss": 0.4666,
+      "step": 31430
+    },
+    {
+      "epoch": 83.84,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002987404322344538,
+      "loss": 0.4718,
+      "step": 31440
+    },
+    {
+      "epoch": 83.86666666666666,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029873961901868203,
+      "loss": 0.4709,
+      "step": 31450
+    },
+    {
+      "epoch": 83.89333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000298738805541584,
+      "loss": 0.4624,
+      "step": 31460
+    },
+    {
+      "epoch": 83.92,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029873799180316107,
+      "loss": 0.4729,
+      "step": 31470
+    },
+    {
+      "epoch": 83.94666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029873717780341466,
+      "loss": 0.4648,
+      "step": 31480
+    },
+    {
+      "epoch": 83.97333333333333,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.00029873636354234626,
+      "loss": 0.4776,
+      "step": 31490
+    },
+    {
+      "epoch": 84.0,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029873554901995727,
+      "loss": 0.4658,
+      "step": 31500
+    },
+    {
+      "epoch": 84.0,
+      "eval_loss": 0.48183757066726685,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.5507,
+      "eval_samples_per_second": 1.675,
+      "eval_steps_per_second": 0.105,
+      "step": 31500
+    },
+    {
+      "epoch": 84.02666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029873473423624905,
+      "loss": 0.4854,
+      "step": 31510
+    },
+    {
+      "epoch": 84.05333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002987339191912232,
+      "loss": 0.4905,
+      "step": 31520
+    },
+    {
+      "epoch": 84.08,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029873310388488103,
+      "loss": 0.4782,
+      "step": 31530
+    },
+    {
+      "epoch": 84.10666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000298732288317224,
+      "loss": 0.4729,
+      "step": 31540
+    },
+    {
+      "epoch": 84.13333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002987314724882535,
+      "loss": 0.475,
+      "step": 31550
+    },
+    {
+      "epoch": 84.16,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.000298730656397971,
+      "loss": 0.4785,
+      "step": 31560
+    },
+    {
+      "epoch": 84.18666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000298729840046378,
+      "loss": 0.4714,
+      "step": 31570
+    },
+    {
+      "epoch": 84.21333333333334,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029872902343347583,
+      "loss": 0.4656,
+      "step": 31580
+    },
+    {
+      "epoch": 84.24,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000298728206559266,
+      "loss": 0.4699,
+      "step": 31590
+    },
+    {
+      "epoch": 84.26666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029872738942374985,
+      "loss": 0.4742,
+      "step": 31600
+    },
+    {
+      "epoch": 84.29333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002987265720269289,
+      "loss": 0.4685,
+      "step": 31610
+    },
+    {
+      "epoch": 84.32,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029872575436880463,
+      "loss": 0.4715,
+      "step": 31620
+    },
+    {
+      "epoch": 84.34666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002987249364493783,
+      "loss": 0.4804,
+      "step": 31630
+    },
+    {
+      "epoch": 84.37333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029872411826865154,
+      "loss": 0.4758,
+      "step": 31640
+    },
+    {
+      "epoch": 84.4,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00029872329982662564,
+      "loss": 0.472,
+      "step": 31650
+    },
+    {
+      "epoch": 84.42666666666666,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002987224811233022,
+      "loss": 0.4787,
+      "step": 31660
+    },
+    {
+      "epoch": 84.45333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002987216621586825,
+      "loss": 0.4805,
+      "step": 31670
+    },
+    {
+      "epoch": 84.48,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000298720842932768,
+      "loss": 0.4872,
+      "step": 31680
+    },
+    {
+      "epoch": 84.50666666666666,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002987200234455602,
+      "loss": 0.4768,
+      "step": 31690
+    },
+    {
+      "epoch": 84.53333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029871920369706055,
+      "loss": 0.4707,
+      "step": 31700
+    },
+    {
+      "epoch": 84.56,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029871838368727044,
+      "loss": 0.4719,
+      "step": 31710
+    },
+    {
+      "epoch": 84.58666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029871756341619135,
+      "loss": 0.4668,
+      "step": 31720
+    },
+    {
+      "epoch": 84.61333333333333,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002987167428838247,
+      "loss": 0.453,
+      "step": 31730
+    },
+    {
+      "epoch": 84.64,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029871592209017187,
+      "loss": 0.4585,
+      "step": 31740
+    },
+    {
+      "epoch": 84.66666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002987151010352344,
+      "loss": 0.4739,
+      "step": 31750
+    },
+    {
+      "epoch": 84.69333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002987142797190137,
+      "loss": 0.4565,
+      "step": 31760
+    },
+    {
+      "epoch": 84.72,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029871345814151126,
+      "loss": 0.4742,
+      "step": 31770
+    },
+    {
+      "epoch": 84.74666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002987126363027284,
+      "loss": 0.4786,
+      "step": 31780
+    },
+    {
+      "epoch": 84.77333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029871181420266667,
+      "loss": 0.486,
+      "step": 31790
+    },
+    {
+      "epoch": 84.8,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029871099184132743,
+      "loss": 0.4731,
+      "step": 31800
+    },
+    {
+      "epoch": 84.82666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002987101692187123,
+      "loss": 0.4663,
+      "step": 31810
+    },
+    {
+      "epoch": 84.85333333333334,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002987093463348225,
+      "loss": 0.4756,
+      "step": 31820
+    },
+    {
+      "epoch": 84.88,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002987085231896596,
+      "loss": 0.4643,
+      "step": 31830
+    },
+    {
+      "epoch": 84.90666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000298707699783225,
+      "loss": 0.4665,
+      "step": 31840
+    },
+    {
+      "epoch": 84.93333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029870687611552023,
+      "loss": 0.4678,
+      "step": 31850
+    },
+    {
+      "epoch": 84.96,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002987060521865466,
+      "loss": 0.4666,
+      "step": 31860
+    },
+    {
+      "epoch": 84.98666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002987052279963057,
+      "loss": 0.4805,
+      "step": 31870
+    },
+    {
+      "epoch": 85.0,
+      "eval_loss": 0.48072928190231323,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7018,
+      "eval_samples_per_second": 1.649,
+      "eval_steps_per_second": 0.103,
+      "step": 31875
+    },
+    {
+      "epoch": 85.01333333333334,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.00029870440354479893,
+      "loss": 0.4689,
+      "step": 31880
+    },
+    {
+      "epoch": 85.04,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029870357883202765,
+      "loss": 0.4933,
+      "step": 31890
+    },
+    {
+      "epoch": 85.06666666666666,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002987027538579934,
+      "loss": 0.4812,
+      "step": 31900
+    },
+    {
+      "epoch": 85.09333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002987019286226977,
+      "loss": 0.4754,
+      "step": 31910
+    },
+    {
+      "epoch": 85.12,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0002987011031261418,
+      "loss": 0.4721,
+      "step": 31920
+    },
+    {
+      "epoch": 85.14666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002987002773683273,
+      "loss": 0.4811,
+      "step": 31930
+    },
+    {
+      "epoch": 85.17333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029869945134925564,
+      "loss": 0.4715,
+      "step": 31940
+    },
+    {
+      "epoch": 85.2,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029869862506892817,
+      "loss": 0.4698,
+      "step": 31950
+    },
+    {
+      "epoch": 85.22666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029869779852734646,
+      "loss": 0.4683,
+      "step": 31960
+    },
+    {
+      "epoch": 85.25333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002986969717245119,
+      "loss": 0.467,
+      "step": 31970
+    },
+    {
+      "epoch": 85.28,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000298696144660426,
+      "loss": 0.4812,
+      "step": 31980
+    },
+    {
+      "epoch": 85.30666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002986953173350901,
+      "loss": 0.4616,
+      "step": 31990
+    },
+    {
+      "epoch": 85.33333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029869448974850576,
+      "loss": 0.4779,
+      "step": 32000
+    },
+    {
+      "epoch": 85.36,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0002986936619006744,
+      "loss": 0.4797,
+      "step": 32010
+    },
+    {
+      "epoch": 85.38666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002986928337915975,
+      "loss": 0.4714,
+      "step": 32020
+    },
+    {
+      "epoch": 85.41333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029869200542127647,
+      "loss": 0.4752,
+      "step": 32030
+    },
+    {
+      "epoch": 85.44,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.00029869117678971277,
+      "loss": 0.4787,
+      "step": 32040
+    },
+    {
+      "epoch": 85.46666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029869034789690784,
+      "loss": 0.4874,
+      "step": 32050
+    },
+    {
+      "epoch": 85.49333333333334,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029868951874286327,
+      "loss": 0.4802,
+      "step": 32060
+    },
+    {
+      "epoch": 85.52,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029868868932758034,
+      "loss": 0.474,
+      "step": 32070
+    },
+    {
+      "epoch": 85.54666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002986878596510606,
+      "loss": 0.4711,
+      "step": 32080
+    },
+    {
+      "epoch": 85.57333333333334,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002986870297133055,
+      "loss": 0.4699,
+      "step": 32090
+    },
+    {
+      "epoch": 85.6,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002986861995143164,
+      "loss": 0.4635,
+      "step": 32100
+    },
+    {
+      "epoch": 85.62666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000298685369054095,
+      "loss": 0.4502,
+      "step": 32110
+    },
+    {
+      "epoch": 85.65333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002986845383326425,
+      "loss": 0.4706,
+      "step": 32120
+    },
+    {
+      "epoch": 85.68,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029868370734996045,
+      "loss": 0.4617,
+      "step": 32130
+    },
+    {
+      "epoch": 85.70666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002986828761060504,
+      "loss": 0.4619,
+      "step": 32140
+    },
+    {
+      "epoch": 85.73333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002986820446009136,
+      "loss": 0.4807,
+      "step": 32150
+    },
+    {
+      "epoch": 85.76,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029868121283455176,
+      "loss": 0.4818,
+      "step": 32160
+    },
+    {
+      "epoch": 85.78666666666666,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002986803808069662,
+      "loss": 0.4811,
+      "step": 32170
+    },
+    {
+      "epoch": 85.81333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002986795485181584,
+      "loss": 0.4663,
+      "step": 32180
+    },
+    {
+      "epoch": 85.84,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029867871596812986,
+      "loss": 0.4723,
+      "step": 32190
+    },
+    {
+      "epoch": 85.86666666666666,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.000298677883156882,
+      "loss": 0.4709,
+      "step": 32200
+    },
+    {
+      "epoch": 85.89333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029867705008441634,
+      "loss": 0.4625,
+      "step": 32210
+    },
+    {
+      "epoch": 85.92,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029867621675073417,
+      "loss": 0.4726,
+      "step": 32220
+    },
+    {
+      "epoch": 85.94666666666667,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002986753831558372,
+      "loss": 0.4655,
+      "step": 32230
+    },
+    {
+      "epoch": 85.97333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002986745492997267,
+      "loss": 0.4775,
+      "step": 32240
+    },
+    {
+      "epoch": 86.0,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002986737151824043,
+      "loss": 0.4658,
+      "step": 32250
+    },
+    {
+      "epoch": 86.0,
+      "eval_loss": 0.48104819655418396,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7417,
+      "eval_samples_per_second": 1.49,
+      "eval_steps_per_second": 0.093,
+      "step": 32250
+    },
+    {
+      "epoch": 86.02666666666667,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002986728808038713,
+      "loss": 0.4851,
+      "step": 32260
+    },
+    {
+      "epoch": 86.05333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002986720461641293,
+      "loss": 0.491,
+      "step": 32270
+    },
+    {
+      "epoch": 86.08,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029867121126317967,
+      "loss": 0.4782,
+      "step": 32280
+    },
+    {
+      "epoch": 86.10666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.000298670376101024,
+      "loss": 0.4731,
+      "step": 32290
+    },
+    {
+      "epoch": 86.13333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029866954067766355,
+      "loss": 0.4752,
+      "step": 32300
+    },
+    {
+      "epoch": 86.16,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002986687049931,
+      "loss": 0.4786,
+      "step": 32310
+    },
+    {
+      "epoch": 86.18666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002986678690473347,
+      "loss": 0.4708,
+      "step": 32320
+    },
+    {
+      "epoch": 86.21333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029866703284036917,
+      "loss": 0.4655,
+      "step": 32330
+    },
+    {
+      "epoch": 86.24,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029866619637220485,
+      "loss": 0.47,
+      "step": 32340
+    },
+    {
+      "epoch": 86.26666666666667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002986653596428432,
+      "loss": 0.4741,
+      "step": 32350
+    },
+    {
+      "epoch": 86.29333333333334,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029866452265228573,
+      "loss": 0.4685,
+      "step": 32360
+    },
+    {
+      "epoch": 86.32,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002986636854005339,
+      "loss": 0.4712,
+      "step": 32370
+    },
+    {
+      "epoch": 86.34666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002986628478875892,
+      "loss": 0.4796,
+      "step": 32380
+    },
+    {
+      "epoch": 86.37333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.000298662010113453,
+      "loss": 0.4762,
+      "step": 32390
+    },
+    {
+      "epoch": 86.4,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002986611720781269,
+      "loss": 0.4724,
+      "step": 32400
+    },
+    {
+      "epoch": 86.42666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002986603337816123,
+      "loss": 0.4788,
+      "step": 32410
+    },
+    {
+      "epoch": 86.45333333333333,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029865949522391065,
+      "loss": 0.4804,
+      "step": 32420
+    },
+    {
+      "epoch": 86.48,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029865865640502353,
+      "loss": 0.4869,
+      "step": 32430
+    },
+    {
+      "epoch": 86.50666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029865781732495226,
+      "loss": 0.4764,
+      "step": 32440
+    },
+    {
+      "epoch": 86.53333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002986569779836985,
+      "loss": 0.4705,
+      "step": 32450
+    },
+    {
+      "epoch": 86.56,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029865613838126354,
+      "loss": 0.4715,
+      "step": 32460
+    },
+    {
+      "epoch": 86.58666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000298655298517649,
+      "loss": 0.467,
+      "step": 32470
+    },
+    {
+      "epoch": 86.61333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002986544583928563,
+      "loss": 0.4536,
+      "step": 32480
+    },
+    {
+      "epoch": 86.64,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002986536180068869,
+      "loss": 0.4579,
+      "step": 32490
+    },
+    {
+      "epoch": 86.66666666666667,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002986527773597422,
+      "loss": 0.4735,
+      "step": 32500
+    },
+    {
+      "epoch": 86.69333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002986519364514239,
+      "loss": 0.4572,
+      "step": 32510
+    },
+    {
+      "epoch": 86.72,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029865109528193326,
+      "loss": 0.4737,
+      "step": 32520
+    },
+    {
+      "epoch": 86.74666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029865025385127184,
+      "loss": 0.4781,
+      "step": 32530
+    },
+    {
+      "epoch": 86.77333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002986494121594412,
+      "loss": 0.4864,
+      "step": 32540
+    },
+    {
+      "epoch": 86.8,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029864857020644265,
+      "loss": 0.4727,
+      "step": 32550
+    },
+    {
+      "epoch": 86.82666666666667,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002986477279922778,
+      "loss": 0.4676,
+      "step": 32560
+    },
+    {
+      "epoch": 86.85333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002986468855169481,
+      "loss": 0.4765,
+      "step": 32570
+    },
+    {
+      "epoch": 86.88,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000298646042780455,
+      "loss": 0.4636,
+      "step": 32580
+    },
+    {
+      "epoch": 86.90666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002986451997828,
+      "loss": 0.4672,
+      "step": 32590
+    },
+    {
+      "epoch": 86.93333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002986443565239845,
+      "loss": 0.4687,
+      "step": 32600
+    },
+    {
+      "epoch": 86.96,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029864351300401017,
+      "loss": 0.466,
+      "step": 32610
+    },
+    {
+      "epoch": 86.98666666666666,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029864266922287834,
+      "loss": 0.4798,
+      "step": 32620
+    },
+    {
+      "epoch": 87.0,
+      "eval_loss": 0.4798685312271118,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9855,
+      "eval_samples_per_second": 1.602,
+      "eval_steps_per_second": 0.1,
+      "step": 32625
+    },
+    {
+      "epoch": 87.01333333333334,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029864182518059057,
+      "loss": 0.4695,
+      "step": 32630
+    },
+    {
+      "epoch": 87.04,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029864098087714825,
+      "loss": 0.4932,
+      "step": 32640
+    },
+    {
+      "epoch": 87.06666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029864013631255296,
+      "loss": 0.4814,
+      "step": 32650
+    },
+    {
+      "epoch": 87.09333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029863929148680617,
+      "loss": 0.4764,
+      "step": 32660
+    },
+    {
+      "epoch": 87.12,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002986384463999093,
+      "loss": 0.4726,
+      "step": 32670
+    },
+    {
+      "epoch": 87.14666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029863760105186386,
+      "loss": 0.4808,
+      "step": 32680
+    },
+    {
+      "epoch": 87.17333333333333,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029863675544267133,
+      "loss": 0.4709,
+      "step": 32690
+    },
+    {
+      "epoch": 87.2,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029863590957233325,
+      "loss": 0.4694,
+      "step": 32700
+    },
+    {
+      "epoch": 87.22666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002986350634408511,
+      "loss": 0.4674,
+      "step": 32710
+    },
+    {
+      "epoch": 87.25333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029863421704822633,
+      "loss": 0.4673,
+      "step": 32720
+    },
+    {
+      "epoch": 87.28,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029863337039446035,
+      "loss": 0.4809,
+      "step": 32730
+    },
+    {
+      "epoch": 87.30666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029863252347955485,
+      "loss": 0.462,
+      "step": 32740
+    },
+    {
+      "epoch": 87.33333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002986316763035112,
+      "loss": 0.4772,
+      "step": 32750
+    },
+    {
+      "epoch": 87.36,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002986308288663308,
+      "loss": 0.4793,
+      "step": 32760
+    },
+    {
+      "epoch": 87.38666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002986299811680153,
+      "loss": 0.4715,
+      "step": 32770
+    },
+    {
+      "epoch": 87.41333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029862913320856605,
+      "loss": 0.4755,
+      "step": 32780
+    },
+    {
+      "epoch": 87.44,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029862828498798466,
+      "loss": 0.4785,
+      "step": 32790
+    },
+    {
+      "epoch": 87.46666666666667,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002986274365062725,
+      "loss": 0.4883,
+      "step": 32800
+    },
+    {
+      "epoch": 87.49333333333334,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002986265877634312,
+      "loss": 0.4802,
+      "step": 32810
+    },
+    {
+      "epoch": 87.52,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00029862573875946214,
+      "loss": 0.4742,
+      "step": 32820
+    },
+    {
+      "epoch": 87.54666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029862488949436694,
+      "loss": 0.4708,
+      "step": 32830
+    },
+    {
+      "epoch": 87.57333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002986240399681469,
+      "loss": 0.4696,
+      "step": 32840
+    },
+    {
+      "epoch": 87.6,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029862319018080364,
+      "loss": 0.4636,
+      "step": 32850
+    },
+    {
+      "epoch": 87.62666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029862234013233865,
+      "loss": 0.451,
+      "step": 32860
+    },
+    {
+      "epoch": 87.65333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002986214898227534,
+      "loss": 0.4692,
+      "step": 32870
+    },
+    {
+      "epoch": 87.68,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029862063925204934,
+      "loss": 0.4608,
+      "step": 32880
+    },
+    {
+      "epoch": 87.70666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000298619788420228,
+      "loss": 0.462,
+      "step": 32890
+    },
+    {
+      "epoch": 87.73333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000298618937327291,
+      "loss": 0.4811,
+      "step": 32900
+    },
+    {
+      "epoch": 87.76,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002986180859732396,
+      "loss": 0.4822,
+      "step": 32910
+    },
+    {
+      "epoch": 87.78666666666666,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.00029861723435807547,
+      "loss": 0.4806,
+      "step": 32920
+    },
+    {
+      "epoch": 87.81333333333333,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029861638248180006,
+      "loss": 0.4659,
+      "step": 32930
+    },
+    {
+      "epoch": 87.84,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029861553034441483,
+      "loss": 0.4715,
+      "step": 32940
+    },
+    {
+      "epoch": 87.86666666666666,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002986146779459213,
+      "loss": 0.4707,
+      "step": 32950
+    },
+    {
+      "epoch": 87.89333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029861382528632103,
+      "loss": 0.4624,
+      "step": 32960
+    },
+    {
+      "epoch": 87.92,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029861297236561545,
+      "loss": 0.4724,
+      "step": 32970
+    },
+    {
+      "epoch": 87.94666666666667,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.000298612119183806,
+      "loss": 0.465,
+      "step": 32980
+    },
+    {
+      "epoch": 87.97333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002986112657408943,
+      "loss": 0.4768,
+      "step": 32990
+    },
+    {
+      "epoch": 88.0,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0002986104120368818,
+      "loss": 0.4652,
+      "step": 33000
+    },
+    {
+      "epoch": 88.0,
+      "eval_loss": 0.4803306758403778,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9873,
+      "eval_samples_per_second": 1.602,
+      "eval_steps_per_second": 0.1,
+      "step": 33000
+    },
+    {
+      "epoch": 88.02666666666667,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029860955807177003,
+      "loss": 0.4856,
+      "step": 33010
+    },
+    {
+      "epoch": 88.05333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002986087038455604,
+      "loss": 0.4915,
+      "step": 33020
+    },
+    {
+      "epoch": 88.08,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002986078493582545,
+      "loss": 0.4781,
+      "step": 33030
+    },
+    {
+      "epoch": 88.10666666666667,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002986069946098538,
+      "loss": 0.4728,
+      "step": 33040
+    },
+    {
+      "epoch": 88.13333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002986061396003598,
+      "loss": 0.4753,
+      "step": 33050
+    },
+    {
+      "epoch": 88.16,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029860528432977404,
+      "loss": 0.4796,
+      "step": 33060
+    },
+    {
+      "epoch": 88.18666666666667,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029860442879809794,
+      "loss": 0.471,
+      "step": 33070
+    },
+    {
+      "epoch": 88.21333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029860357300533306,
+      "loss": 0.4663,
+      "step": 33080
+    },
+    {
+      "epoch": 88.24,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002986027169514809,
+      "loss": 0.4697,
+      "step": 33090
+    },
+    {
+      "epoch": 88.26666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029860186063654294,
+      "loss": 0.4741,
+      "step": 33100
+    },
+    {
+      "epoch": 88.29333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002986010040605207,
+      "loss": 0.468,
+      "step": 33110
+    },
+    {
+      "epoch": 88.32,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029860014722341574,
+      "loss": 0.4711,
+      "step": 33120
+    },
+    {
+      "epoch": 88.34666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002985992901252295,
+      "loss": 0.4801,
+      "step": 33130
+    },
+    {
+      "epoch": 88.37333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002985984327659635,
+      "loss": 0.4761,
+      "step": 33140
+    },
+    {
+      "epoch": 88.4,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.00029859757514561923,
+      "loss": 0.4725,
+      "step": 33150
+    },
+    {
+      "epoch": 88.42666666666666,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002985967172641982,
+      "loss": 0.4788,
+      "step": 33160
+    },
+    {
+      "epoch": 88.45333333333333,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002985958591217019,
+      "loss": 0.4807,
+      "step": 33170
+    },
+    {
+      "epoch": 88.48,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002985950007181319,
+      "loss": 0.4867,
+      "step": 33180
+    },
+    {
+      "epoch": 88.50666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029859414205348975,
+      "loss": 0.4763,
+      "step": 33190
+    },
+    {
+      "epoch": 88.53333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029859328312777675,
+      "loss": 0.4701,
+      "step": 33200
+    },
+    {
+      "epoch": 88.56,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029859242394099464,
+      "loss": 0.4712,
+      "step": 33210
+    },
+    {
+      "epoch": 88.58666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029859156449314484,
+      "loss": 0.4678,
+      "step": 33220
+    },
+    {
+      "epoch": 88.61333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029859070478422875,
+      "loss": 0.4534,
+      "step": 33230
+    },
+    {
+      "epoch": 88.64,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029858984481424805,
+      "loss": 0.4579,
+      "step": 33240
+    },
+    {
+      "epoch": 88.66666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002985889845832042,
+      "loss": 0.4737,
+      "step": 33250
+    },
+    {
+      "epoch": 88.69333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002985881240910987,
+      "loss": 0.4564,
+      "step": 33260
+    },
+    {
+      "epoch": 88.72,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029858726333793306,
+      "loss": 0.4737,
+      "step": 33270
+    },
+    {
+      "epoch": 88.74666666666667,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029858640232370874,
+      "loss": 0.4786,
+      "step": 33280
+    },
+    {
+      "epoch": 88.77333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029858554104842733,
+      "loss": 0.4854,
+      "step": 33290
+    },
+    {
+      "epoch": 88.8,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002985846795120903,
+      "loss": 0.4718,
+      "step": 33300
+    },
+    {
+      "epoch": 88.82666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002985838177146992,
+      "loss": 0.4665,
+      "step": 33310
+    },
+    {
+      "epoch": 88.85333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029858295565625555,
+      "loss": 0.4757,
+      "step": 33320
+    },
+    {
+      "epoch": 88.88,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002985820933367608,
+      "loss": 0.4638,
+      "step": 33330
+    },
+    {
+      "epoch": 88.90666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029858123075621656,
+      "loss": 0.4667,
+      "step": 33340
+    },
+    {
+      "epoch": 88.93333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029858036791462417,
+      "loss": 0.4678,
+      "step": 33350
+    },
+    {
+      "epoch": 88.96,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002985795048119854,
+      "loss": 0.4661,
+      "step": 33360
+    },
+    {
+      "epoch": 88.98666666666666,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002985786414483015,
+      "loss": 0.4806,
+      "step": 33370
+    },
+    {
+      "epoch": 89.0,
+      "eval_loss": 0.48046040534973145,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.5105,
+      "eval_samples_per_second": 1.39,
+      "eval_steps_per_second": 0.087,
+      "step": 33375
+    },
+    {
+      "epoch": 89.01333333333334,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029857777782357417,
+      "loss": 0.4689,
+      "step": 33380
+    },
+    {
+      "epoch": 89.04,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029857691393780493,
+      "loss": 0.4924,
+      "step": 33390
+    },
+    {
+      "epoch": 89.06666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002985760497909952,
+      "loss": 0.4813,
+      "step": 33400
+    },
+    {
+      "epoch": 89.09333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029857518538314653,
+      "loss": 0.4757,
+      "step": 33410
+    },
+    {
+      "epoch": 89.12,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029857432071426045,
+      "loss": 0.4715,
+      "step": 33420
+    },
+    {
+      "epoch": 89.14666666666666,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002985734557843385,
+      "loss": 0.4811,
+      "step": 33430
+    },
+    {
+      "epoch": 89.17333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029857259059338224,
+      "loss": 0.472,
+      "step": 33440
+    },
+    {
+      "epoch": 89.2,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029857172514139303,
+      "loss": 0.47,
+      "step": 33450
+    },
+    {
+      "epoch": 89.22666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002985708594283725,
+      "loss": 0.4672,
+      "step": 33460
+    },
+    {
+      "epoch": 89.25333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002985699934543222,
+      "loss": 0.4681,
+      "step": 33470
+    },
+    {
+      "epoch": 89.28,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029856912721924354,
+      "loss": 0.4815,
+      "step": 33480
+    },
+    {
+      "epoch": 89.30666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002985682607231382,
+      "loss": 0.4625,
+      "step": 33490
+    },
+    {
+      "epoch": 89.33333333333333,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002985673939660076,
+      "loss": 0.4775,
+      "step": 33500
+    },
+    {
+      "epoch": 89.36,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029856652694785325,
+      "loss": 0.4799,
+      "step": 33510
+    },
+    {
+      "epoch": 89.38666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002985656596686767,
+      "loss": 0.4709,
+      "step": 33520
+    },
+    {
+      "epoch": 89.41333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002985647921284795,
+      "loss": 0.4756,
+      "step": 33530
+    },
+    {
+      "epoch": 89.44,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002985639243272631,
+      "loss": 0.4786,
+      "step": 33540
+    },
+    {
+      "epoch": 89.46666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002985630562650291,
+      "loss": 0.4879,
+      "step": 33550
+    },
+    {
+      "epoch": 89.49333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029856218794177905,
+      "loss": 0.4795,
+      "step": 33560
+    },
+    {
+      "epoch": 89.52,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029856131935751437,
+      "loss": 0.4736,
+      "step": 33570
+    },
+    {
+      "epoch": 89.54666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002985604505122367,
+      "loss": 0.4702,
+      "step": 33580
+    },
+    {
+      "epoch": 89.57333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002985595814059474,
+      "loss": 0.4695,
+      "step": 33590
+    },
+    {
+      "epoch": 89.6,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029855871203864824,
+      "loss": 0.4631,
+      "step": 33600
+    },
+    {
+      "epoch": 89.62666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002985578424103405,
+      "loss": 0.4506,
+      "step": 33610
+    },
+    {
+      "epoch": 89.65333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002985569725210259,
+      "loss": 0.4697,
+      "step": 33620
+    },
+    {
+      "epoch": 89.68,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029855610237070583,
+      "loss": 0.4623,
+      "step": 33630
+    },
+    {
+      "epoch": 89.70666666666666,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002985552319593819,
+      "loss": 0.4619,
+      "step": 33640
+    },
+    {
+      "epoch": 89.73333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029855436128705564,
+      "loss": 0.4804,
+      "step": 33650
+    },
+    {
+      "epoch": 89.76,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029855349035372855,
+      "loss": 0.4822,
+      "step": 33660
+    },
+    {
+      "epoch": 89.78666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002985526191594022,
+      "loss": 0.4807,
+      "step": 33670
+    },
+    {
+      "epoch": 89.81333333333333,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.000298551747704078,
+      "loss": 0.4661,
+      "step": 33680
+    },
+    {
+      "epoch": 89.84,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002985508759877576,
+      "loss": 0.4711,
+      "step": 33690
+    },
+    {
+      "epoch": 89.86666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002985500040104425,
+      "loss": 0.4707,
+      "step": 33700
+    },
+    {
+      "epoch": 89.89333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002985491317721343,
+      "loss": 0.4621,
+      "step": 33710
+    },
+    {
+      "epoch": 89.92,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029854825927283436,
+      "loss": 0.4728,
+      "step": 33720
+    },
+    {
+      "epoch": 89.94666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002985473865125444,
+      "loss": 0.4649,
+      "step": 33730
+    },
+    {
+      "epoch": 89.97333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002985465134912658,
+      "loss": 0.476,
+      "step": 33740
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029854564020900023,
+      "loss": 0.465,
+      "step": 33750
+    },
+    {
+      "epoch": 90.0,
+      "eval_loss": 0.47980237007141113,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8873,
+      "eval_samples_per_second": 1.618,
+      "eval_steps_per_second": 0.101,
+      "step": 33750
+    },
+    {
+      "epoch": 90.02666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002985447666657491,
+      "loss": 0.485,
+      "step": 33760
+    },
+    {
+      "epoch": 90.05333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029854389286151407,
+      "loss": 0.4917,
+      "step": 33770
+    },
+    {
+      "epoch": 90.08,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002985430187962965,
+      "loss": 0.4785,
+      "step": 33780
+    },
+    {
+      "epoch": 90.10666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029854214447009816,
+      "loss": 0.4731,
+      "step": 33790
+    },
+    {
+      "epoch": 90.13333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002985412698829204,
+      "loss": 0.474,
+      "step": 33800
+    },
+    {
+      "epoch": 90.16,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029854039503476485,
+      "loss": 0.4788,
+      "step": 33810
+    },
+    {
+      "epoch": 90.18666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.000298539519925633,
+      "loss": 0.4712,
+      "step": 33820
+    },
+    {
+      "epoch": 90.21333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029853864455552635,
+      "loss": 0.4662,
+      "step": 33830
+    },
+    {
+      "epoch": 90.24,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029853776892444655,
+      "loss": 0.4693,
+      "step": 33840
+    },
+    {
+      "epoch": 90.26666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029853689303239507,
+      "loss": 0.474,
+      "step": 33850
+    },
+    {
+      "epoch": 90.29333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002985360168793734,
+      "loss": 0.4683,
+      "step": 33860
+    },
+    {
+      "epoch": 90.32,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029853514046538325,
+      "loss": 0.472,
+      "step": 33870
+    },
+    {
+      "epoch": 90.34666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029853426379042596,
+      "loss": 0.4796,
+      "step": 33880
+    },
+    {
+      "epoch": 90.37333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002985333868545032,
+      "loss": 0.476,
+      "step": 33890
+    },
+    {
+      "epoch": 90.4,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002985325096576164,
+      "loss": 0.4725,
+      "step": 33900
+    },
+    {
+      "epoch": 90.42666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002985316321997672,
+      "loss": 0.4783,
+      "step": 33910
+    },
+    {
+      "epoch": 90.45333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002985307544809571,
+      "loss": 0.4805,
+      "step": 33920
+    },
+    {
+      "epoch": 90.48,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002985298765011877,
+      "loss": 0.4873,
+      "step": 33930
+    },
+    {
+      "epoch": 90.50666666666666,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029852899826046046,
+      "loss": 0.4763,
+      "step": 33940
+    },
+    {
+      "epoch": 90.53333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000298528119758777,
+      "loss": 0.4701,
+      "step": 33950
+    },
+    {
+      "epoch": 90.56,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029852724099613877,
+      "loss": 0.4713,
+      "step": 33960
+    },
+    {
+      "epoch": 90.58666666666667,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0002985263619725474,
+      "loss": 0.4675,
+      "step": 33970
+    },
+    {
+      "epoch": 90.61333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002985254826880044,
+      "loss": 0.4533,
+      "step": 33980
+    },
+    {
+      "epoch": 90.64,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002985246031425113,
+      "loss": 0.4584,
+      "step": 33990
+    },
+    {
+      "epoch": 90.66666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029852372333606967,
+      "loss": 0.4738,
+      "step": 34000
+    },
+    {
+      "epoch": 90.69333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.000298522843268681,
+      "loss": 0.4567,
+      "step": 34010
+    },
+    {
+      "epoch": 90.72,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002985219629403469,
+      "loss": 0.4737,
+      "step": 34020
+    },
+    {
+      "epoch": 90.74666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029852108235106894,
+      "loss": 0.4777,
+      "step": 34030
+    },
+    {
+      "epoch": 90.77333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002985202015008486,
+      "loss": 0.4856,
+      "step": 34040
+    },
+    {
+      "epoch": 90.8,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029851932038968746,
+      "loss": 0.4718,
+      "step": 34050
+    },
+    {
+      "epoch": 90.82666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002985184390175871,
+      "loss": 0.4672,
+      "step": 34060
+    },
+    {
+      "epoch": 90.85333333333334,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.000298517557384549,
+      "loss": 0.4761,
+      "step": 34070
+    },
+    {
+      "epoch": 90.88,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029851667549057463,
+      "loss": 0.4638,
+      "step": 34080
+    },
+    {
+      "epoch": 90.90666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002985157933356658,
+      "loss": 0.4665,
+      "step": 34090
+    },
+    {
+      "epoch": 90.93333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029851491091982383,
+      "loss": 0.468,
+      "step": 34100
+    },
+    {
+      "epoch": 90.96,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002985140282430504,
+      "loss": 0.4657,
+      "step": 34110
+    },
+    {
+      "epoch": 90.98666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029851314530534694,
+      "loss": 0.48,
+      "step": 34120
+    },
+    {
+      "epoch": 91.0,
+      "eval_loss": 0.4798223376274109,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5524,
+      "eval_samples_per_second": 1.516,
+      "eval_steps_per_second": 0.095,
+      "step": 34125
+    },
+    {
+      "epoch": 91.01333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002985122621067151,
+      "loss": 0.4693,
+      "step": 34130
+    },
+    {
+      "epoch": 91.04,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029851137864715644,
+      "loss": 0.4938,
+      "step": 34140
+    },
+    {
+      "epoch": 91.06666666666666,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029851049492667244,
+      "loss": 0.4816,
+      "step": 34150
+    },
+    {
+      "epoch": 91.09333333333333,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0002985096109452647,
+      "loss": 0.4753,
+      "step": 34160
+    },
+    {
+      "epoch": 91.12,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029850872670293476,
+      "loss": 0.4715,
+      "step": 34170
+    },
+    {
+      "epoch": 91.14666666666666,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002985078421996841,
+      "loss": 0.4809,
+      "step": 34180
+    },
+    {
+      "epoch": 91.17333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002985069574355145,
+      "loss": 0.4713,
+      "step": 34190
+    },
+    {
+      "epoch": 91.2,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029850607241042724,
+      "loss": 0.4694,
+      "step": 34200
+    },
+    {
+      "epoch": 91.22666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029850518712442404,
+      "loss": 0.4676,
+      "step": 34210
+    },
+    {
+      "epoch": 91.25333333333333,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0002985043015775064,
+      "loss": 0.4666,
+      "step": 34220
+    },
+    {
+      "epoch": 91.28,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002985034157696759,
+      "loss": 0.4807,
+      "step": 34230
+    },
+    {
+      "epoch": 91.30666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029850252970093407,
+      "loss": 0.4618,
+      "step": 34240
+    },
+    {
+      "epoch": 91.33333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029850164337128247,
+      "loss": 0.4774,
+      "step": 34250
+    },
+    {
+      "epoch": 91.36,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002985007567807227,
+      "loss": 0.4795,
+      "step": 34260
+    },
+    {
+      "epoch": 91.38666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002984998699292563,
+      "loss": 0.4714,
+      "step": 34270
+    },
+    {
+      "epoch": 91.41333333333333,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0002984989828168848,
+      "loss": 0.4753,
+      "step": 34280
+    },
+    {
+      "epoch": 91.44,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029849809544360974,
+      "loss": 0.4789,
+      "step": 34290
+    },
+    {
+      "epoch": 91.46666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029849720780943275,
+      "loss": 0.4875,
+      "step": 34300
+    },
+    {
+      "epoch": 91.49333333333334,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002984963199143553,
+      "loss": 0.4795,
+      "step": 34310
+    },
+    {
+      "epoch": 91.52,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00029849543175837907,
+      "loss": 0.4733,
+      "step": 34320
+    },
+    {
+      "epoch": 91.54666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029849454334150553,
+      "loss": 0.4702,
+      "step": 34330
+    },
+    {
+      "epoch": 91.57333333333334,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.00029849365466373624,
+      "loss": 0.4698,
+      "step": 34340
+    },
+    {
+      "epoch": 91.6,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029849276572507284,
+      "loss": 0.4632,
+      "step": 34350
+    },
+    {
+      "epoch": 91.62666666666667,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002984918765255168,
+      "loss": 0.4502,
+      "step": 34360
+    },
+    {
+      "epoch": 91.65333333333334,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002984909870650697,
+      "loss": 0.4697,
+      "step": 34370
+    },
+    {
+      "epoch": 91.68,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029849009734373317,
+      "loss": 0.4613,
+      "step": 34380
+    },
+    {
+      "epoch": 91.70666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029848920736150864,
+      "loss": 0.462,
+      "step": 34390
+    },
+    {
+      "epoch": 91.73333333333333,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002984883171183978,
+      "loss": 0.4806,
+      "step": 34400
+    },
+    {
+      "epoch": 91.76,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0002984874266144022,
+      "loss": 0.4822,
+      "step": 34410
+    },
+    {
+      "epoch": 91.78666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002984865358495233,
+      "loss": 0.4804,
+      "step": 34420
+    },
+    {
+      "epoch": 91.81333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002984856448237629,
+      "loss": 0.4656,
+      "step": 34430
+    },
+    {
+      "epoch": 91.84,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029848475353712225,
+      "loss": 0.4722,
+      "step": 34440
+    },
+    {
+      "epoch": 91.86666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002984838619896031,
+      "loss": 0.4713,
+      "step": 34450
+    },
+    {
+      "epoch": 91.89333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029848297018120705,
+      "loss": 0.4615,
+      "step": 34460
+    },
+    {
+      "epoch": 91.92,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029848207811193557,
+      "loss": 0.4724,
+      "step": 34470
+    },
+    {
+      "epoch": 91.94666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002984811857817903,
+      "loss": 0.4649,
+      "step": 34480
+    },
+    {
+      "epoch": 91.97333333333333,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0002984802931907727,
+      "loss": 0.4762,
+      "step": 34490
+    },
+    {
+      "epoch": 92.0,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002984794003388845,
+      "loss": 0.4657,
+      "step": 34500
+    },
+    {
+      "epoch": 92.0,
+      "eval_loss": 0.4802961051464081,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1935,
+      "eval_samples_per_second": 1.57,
+      "eval_steps_per_second": 0.098,
+      "step": 34500
+    },
+    {
+      "epoch": 92.02666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002984785072261271,
+      "loss": 0.4849,
+      "step": 34510
+    },
+    {
+      "epoch": 92.05333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029847761385250215,
+      "loss": 0.4904,
+      "step": 34520
+    },
+    {
+      "epoch": 92.08,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029847672021801125,
+      "loss": 0.478,
+      "step": 34530
+    },
+    {
+      "epoch": 92.10666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00029847582632265595,
+      "loss": 0.4734,
+      "step": 34540
+    },
+    {
+      "epoch": 92.13333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002984749321664378,
+      "loss": 0.4742,
+      "step": 34550
+    },
+    {
+      "epoch": 92.16,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029847403774935834,
+      "loss": 0.4787,
+      "step": 34560
+    },
+    {
+      "epoch": 92.18666666666667,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002984731430714192,
+      "loss": 0.4709,
+      "step": 34570
+    },
+    {
+      "epoch": 92.21333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029847224813262196,
+      "loss": 0.4652,
+      "step": 34580
+    },
+    {
+      "epoch": 92.24,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029847135293296814,
+      "loss": 0.4688,
+      "step": 34590
+    },
+    {
+      "epoch": 92.26666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029847045747245934,
+      "loss": 0.4734,
+      "step": 34600
+    },
+    {
+      "epoch": 92.29333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029846956175109713,
+      "loss": 0.4675,
+      "step": 34610
+    },
+    {
+      "epoch": 92.32,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002984686657688831,
+      "loss": 0.4719,
+      "step": 34620
+    },
+    {
+      "epoch": 92.34666666666666,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002984677695258188,
+      "loss": 0.4791,
+      "step": 34630
+    },
+    {
+      "epoch": 92.37333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002984668730219058,
+      "loss": 0.4754,
+      "step": 34640
+    },
+    {
+      "epoch": 92.4,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002984659762571457,
+      "loss": 0.4725,
+      "step": 34650
+    },
+    {
+      "epoch": 92.42666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029846507923154004,
+      "loss": 0.4782,
+      "step": 34660
+    },
+    {
+      "epoch": 92.45333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002984641819450904,
+      "loss": 0.4804,
+      "step": 34670
+    },
+    {
+      "epoch": 92.48,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029846328439779845,
+      "loss": 0.487,
+      "step": 34680
+    },
+    {
+      "epoch": 92.50666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029846238658966565,
+      "loss": 0.4764,
+      "step": 34690
+    },
+    {
+      "epoch": 92.53333333333333,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029846148852069365,
+      "loss": 0.4704,
+      "step": 34700
+    },
+    {
+      "epoch": 92.56,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029846059019088397,
+      "loss": 0.4711,
+      "step": 34710
+    },
+    {
+      "epoch": 92.58666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029845969160023823,
+      "loss": 0.4668,
+      "step": 34720
+    },
+    {
+      "epoch": 92.61333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029845879274875795,
+      "loss": 0.4525,
+      "step": 34730
+    },
+    {
+      "epoch": 92.64,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002984578936364448,
+      "loss": 0.4581,
+      "step": 34740
+    },
+    {
+      "epoch": 92.66666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029845699426330033,
+      "loss": 0.4735,
+      "step": 34750
+    },
+    {
+      "epoch": 92.69333333333333,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002984560946293261,
+      "loss": 0.4571,
+      "step": 34760
+    },
+    {
+      "epoch": 92.72,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029845519473452363,
+      "loss": 0.4734,
+      "step": 34770
+    },
+    {
+      "epoch": 92.74666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029845429457889467,
+      "loss": 0.478,
+      "step": 34780
+    },
+    {
+      "epoch": 92.77333333333333,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002984533941624406,
+      "loss": 0.4851,
+      "step": 34790
+    },
+    {
+      "epoch": 92.8,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029845249348516316,
+      "loss": 0.4726,
+      "step": 34800
+    },
+    {
+      "epoch": 92.82666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029845159254706386,
+      "loss": 0.4669,
+      "step": 34810
+    },
+    {
+      "epoch": 92.85333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002984506913481443,
+      "loss": 0.4756,
+      "step": 34820
+    },
+    {
+      "epoch": 92.88,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029844978988840604,
+      "loss": 0.4639,
+      "step": 34830
+    },
+    {
+      "epoch": 92.90666666666667,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029844888816785067,
+      "loss": 0.4665,
+      "step": 34840
+    },
+    {
+      "epoch": 92.93333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002984479861864798,
+      "loss": 0.4681,
+      "step": 34850
+    },
+    {
+      "epoch": 92.96,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.000298447083944295,
+      "loss": 0.4662,
+      "step": 34860
+    },
+    {
+      "epoch": 92.98666666666666,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002984461814412978,
+      "loss": 0.4795,
+      "step": 34870
+    },
+    {
+      "epoch": 93.0,
+      "eval_loss": 0.48105791211128235,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8036,
+      "eval_samples_per_second": 1.632,
+      "eval_steps_per_second": 0.102,
+      "step": 34875
+    },
+    {
+      "epoch": 93.01333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029844527867749,
+      "loss": 0.4688,
+      "step": 34880
+    },
+    {
+      "epoch": 93.04,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029844437565287293,
+      "loss": 0.4933,
+      "step": 34890
+    },
+    {
+      "epoch": 93.06666666666666,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.00029844347236744823,
+      "loss": 0.4807,
+      "step": 34900
+    },
+    {
+      "epoch": 93.09333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002984425688212176,
+      "loss": 0.4752,
+      "step": 34910
+    },
+    {
+      "epoch": 93.12,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029844166501418254,
+      "loss": 0.4723,
+      "step": 34920
+    },
+    {
+      "epoch": 93.14666666666666,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029844076094634464,
+      "loss": 0.4807,
+      "step": 34930
+    },
+    {
+      "epoch": 93.17333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002984398566177055,
+      "loss": 0.4715,
+      "step": 34940
+    },
+    {
+      "epoch": 93.2,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029843895202826676,
+      "loss": 0.4699,
+      "step": 34950
+    },
+    {
+      "epoch": 93.22666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029843804717803,
+      "loss": 0.4674,
+      "step": 34960
+    },
+    {
+      "epoch": 93.25333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029843714206699665,
+      "loss": 0.4666,
+      "step": 34970
+    },
+    {
+      "epoch": 93.28,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029843623669516847,
+      "loss": 0.4804,
+      "step": 34980
+    },
+    {
+      "epoch": 93.30666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.000298435331062547,
+      "loss": 0.462,
+      "step": 34990
+    },
+    {
+      "epoch": 93.33333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002984344251691339,
+      "loss": 0.4775,
+      "step": 35000
+    },
+    {
+      "epoch": 93.36,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002984335190149306,
+      "loss": 0.4793,
+      "step": 35010
+    },
+    {
+      "epoch": 93.38666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002984326125999389,
+      "loss": 0.4713,
+      "step": 35020
+    },
+    {
+      "epoch": 93.41333333333333,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029843170592416015,
+      "loss": 0.4749,
+      "step": 35030
+    },
+    {
+      "epoch": 93.44,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00029843079898759616,
+      "loss": 0.478,
+      "step": 35040
+    },
+    {
+      "epoch": 93.46666666666667,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029842989179024843,
+      "loss": 0.4874,
+      "step": 35050
+    },
+    {
+      "epoch": 93.49333333333334,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002984289843321185,
+      "loss": 0.4796,
+      "step": 35060
+    },
+    {
+      "epoch": 93.52,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002984280766132081,
+      "loss": 0.4733,
+      "step": 35070
+    },
+    {
+      "epoch": 93.54666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029842716863351875,
+      "loss": 0.4705,
+      "step": 35080
+    },
+    {
+      "epoch": 93.57333333333334,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029842626039305203,
+      "loss": 0.4698,
+      "step": 35090
+    },
+    {
+      "epoch": 93.6,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002984253518918096,
+      "loss": 0.4627,
+      "step": 35100
+    },
+    {
+      "epoch": 93.62666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029842444312979294,
+      "loss": 0.4507,
+      "step": 35110
+    },
+    {
+      "epoch": 93.65333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002984235341070037,
+      "loss": 0.4702,
+      "step": 35120
+    },
+    {
+      "epoch": 93.68,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029842262482344354,
+      "loss": 0.461,
+      "step": 35130
+    },
+    {
+      "epoch": 93.70666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029842171527911404,
+      "loss": 0.4621,
+      "step": 35140
+    },
+    {
+      "epoch": 93.73333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002984208054740167,
+      "loss": 0.4813,
+      "step": 35150
+    },
+    {
+      "epoch": 93.76,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002984198954081532,
+      "loss": 0.4824,
+      "step": 35160
+    },
+    {
+      "epoch": 93.78666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029841898508152514,
+      "loss": 0.4804,
+      "step": 35170
+    },
+    {
+      "epoch": 93.81333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002984180744941341,
+      "loss": 0.4657,
+      "step": 35180
+    },
+    {
+      "epoch": 93.84,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002984171636459817,
+      "loss": 0.4718,
+      "step": 35190
+    },
+    {
+      "epoch": 93.86666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029841625253706946,
+      "loss": 0.4705,
+      "step": 35200
+    },
+    {
+      "epoch": 93.89333333333333,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029841534116739914,
+      "loss": 0.4623,
+      "step": 35210
+    },
+    {
+      "epoch": 93.92,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029841442953697217,
+      "loss": 0.4716,
+      "step": 35220
+    },
+    {
+      "epoch": 93.94666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029841351764579025,
+      "loss": 0.4652,
+      "step": 35230
+    },
+    {
+      "epoch": 93.97333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029841260549385494,
+      "loss": 0.4763,
+      "step": 35240
+    },
+    {
+      "epoch": 94.0,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002984116930811679,
+      "loss": 0.4654,
+      "step": 35250
+    },
+    {
+      "epoch": 94.0,
+      "eval_loss": 0.48085930943489075,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2211,
+      "eval_samples_per_second": 1.565,
+      "eval_steps_per_second": 0.098,
+      "step": 35250
+    },
+    {
+      "epoch": 94.02666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002984107804077307,
+      "loss": 0.4845,
+      "step": 35260
+    },
+    {
+      "epoch": 94.05333333333333,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029840986747354495,
+      "loss": 0.4904,
+      "step": 35270
+    },
+    {
+      "epoch": 94.08,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002984089542786122,
+      "loss": 0.4772,
+      "step": 35280
+    },
+    {
+      "epoch": 94.10666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002984080408229341,
+      "loss": 0.4732,
+      "step": 35290
+    },
+    {
+      "epoch": 94.13333333333334,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029840712710651226,
+      "loss": 0.4748,
+      "step": 35300
+    },
+    {
+      "epoch": 94.16,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002984062131293483,
+      "loss": 0.4782,
+      "step": 35310
+    },
+    {
+      "epoch": 94.18666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002984052988914438,
+      "loss": 0.4708,
+      "step": 35320
+    },
+    {
+      "epoch": 94.21333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029840438439280036,
+      "loss": 0.4656,
+      "step": 35330
+    },
+    {
+      "epoch": 94.24,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002984034696334196,
+      "loss": 0.4698,
+      "step": 35340
+    },
+    {
+      "epoch": 94.26666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002984025546133031,
+      "loss": 0.4733,
+      "step": 35350
+    },
+    {
+      "epoch": 94.29333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029840163933245254,
+      "loss": 0.4673,
+      "step": 35360
+    },
+    {
+      "epoch": 94.32,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029840072379086946,
+      "loss": 0.4717,
+      "step": 35370
+    },
+    {
+      "epoch": 94.34666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002983998079885555,
+      "loss": 0.4795,
+      "step": 35380
+    },
+    {
+      "epoch": 94.37333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002983988919255122,
+      "loss": 0.4755,
+      "step": 35390
+    },
+    {
+      "epoch": 94.4,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029839797560174126,
+      "loss": 0.4723,
+      "step": 35400
+    },
+    {
+      "epoch": 94.42666666666666,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.00029839705901724427,
+      "loss": 0.4785,
+      "step": 35410
+    },
+    {
+      "epoch": 94.45333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029839614217202283,
+      "loss": 0.4802,
+      "step": 35420
+    },
+    {
+      "epoch": 94.48,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002983952250660785,
+      "loss": 0.4866,
+      "step": 35430
+    },
+    {
+      "epoch": 94.50666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000298394307699413,
+      "loss": 0.4765,
+      "step": 35440
+    },
+    {
+      "epoch": 94.53333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029839339007202783,
+      "loss": 0.471,
+      "step": 35450
+    },
+    {
+      "epoch": 94.56,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029839247218392464,
+      "loss": 0.4709,
+      "step": 35460
+    },
+    {
+      "epoch": 94.58666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029839155403510513,
+      "loss": 0.4668,
+      "step": 35470
+    },
+    {
+      "epoch": 94.61333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002983906356255708,
+      "loss": 0.4531,
+      "step": 35480
+    },
+    {
+      "epoch": 94.64,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002983897169553233,
+      "loss": 0.4575,
+      "step": 35490
+    },
+    {
+      "epoch": 94.66666666666667,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029838879802436427,
+      "loss": 0.473,
+      "step": 35500
+    },
+    {
+      "epoch": 94.69333333333333,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029838787883269526,
+      "loss": 0.4567,
+      "step": 35510
+    },
+    {
+      "epoch": 94.72,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029838695938031796,
+      "loss": 0.4732,
+      "step": 35520
+    },
+    {
+      "epoch": 94.74666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029838603966723395,
+      "loss": 0.4779,
+      "step": 35530
+    },
+    {
+      "epoch": 94.77333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029838511969344485,
+      "loss": 0.4856,
+      "step": 35540
+    },
+    {
+      "epoch": 94.8,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002983841994589523,
+      "loss": 0.4726,
+      "step": 35550
+    },
+    {
+      "epoch": 94.82666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002983832789637578,
+      "loss": 0.4663,
+      "step": 35560
+    },
+    {
+      "epoch": 94.85333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002983823582078631,
+      "loss": 0.4761,
+      "step": 35570
+    },
+    {
+      "epoch": 94.88,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002983814371912698,
+      "loss": 0.4641,
+      "step": 35580
+    },
+    {
+      "epoch": 94.90666666666667,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002983805159139795,
+      "loss": 0.4665,
+      "step": 35590
+    },
+    {
+      "epoch": 94.93333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002983795943759938,
+      "loss": 0.468,
+      "step": 35600
+    },
+    {
+      "epoch": 94.96,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002983786725773143,
+      "loss": 0.4663,
+      "step": 35610
+    },
+    {
+      "epoch": 94.98666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029837775051794267,
+      "loss": 0.4799,
+      "step": 35620
+    },
+    {
+      "epoch": 95.0,
+      "eval_loss": 0.4803951382637024,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5518,
+      "eval_samples_per_second": 1.516,
+      "eval_steps_per_second": 0.095,
+      "step": 35625
+    },
+    {
+      "epoch": 95.01333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002983768281978805,
+      "loss": 0.4695,
+      "step": 35630
+    },
+    {
+      "epoch": 95.04,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029837590561712944,
+      "loss": 0.4935,
+      "step": 35640
+    },
+    {
+      "epoch": 95.06666666666666,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002983749827756911,
+      "loss": 0.4807,
+      "step": 35650
+    },
+    {
+      "epoch": 95.09333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029837405967356705,
+      "loss": 0.4756,
+      "step": 35660
+    },
+    {
+      "epoch": 95.12,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000298373136310759,
+      "loss": 0.4726,
+      "step": 35670
+    },
+    {
+      "epoch": 95.14666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002983722126872685,
+      "loss": 0.4815,
+      "step": 35680
+    },
+    {
+      "epoch": 95.17333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029837128880309723,
+      "loss": 0.471,
+      "step": 35690
+    },
+    {
+      "epoch": 95.2,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029837036465824675,
+      "loss": 0.4692,
+      "step": 35700
+    },
+    {
+      "epoch": 95.22666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029836944025271876,
+      "loss": 0.4664,
+      "step": 35710
+    },
+    {
+      "epoch": 95.25333333333333,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002983685155865148,
+      "loss": 0.4671,
+      "step": 35720
+    },
+    {
+      "epoch": 95.28,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029836759065963656,
+      "loss": 0.4803,
+      "step": 35730
+    },
+    {
+      "epoch": 95.30666666666667,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.00029836666547208564,
+      "loss": 0.4619,
+      "step": 35740
+    },
+    {
+      "epoch": 95.33333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002983657400238636,
+      "loss": 0.4775,
+      "step": 35750
+    },
+    {
+      "epoch": 95.36,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029836481431497224,
+      "loss": 0.4784,
+      "step": 35760
+    },
+    {
+      "epoch": 95.38666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.000298363888345413,
+      "loss": 0.4708,
+      "step": 35770
+    },
+    {
+      "epoch": 95.41333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029836296211518764,
+      "loss": 0.4755,
+      "step": 35780
+    },
+    {
+      "epoch": 95.44,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00029836203562429777,
+      "loss": 0.4788,
+      "step": 35790
+    },
+    {
+      "epoch": 95.46666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002983611088727449,
+      "loss": 0.4874,
+      "step": 35800
+    },
+    {
+      "epoch": 95.49333333333334,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002983601818605307,
+      "loss": 0.4789,
+      "step": 35810
+    },
+    {
+      "epoch": 95.52,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029835925458765694,
+      "loss": 0.473,
+      "step": 35820
+    },
+    {
+      "epoch": 95.54666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029835832705412513,
+      "loss": 0.4703,
+      "step": 35830
+    },
+    {
+      "epoch": 95.57333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029835739925993687,
+      "loss": 0.4697,
+      "step": 35840
+    },
+    {
+      "epoch": 95.6,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002983564712050939,
+      "loss": 0.4631,
+      "step": 35850
+    },
+    {
+      "epoch": 95.62666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002983555428895977,
+      "loss": 0.4496,
+      "step": 35860
+    },
+    {
+      "epoch": 95.65333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029835461431345,
+      "loss": 0.4692,
+      "step": 35870
+    },
+    {
+      "epoch": 95.68,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002983536854766525,
+      "loss": 0.4613,
+      "step": 35880
+    },
+    {
+      "epoch": 95.70666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002983527563792067,
+      "loss": 0.4619,
+      "step": 35890
+    },
+    {
+      "epoch": 95.73333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002983518270211143,
+      "loss": 0.4808,
+      "step": 35900
+    },
+    {
+      "epoch": 95.76,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002983508974023769,
+      "loss": 0.4816,
+      "step": 35910
+    },
+    {
+      "epoch": 95.78666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002983499675229961,
+      "loss": 0.48,
+      "step": 35920
+    },
+    {
+      "epoch": 95.81333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029834903738297367,
+      "loss": 0.4658,
+      "step": 35930
+    },
+    {
+      "epoch": 95.84,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002983481069823112,
+      "loss": 0.4714,
+      "step": 35940
+    },
+    {
+      "epoch": 95.86666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029834717632101017,
+      "loss": 0.4711,
+      "step": 35950
+    },
+    {
+      "epoch": 95.89333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002983462453990724,
+      "loss": 0.4621,
+      "step": 35960
+    },
+    {
+      "epoch": 95.92,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002983453142164994,
+      "loss": 0.4719,
+      "step": 35970
+    },
+    {
+      "epoch": 95.94666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029834438277329285,
+      "loss": 0.4654,
+      "step": 35980
+    },
+    {
+      "epoch": 95.97333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002983434510694544,
+      "loss": 0.4763,
+      "step": 35990
+    },
+    {
+      "epoch": 96.0,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002983425191049858,
+      "loss": 0.4651,
+      "step": 36000
+    },
+    {
+      "epoch": 96.0,
+      "eval_loss": 0.48041442036628723,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5207,
+      "eval_samples_per_second": 1.521,
+      "eval_steps_per_second": 0.095,
+      "step": 36000
+    },
+    {
+      "epoch": 96.02666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029834158687988847,
+      "loss": 0.4847,
+      "step": 36010
+    },
+    {
+      "epoch": 96.05333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002983406543941641,
+      "loss": 0.4905,
+      "step": 36020
+    },
+    {
+      "epoch": 96.08,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002983397216478145,
+      "loss": 0.4774,
+      "step": 36030
+    },
+    {
+      "epoch": 96.10666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002983387886408411,
+      "loss": 0.4722,
+      "step": 36040
+    },
+    {
+      "epoch": 96.13333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002983378553732456,
+      "loss": 0.4741,
+      "step": 36050
+    },
+    {
+      "epoch": 96.16,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002983369218450297,
+      "loss": 0.4787,
+      "step": 36060
+    },
+    {
+      "epoch": 96.18666666666667,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029833598805619507,
+      "loss": 0.4715,
+      "step": 36070
+    },
+    {
+      "epoch": 96.21333333333334,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002983350540067432,
+      "loss": 0.4657,
+      "step": 36080
+    },
+    {
+      "epoch": 96.24,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029833411969667584,
+      "loss": 0.4699,
+      "step": 36090
+    },
+    {
+      "epoch": 96.26666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029833318512599463,
+      "loss": 0.4739,
+      "step": 36100
+    },
+    {
+      "epoch": 96.29333333333334,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029833225029470115,
+      "loss": 0.4675,
+      "step": 36110
+    },
+    {
+      "epoch": 96.32,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002983313152027971,
+      "loss": 0.4717,
+      "step": 36120
+    },
+    {
+      "epoch": 96.34666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002983303798502841,
+      "loss": 0.4791,
+      "step": 36130
+    },
+    {
+      "epoch": 96.37333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029832944423716383,
+      "loss": 0.4754,
+      "step": 36140
+    },
+    {
+      "epoch": 96.4,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002983285083634379,
+      "loss": 0.4722,
+      "step": 36150
+    },
+    {
+      "epoch": 96.42666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002983275722291079,
+      "loss": 0.4782,
+      "step": 36160
+    },
+    {
+      "epoch": 96.45333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002983266358341756,
+      "loss": 0.4802,
+      "step": 36170
+    },
+    {
+      "epoch": 96.48,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002983256991786425,
+      "loss": 0.4868,
+      "step": 36180
+    },
+    {
+      "epoch": 96.50666666666666,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029832476226251037,
+      "loss": 0.4763,
+      "step": 36190
+    },
+    {
+      "epoch": 96.53333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002983238250857808,
+      "loss": 0.4693,
+      "step": 36200
+    },
+    {
+      "epoch": 96.56,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029832288764845545,
+      "loss": 0.471,
+      "step": 36210
+    },
+    {
+      "epoch": 96.58666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.000298321949950536,
+      "loss": 0.467,
+      "step": 36220
+    },
+    {
+      "epoch": 96.61333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.000298321011992024,
+      "loss": 0.4521,
+      "step": 36230
+    },
+    {
+      "epoch": 96.64,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002983200737729212,
+      "loss": 0.4576,
+      "step": 36240
+    },
+    {
+      "epoch": 96.66666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029831913529322916,
+      "loss": 0.4734,
+      "step": 36250
+    },
+    {
+      "epoch": 96.69333333333333,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002983181965529496,
+      "loss": 0.4563,
+      "step": 36260
+    },
+    {
+      "epoch": 96.72,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002983172575520841,
+      "loss": 0.4737,
+      "step": 36270
+    },
+    {
+      "epoch": 96.74666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002983163182906344,
+      "loss": 0.4773,
+      "step": 36280
+    },
+    {
+      "epoch": 96.77333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002983153787686021,
+      "loss": 0.4845,
+      "step": 36290
+    },
+    {
+      "epoch": 96.8,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002983144389859888,
+      "loss": 0.4719,
+      "step": 36300
+    },
+    {
+      "epoch": 96.82666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002983134989427963,
+      "loss": 0.4663,
+      "step": 36310
+    },
+    {
+      "epoch": 96.85333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029831255863902606,
+      "loss": 0.4755,
+      "step": 36320
+    },
+    {
+      "epoch": 96.88,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002983116180746799,
+      "loss": 0.4638,
+      "step": 36330
+    },
+    {
+      "epoch": 96.90666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002983106772497594,
+      "loss": 0.4668,
+      "step": 36340
+    },
+    {
+      "epoch": 96.93333333333334,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002983097361642662,
+      "loss": 0.4682,
+      "step": 36350
+    },
+    {
+      "epoch": 96.96,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029830879481820193,
+      "loss": 0.4659,
+      "step": 36360
+    },
+    {
+      "epoch": 96.98666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029830785321156833,
+      "loss": 0.4792,
+      "step": 36370
+    },
+    {
+      "epoch": 97.0,
+      "eval_loss": 0.4807257354259491,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5628,
+      "eval_samples_per_second": 1.515,
+      "eval_steps_per_second": 0.095,
+      "step": 36375
+    },
+    {
+      "epoch": 97.01333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029830691134436697,
+      "loss": 0.469,
+      "step": 36380
+    },
+    {
+      "epoch": 97.04,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029830596921659957,
+      "loss": 0.4937,
+      "step": 36390
+    },
+    {
+      "epoch": 97.06666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029830502682826775,
+      "loss": 0.4805,
+      "step": 36400
+    },
+    {
+      "epoch": 97.09333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002983040841793732,
+      "loss": 0.4752,
+      "step": 36410
+    },
+    {
+      "epoch": 97.12,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029830314126991746,
+      "loss": 0.4715,
+      "step": 36420
+    },
+    {
+      "epoch": 97.14666666666666,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002983021980999024,
+      "loss": 0.4797,
+      "step": 36430
+    },
+    {
+      "epoch": 97.17333333333333,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029830125466932946,
+      "loss": 0.4714,
+      "step": 36440
+    },
+    {
+      "epoch": 97.2,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002983003109782004,
+      "loss": 0.4694,
+      "step": 36450
+    },
+    {
+      "epoch": 97.22666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029829936702651684,
+      "loss": 0.4668,
+      "step": 36460
+    },
+    {
+      "epoch": 97.25333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029829842281428056,
+      "loss": 0.4673,
+      "step": 36470
+    },
+    {
+      "epoch": 97.28,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029829747834149304,
+      "loss": 0.4804,
+      "step": 36480
+    },
+    {
+      "epoch": 97.30666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029829653360815607,
+      "loss": 0.4615,
+      "step": 36490
+    },
+    {
+      "epoch": 97.33333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029829558861427127,
+      "loss": 0.4769,
+      "step": 36500
+    },
+    {
+      "epoch": 97.36,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002982946433598403,
+      "loss": 0.4796,
+      "step": 36510
+    },
+    {
+      "epoch": 97.38666666666667,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002982936978448647,
+      "loss": 0.4708,
+      "step": 36520
+    },
+    {
+      "epoch": 97.41333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002982927520693464,
+      "loss": 0.4753,
+      "step": 36530
+    },
+    {
+      "epoch": 97.44,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029829180603328684,
+      "loss": 0.4777,
+      "step": 36540
+    },
+    {
+      "epoch": 97.46666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029829085973668776,
+      "loss": 0.4882,
+      "step": 36550
+    },
+    {
+      "epoch": 97.49333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002982899131795508,
+      "loss": 0.4797,
+      "step": 36560
+    },
+    {
+      "epoch": 97.52,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002982889663618776,
+      "loss": 0.4733,
+      "step": 36570
+    },
+    {
+      "epoch": 97.54666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002982880192836699,
+      "loss": 0.4705,
+      "step": 36580
+    },
+    {
+      "epoch": 97.57333333333334,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029828707194492934,
+      "loss": 0.4696,
+      "step": 36590
+    },
+    {
+      "epoch": 97.6,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029828612434565756,
+      "loss": 0.4635,
+      "step": 36600
+    },
+    {
+      "epoch": 97.62666666666667,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.00029828517648585617,
+      "loss": 0.4503,
+      "step": 36610
+    },
+    {
+      "epoch": 97.65333333333334,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.00029828422836552696,
+      "loss": 0.47,
+      "step": 36620
+    },
+    {
+      "epoch": 97.68,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029828327998467156,
+      "loss": 0.4609,
+      "step": 36630
+    },
+    {
+      "epoch": 97.70666666666666,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00029828233134329154,
+      "loss": 0.4614,
+      "step": 36640
+    },
+    {
+      "epoch": 97.73333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002982813824413887,
+      "loss": 0.4802,
+      "step": 36650
+    },
+    {
+      "epoch": 97.76,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002982804332789646,
+      "loss": 0.482,
+      "step": 36660
+    },
+    {
+      "epoch": 97.78666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029827948385602093,
+      "loss": 0.4809,
+      "step": 36670
+    },
+    {
+      "epoch": 97.81333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029827853417255945,
+      "loss": 0.4649,
+      "step": 36680
+    },
+    {
+      "epoch": 97.84,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029827758422858175,
+      "loss": 0.4717,
+      "step": 36690
+    },
+    {
+      "epoch": 97.86666666666666,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029827663402408946,
+      "loss": 0.4709,
+      "step": 36700
+    },
+    {
+      "epoch": 97.89333333333333,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.00029827568355908425,
+      "loss": 0.4617,
+      "step": 36710
+    },
+    {
+      "epoch": 97.92,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029827473283356793,
+      "loss": 0.4722,
+      "step": 36720
+    },
+    {
+      "epoch": 97.94666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029827378184754205,
+      "loss": 0.4649,
+      "step": 36730
+    },
+    {
+      "epoch": 97.97333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002982728306010083,
+      "loss": 0.4762,
+      "step": 36740
+    },
+    {
+      "epoch": 98.0,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002982718790939684,
+      "loss": 0.4653,
+      "step": 36750
+    },
+    {
+      "epoch": 98.0,
+      "eval_loss": 0.4806678295135498,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9011,
+      "eval_samples_per_second": 1.468,
+      "eval_steps_per_second": 0.092,
+      "step": 36750
+    },
+    {
+      "epoch": 98.02666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029827092732642393,
+      "loss": 0.4849,
+      "step": 36760
+    },
+    {
+      "epoch": 98.05333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002982699752983766,
+      "loss": 0.4905,
+      "step": 36770
+    },
+    {
+      "epoch": 98.08,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.00029826902300982813,
+      "loss": 0.4775,
+      "step": 36780
+    },
+    {
+      "epoch": 98.10666666666667,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.00029826807046078014,
+      "loss": 0.4729,
+      "step": 36790
+    },
+    {
+      "epoch": 98.13333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029826711765123436,
+      "loss": 0.4738,
+      "step": 36800
+    },
+    {
+      "epoch": 98.16,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029826616458119236,
+      "loss": 0.4782,
+      "step": 36810
+    },
+    {
+      "epoch": 98.18666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029826521125065594,
+      "loss": 0.4706,
+      "step": 36820
+    },
+    {
+      "epoch": 98.21333333333334,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029826425765962667,
+      "loss": 0.4655,
+      "step": 36830
+    },
+    {
+      "epoch": 98.24,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029826330380810633,
+      "loss": 0.4698,
+      "step": 36840
+    },
+    {
+      "epoch": 98.26666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029826234969609645,
+      "loss": 0.4734,
+      "step": 36850
+    },
+    {
+      "epoch": 98.29333333333334,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002982613953235989,
+      "loss": 0.467,
+      "step": 36860
+    },
+    {
+      "epoch": 98.32,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002982604406906152,
+      "loss": 0.4711,
+      "step": 36870
+    },
+    {
+      "epoch": 98.34666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.000298259485797147,
+      "loss": 0.4794,
+      "step": 36880
+    },
+    {
+      "epoch": 98.37333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029825853064319613,
+      "loss": 0.4754,
+      "step": 36890
+    },
+    {
+      "epoch": 98.4,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002982575752287642,
+      "loss": 0.4721,
+      "step": 36900
+    },
+    {
+      "epoch": 98.42666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002982566195538529,
+      "loss": 0.4787,
+      "step": 36910
+    },
+    {
+      "epoch": 98.45333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002982556636184638,
+      "loss": 0.48,
+      "step": 36920
+    },
+    {
+      "epoch": 98.48,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029825470742259877,
+      "loss": 0.487,
+      "step": 36930
+    },
+    {
+      "epoch": 98.50666666666666,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029825375096625934,
+      "loss": 0.4761,
+      "step": 36940
+    },
+    {
+      "epoch": 98.53333333333333,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029825279424944723,
+      "loss": 0.4703,
+      "step": 36950
+    },
+    {
+      "epoch": 98.56,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029825183727216417,
+      "loss": 0.4703,
+      "step": 36960
+    },
+    {
+      "epoch": 98.58666666666667,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002982508800344118,
+      "loss": 0.4668,
+      "step": 36970
+    },
+    {
+      "epoch": 98.61333333333333,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0002982499225361918,
+      "loss": 0.4524,
+      "step": 36980
+    },
+    {
+      "epoch": 98.64,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029824896477750586,
+      "loss": 0.4575,
+      "step": 36990
+    },
+    {
+      "epoch": 98.66666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029824800675835563,
+      "loss": 0.4738,
+      "step": 37000
+    },
+    {
+      "epoch": 98.69333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029824704847874285,
+      "loss": 0.4561,
+      "step": 37010
+    },
+    {
+      "epoch": 98.72,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029824608993866915,
+      "loss": 0.4733,
+      "step": 37020
+    },
+    {
+      "epoch": 98.74666666666667,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002982451311381362,
+      "loss": 0.4774,
+      "step": 37030
+    },
+    {
+      "epoch": 98.77333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002982441720771458,
+      "loss": 0.4845,
+      "step": 37040
+    },
+    {
+      "epoch": 98.8,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029824321275569957,
+      "loss": 0.4724,
+      "step": 37050
+    },
+    {
+      "epoch": 98.82666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002982422531737991,
+      "loss": 0.4665,
+      "step": 37060
+    },
+    {
+      "epoch": 98.85333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029824129333144624,
+      "loss": 0.476,
+      "step": 37070
+    },
+    {
+      "epoch": 98.88,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029824033322864256,
+      "loss": 0.4641,
+      "step": 37080
+    },
+    {
+      "epoch": 98.90666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029823937286538977,
+      "loss": 0.4658,
+      "step": 37090
+    },
+    {
+      "epoch": 98.93333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029823841224168966,
+      "loss": 0.4684,
+      "step": 37100
+    },
+    {
+      "epoch": 98.96,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029823745135754376,
+      "loss": 0.4661,
+      "step": 37110
+    },
+    {
+      "epoch": 98.98666666666666,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029823649021295384,
+      "loss": 0.4795,
+      "step": 37120
+    },
+    {
+      "epoch": 99.0,
+      "eval_loss": 0.4802570939064026,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9453,
+      "eval_samples_per_second": 1.462,
+      "eval_steps_per_second": 0.091,
+      "step": 37125
+    },
+    {
+      "epoch": 99.01333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029823552880792154,
+      "loss": 0.4695,
+      "step": 37130
+    },
+    {
+      "epoch": 99.04,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029823456714244865,
+      "loss": 0.4936,
+      "step": 37140
+    },
+    {
+      "epoch": 99.06666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029823360521653673,
+      "loss": 0.4806,
+      "step": 37150
+    },
+    {
+      "epoch": 99.09333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002982326430301876,
+      "loss": 0.4756,
+      "step": 37160
+    },
+    {
+      "epoch": 99.12,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002982316805834028,
+      "loss": 0.4718,
+      "step": 37170
+    },
+    {
+      "epoch": 99.14666666666666,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002982307178761842,
+      "loss": 0.4809,
+      "step": 37180
+    },
+    {
+      "epoch": 99.17333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029822975490853337,
+      "loss": 0.4712,
+      "step": 37190
+    },
+    {
+      "epoch": 99.2,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029822879168045203,
+      "loss": 0.4701,
+      "step": 37200
+    },
+    {
+      "epoch": 99.22666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029822782819194187,
+      "loss": 0.4669,
+      "step": 37210
+    },
+    {
+      "epoch": 99.25333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002982268644430045,
+      "loss": 0.467,
+      "step": 37220
+    },
+    {
+      "epoch": 99.28,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029822590043364176,
+      "loss": 0.4808,
+      "step": 37230
+    },
+    {
+      "epoch": 99.30666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029822493616385535,
+      "loss": 0.462,
+      "step": 37240
+    },
+    {
+      "epoch": 99.33333333333333,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002982239716336468,
+      "loss": 0.4776,
+      "step": 37250
+    },
+    {
+      "epoch": 99.36,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.000298223006843018,
+      "loss": 0.4789,
+      "step": 37260
+    },
+    {
+      "epoch": 99.38666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029822204179197044,
+      "loss": 0.4706,
+      "step": 37270
+    },
+    {
+      "epoch": 99.41333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000298221076480506,
+      "loss": 0.4755,
+      "step": 37280
+    },
+    {
+      "epoch": 99.44,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002982201109086263,
+      "loss": 0.4781,
+      "step": 37290
+    },
+    {
+      "epoch": 99.46666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029821914507633296,
+      "loss": 0.4876,
+      "step": 37300
+    },
+    {
+      "epoch": 99.49333333333334,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029821817898362783,
+      "loss": 0.4791,
+      "step": 37310
+    },
+    {
+      "epoch": 99.52,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029821721263051246,
+      "loss": 0.4731,
+      "step": 37320
+    },
+    {
+      "epoch": 99.54666666666667,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00029821624601698865,
+      "loss": 0.4702,
+      "step": 37330
+    },
+    {
+      "epoch": 99.57333333333334,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029821527914305806,
+      "loss": 0.4691,
+      "step": 37340
+    },
+    {
+      "epoch": 99.6,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002982143120087224,
+      "loss": 0.4626,
+      "step": 37350
+    },
+    {
+      "epoch": 99.62666666666667,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0002982133446139833,
+      "loss": 0.4501,
+      "step": 37360
+    },
+    {
+      "epoch": 99.65333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002982123769588426,
+      "loss": 0.4697,
+      "step": 37370
+    },
+    {
+      "epoch": 99.68,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029821140904330194,
+      "loss": 0.4614,
+      "step": 37380
+    },
+    {
+      "epoch": 99.70666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029821044086736294,
+      "loss": 0.4617,
+      "step": 37390
+    },
+    {
+      "epoch": 99.73333333333333,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002982094724310274,
+      "loss": 0.4804,
+      "step": 37400
+    },
+    {
+      "epoch": 99.76,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029820850373429695,
+      "loss": 0.4819,
+      "step": 37410
+    },
+    {
+      "epoch": 99.78666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002982075347771734,
+      "loss": 0.4809,
+      "step": 37420
+    },
+    {
+      "epoch": 99.81333333333333,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029820656555965827,
+      "loss": 0.466,
+      "step": 37430
+    },
+    {
+      "epoch": 99.84,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002982055960817534,
+      "loss": 0.4708,
+      "step": 37440
+    },
+    {
+      "epoch": 99.86666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029820462634346054,
+      "loss": 0.4712,
+      "step": 37450
+    },
+    {
+      "epoch": 99.89333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002982036563447813,
+      "loss": 0.4615,
+      "step": 37460
+    },
+    {
+      "epoch": 99.92,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029820268608571733,
+      "loss": 0.472,
+      "step": 37470
+    },
+    {
+      "epoch": 99.94666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029820171556627046,
+      "loss": 0.4642,
+      "step": 37480
+    },
+    {
+      "epoch": 99.97333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029820074478644236,
+      "loss": 0.4754,
+      "step": 37490
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002981997737462347,
+      "loss": 0.4642,
+      "step": 37500
+    },
+    {
+      "epoch": 100.0,
+      "eval_loss": 0.480648934841156,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0038,
+      "eval_samples_per_second": 1.599,
+      "eval_steps_per_second": 0.1,
+      "step": 37500
+    },
+    {
+      "epoch": 100.02666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002981988024456492,
+      "loss": 0.4849,
+      "step": 37510
+    },
+    {
+      "epoch": 100.05333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002981978308846876,
+      "loss": 0.4902,
+      "step": 37520
+    },
+    {
+      "epoch": 100.08,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002981968590633515,
+      "loss": 0.4771,
+      "step": 37530
+    },
+    {
+      "epoch": 100.10666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029819588698164277,
+      "loss": 0.4719,
+      "step": 37540
+    },
+    {
+      "epoch": 100.13333333333334,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029819491463956303,
+      "loss": 0.4741,
+      "step": 37550
+    },
+    {
+      "epoch": 100.16,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029819394203711393,
+      "loss": 0.4785,
+      "step": 37560
+    },
+    {
+      "epoch": 100.18666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029819296917429726,
+      "loss": 0.4705,
+      "step": 37570
+    },
+    {
+      "epoch": 100.21333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029819199605111475,
+      "loss": 0.4656,
+      "step": 37580
+    },
+    {
+      "epoch": 100.24,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000298191022667568,
+      "loss": 0.4696,
+      "step": 37590
+    },
+    {
+      "epoch": 100.26666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002981900490236589,
+      "loss": 0.4733,
+      "step": 37600
+    },
+    {
+      "epoch": 100.29333333333334,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029818907511938893,
+      "loss": 0.4675,
+      "step": 37610
+    },
+    {
+      "epoch": 100.32,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029818810095476,
+      "loss": 0.471,
+      "step": 37620
+    },
+    {
+      "epoch": 100.34666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029818712652977374,
+      "loss": 0.4788,
+      "step": 37630
+    },
+    {
+      "epoch": 100.37333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029818615184443187,
+      "loss": 0.4755,
+      "step": 37640
+    },
+    {
+      "epoch": 100.4,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002981851768987361,
+      "loss": 0.4713,
+      "step": 37650
+    },
+    {
+      "epoch": 100.42666666666666,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002981842016926881,
+      "loss": 0.4781,
+      "step": 37660
+    },
+    {
+      "epoch": 100.45333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029818322622628964,
+      "loss": 0.4798,
+      "step": 37670
+    },
+    {
+      "epoch": 100.48,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029818225049954236,
+      "loss": 0.4867,
+      "step": 37680
+    },
+    {
+      "epoch": 100.50666666666666,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0002981812745124481,
+      "loss": 0.4764,
+      "step": 37690
+    },
+    {
+      "epoch": 100.53333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029818029826500855,
+      "loss": 0.4697,
+      "step": 37700
+    },
+    {
+      "epoch": 100.56,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002981793217572253,
+      "loss": 0.4707,
+      "step": 37710
+    },
+    {
+      "epoch": 100.58666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002981783449891002,
+      "loss": 0.4665,
+      "step": 37720
+    },
+    {
+      "epoch": 100.61333333333333,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029817736796063484,
+      "loss": 0.452,
+      "step": 37730
+    },
+    {
+      "epoch": 100.64,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002981763906718311,
+      "loss": 0.4574,
+      "step": 37740
+    },
+    {
+      "epoch": 100.66666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029817541312269054,
+      "loss": 0.4733,
+      "step": 37750
+    },
+    {
+      "epoch": 100.69333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.000298174435313215,
+      "loss": 0.4558,
+      "step": 37760
+    },
+    {
+      "epoch": 100.72,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002981734572434061,
+      "loss": 0.4734,
+      "step": 37770
+    },
+    {
+      "epoch": 100.74666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029817247891326563,
+      "loss": 0.4776,
+      "step": 37780
+    },
+    {
+      "epoch": 100.77333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029817150032279525,
+      "loss": 0.4846,
+      "step": 37790
+    },
+    {
+      "epoch": 100.8,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029817052147199677,
+      "loss": 0.4719,
+      "step": 37800
+    },
+    {
+      "epoch": 100.82666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002981695423608718,
+      "loss": 0.4658,
+      "step": 37810
+    },
+    {
+      "epoch": 100.85333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002981685629894221,
+      "loss": 0.4757,
+      "step": 37820
+    },
+    {
+      "epoch": 100.88,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002981675833576494,
+      "loss": 0.4634,
+      "step": 37830
+    },
+    {
+      "epoch": 100.90666666666667,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029816660346555546,
+      "loss": 0.4664,
+      "step": 37840
+    },
+    {
+      "epoch": 100.93333333333334,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029816562331314196,
+      "loss": 0.4679,
+      "step": 37850
+    },
+    {
+      "epoch": 100.96,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002981646429004106,
+      "loss": 0.4662,
+      "step": 37860
+    },
+    {
+      "epoch": 100.98666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029816366222736317,
+      "loss": 0.4791,
+      "step": 37870
+    },
+    {
+      "epoch": 101.0,
+      "eval_loss": 0.4809539318084717,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7941,
+      "eval_samples_per_second": 1.634,
+      "eval_steps_per_second": 0.102,
+      "step": 37875
+    },
+    {
+      "epoch": 101.01333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002981626812940013,
+      "loss": 0.4689,
+      "step": 37880
+    },
+    {
+      "epoch": 101.04,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.00029816170010032674,
+      "loss": 0.4927,
+      "step": 37890
+    },
+    {
+      "epoch": 101.06666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029816071864634133,
+      "loss": 0.481,
+      "step": 37900
+    },
+    {
+      "epoch": 101.09333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002981597369320466,
+      "loss": 0.4753,
+      "step": 37910
+    },
+    {
+      "epoch": 101.12,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029815875495744446,
+      "loss": 0.4713,
+      "step": 37920
+    },
+    {
+      "epoch": 101.14666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002981577727225365,
+      "loss": 0.48,
+      "step": 37930
+    },
+    {
+      "epoch": 101.17333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029815679022732454,
+      "loss": 0.4705,
+      "step": 37940
+    },
+    {
+      "epoch": 101.2,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002981558074718102,
+      "loss": 0.4693,
+      "step": 37950
+    },
+    {
+      "epoch": 101.22666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029815482445599535,
+      "loss": 0.4668,
+      "step": 37960
+    },
+    {
+      "epoch": 101.25333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002981538411798816,
+      "loss": 0.4668,
+      "step": 37970
+    },
+    {
+      "epoch": 101.28,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002981528576434707,
+      "loss": 0.4801,
+      "step": 37980
+    },
+    {
+      "epoch": 101.30666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029815187384676445,
+      "loss": 0.4613,
+      "step": 37990
+    },
+    {
+      "epoch": 101.33333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002981508897897645,
+      "loss": 0.4769,
+      "step": 38000
+    },
+    {
+      "epoch": 101.36,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029814990547247255,
+      "loss": 0.4786,
+      "step": 38010
+    },
+    {
+      "epoch": 101.38666666666667,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029814892089489043,
+      "loss": 0.4704,
+      "step": 38020
+    },
+    {
+      "epoch": 101.41333333333333,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.00029814793605701984,
+      "loss": 0.4746,
+      "step": 38030
+    },
+    {
+      "epoch": 101.44,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029814695095886244,
+      "loss": 0.4782,
+      "step": 38040
+    },
+    {
+      "epoch": 101.46666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029814596560042003,
+      "loss": 0.487,
+      "step": 38050
+    },
+    {
+      "epoch": 101.49333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029814497998169434,
+      "loss": 0.4788,
+      "step": 38060
+    },
+    {
+      "epoch": 101.52,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029814399410268706,
+      "loss": 0.4734,
+      "step": 38070
+    },
+    {
+      "epoch": 101.54666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002981430079633999,
+      "loss": 0.4702,
+      "step": 38080
+    },
+    {
+      "epoch": 101.57333333333334,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002981420215638347,
+      "loss": 0.4695,
+      "step": 38090
+    },
+    {
+      "epoch": 101.6,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029814103490399314,
+      "loss": 0.4633,
+      "step": 38100
+    },
+    {
+      "epoch": 101.62666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002981400479838769,
+      "loss": 0.4498,
+      "step": 38110
+    },
+    {
+      "epoch": 101.65333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029813906080348784,
+      "loss": 0.4692,
+      "step": 38120
+    },
+    {
+      "epoch": 101.68,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029813807336282757,
+      "loss": 0.4609,
+      "step": 38130
+    },
+    {
+      "epoch": 101.70666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029813708566189784,
+      "loss": 0.4614,
+      "step": 38140
+    },
+    {
+      "epoch": 101.73333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002981360977007004,
+      "loss": 0.4796,
+      "step": 38150
+    },
+    {
+      "epoch": 101.76,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002981351094792371,
+      "loss": 0.4813,
+      "step": 38160
+    },
+    {
+      "epoch": 101.78666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029813412099750946,
+      "loss": 0.4804,
+      "step": 38170
+    },
+    {
+      "epoch": 101.81333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029813313225551943,
+      "loss": 0.4651,
+      "step": 38180
+    },
+    {
+      "epoch": 101.84,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029813214325326857,
+      "loss": 0.4714,
+      "step": 38190
+    },
+    {
+      "epoch": 101.86666666666666,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002981311539907587,
+      "loss": 0.4709,
+      "step": 38200
+    },
+    {
+      "epoch": 101.89333333333333,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0002981301644679916,
+      "loss": 0.4609,
+      "step": 38210
+    },
+    {
+      "epoch": 101.92,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.000298129174684969,
+      "loss": 0.4719,
+      "step": 38220
+    },
+    {
+      "epoch": 101.94666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029812818464169255,
+      "loss": 0.4649,
+      "step": 38230
+    },
+    {
+      "epoch": 101.97333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.000298127194338164,
+      "loss": 0.4762,
+      "step": 38240
+    },
+    {
+      "epoch": 102.0,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002981262037743852,
+      "loss": 0.4645,
+      "step": 38250
+    },
+    {
+      "epoch": 102.0,
+      "eval_loss": 0.48175567388534546,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3943,
+      "eval_samples_per_second": 1.539,
+      "eval_steps_per_second": 0.096,
+      "step": 38250
+    },
+    {
+      "epoch": 102.02666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002981252129503578,
+      "loss": 0.484,
+      "step": 38260
+    },
+    {
+      "epoch": 102.05333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002981242218660836,
+      "loss": 0.4902,
+      "step": 38270
+    },
+    {
+      "epoch": 102.08,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.00029812323052156426,
+      "loss": 0.4776,
+      "step": 38280
+    },
+    {
+      "epoch": 102.10666666666667,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029812223891680155,
+      "loss": 0.4727,
+      "step": 38290
+    },
+    {
+      "epoch": 102.13333333333334,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002981212470517973,
+      "loss": 0.474,
+      "step": 38300
+    },
+    {
+      "epoch": 102.16,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002981202549265531,
+      "loss": 0.4785,
+      "step": 38310
+    },
+    {
+      "epoch": 102.18666666666667,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0002981192625410708,
+      "loss": 0.4705,
+      "step": 38320
+    },
+    {
+      "epoch": 102.21333333333334,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029811826989535216,
+      "loss": 0.4655,
+      "step": 38330
+    },
+    {
+      "epoch": 102.24,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029811727698939884,
+      "loss": 0.4695,
+      "step": 38340
+    },
+    {
+      "epoch": 102.26666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029811628382321267,
+      "loss": 0.4737,
+      "step": 38350
+    },
+    {
+      "epoch": 102.29333333333334,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002981152903967953,
+      "loss": 0.4678,
+      "step": 38360
+    },
+    {
+      "epoch": 102.32,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.00029811429671014857,
+      "loss": 0.4713,
+      "step": 38370
+    },
+    {
+      "epoch": 102.34666666666666,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029811330276327416,
+      "loss": 0.479,
+      "step": 38380
+    },
+    {
+      "epoch": 102.37333333333333,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002981123085561738,
+      "loss": 0.4751,
+      "step": 38390
+    },
+    {
+      "epoch": 102.4,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002981113140888494,
+      "loss": 0.4718,
+      "step": 38400
+    },
+    {
+      "epoch": 102.42666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029811031936130246,
+      "loss": 0.4786,
+      "step": 38410
+    },
+    {
+      "epoch": 102.45333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002981093243735349,
+      "loss": 0.4801,
+      "step": 38420
+    },
+    {
+      "epoch": 102.48,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002981083291255484,
+      "loss": 0.4866,
+      "step": 38430
+    },
+    {
+      "epoch": 102.50666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002981073336173447,
+      "loss": 0.4758,
+      "step": 38440
+    },
+    {
+      "epoch": 102.53333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029810633784892567,
+      "loss": 0.4702,
+      "step": 38450
+    },
+    {
+      "epoch": 102.56,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002981053418202929,
+      "loss": 0.4708,
+      "step": 38460
+    },
+    {
+      "epoch": 102.58666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029810434553144817,
+      "loss": 0.4667,
+      "step": 38470
+    },
+    {
+      "epoch": 102.61333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002981033489823933,
+      "loss": 0.4526,
+      "step": 38480
+    },
+    {
+      "epoch": 102.64,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0002981023521731301,
+      "loss": 0.4573,
+      "step": 38490
+    },
+    {
+      "epoch": 102.66666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002981013551036601,
+      "loss": 0.4731,
+      "step": 38500
+    },
+    {
+      "epoch": 102.69333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002981003577739852,
+      "loss": 0.4562,
+      "step": 38510
+    },
+    {
+      "epoch": 102.72,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029809936018410717,
+      "loss": 0.4738,
+      "step": 38520
+    },
+    {
+      "epoch": 102.74666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029809836233402775,
+      "loss": 0.4775,
+      "step": 38530
+    },
+    {
+      "epoch": 102.77333333333333,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029809736422374864,
+      "loss": 0.4845,
+      "step": 38540
+    },
+    {
+      "epoch": 102.8,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029809636585327164,
+      "loss": 0.472,
+      "step": 38550
+    },
+    {
+      "epoch": 102.82666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002980953672225984,
+      "loss": 0.4657,
+      "step": 38560
+    },
+    {
+      "epoch": 102.85333333333334,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029809436833173083,
+      "loss": 0.4747,
+      "step": 38570
+    },
+    {
+      "epoch": 102.88,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029809336918067066,
+      "loss": 0.4635,
+      "step": 38580
+    },
+    {
+      "epoch": 102.90666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029809236976941953,
+      "loss": 0.4657,
+      "step": 38590
+    },
+    {
+      "epoch": 102.93333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029809137009797934,
+      "loss": 0.4685,
+      "step": 38600
+    },
+    {
+      "epoch": 102.96,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002980903701663517,
+      "loss": 0.4663,
+      "step": 38610
+    },
+    {
+      "epoch": 102.98666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029808936997453844,
+      "loss": 0.4792,
+      "step": 38620
+    },
+    {
+      "epoch": 103.0,
+      "eval_loss": 0.47996532917022705,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0526,
+      "eval_samples_per_second": 1.592,
+      "eval_steps_per_second": 0.099,
+      "step": 38625
+    },
+    {
+      "epoch": 103.01333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002980883695225413,
+      "loss": 0.4684,
+      "step": 38630
+    },
+    {
+      "epoch": 103.04,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002980873688103621,
+      "loss": 0.4925,
+      "step": 38640
+    },
+    {
+      "epoch": 103.06666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002980863678380026,
+      "loss": 0.4806,
+      "step": 38650
+    },
+    {
+      "epoch": 103.09333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002980853666054645,
+      "loss": 0.4748,
+      "step": 38660
+    },
+    {
+      "epoch": 103.12,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002980843651127495,
+      "loss": 0.472,
+      "step": 38670
+    },
+    {
+      "epoch": 103.14666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002980833633598595,
+      "loss": 0.4801,
+      "step": 38680
+    },
+    {
+      "epoch": 103.17333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002980823613467961,
+      "loss": 0.4712,
+      "step": 38690
+    },
+    {
+      "epoch": 103.2,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002980813590735612,
+      "loss": 0.469,
+      "step": 38700
+    },
+    {
+      "epoch": 103.22666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002980803565401565,
+      "loss": 0.4664,
+      "step": 38710
+    },
+    {
+      "epoch": 103.25333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002980793537465838,
+      "loss": 0.4663,
+      "step": 38720
+    },
+    {
+      "epoch": 103.28,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002980783506928449,
+      "loss": 0.4806,
+      "step": 38730
+    },
+    {
+      "epoch": 103.30666666666667,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029807734737894137,
+      "loss": 0.4613,
+      "step": 38740
+    },
+    {
+      "epoch": 103.33333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002980763438048752,
+      "loss": 0.477,
+      "step": 38750
+    },
+    {
+      "epoch": 103.36,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.000298075339970648,
+      "loss": 0.4788,
+      "step": 38760
+    },
+    {
+      "epoch": 103.38666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029807433587626154,
+      "loss": 0.4707,
+      "step": 38770
+    },
+    {
+      "epoch": 103.41333333333333,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029807333152171773,
+      "loss": 0.4753,
+      "step": 38780
+    },
+    {
+      "epoch": 103.44,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002980723269070182,
+      "loss": 0.4777,
+      "step": 38790
+    },
+    {
+      "epoch": 103.46666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029807132203216473,
+      "loss": 0.4874,
+      "step": 38800
+    },
+    {
+      "epoch": 103.49333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002980703168971591,
+      "loss": 0.479,
+      "step": 38810
+    },
+    {
+      "epoch": 103.52,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029806931150200316,
+      "loss": 0.4723,
+      "step": 38820
+    },
+    {
+      "epoch": 103.54666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002980683058466985,
+      "loss": 0.4698,
+      "step": 38830
+    },
+    {
+      "epoch": 103.57333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029806729993124705,
+      "loss": 0.4691,
+      "step": 38840
+    },
+    {
+      "epoch": 103.6,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002980662937556505,
+      "loss": 0.4632,
+      "step": 38850
+    },
+    {
+      "epoch": 103.62666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002980652873199106,
+      "loss": 0.4501,
+      "step": 38860
+    },
+    {
+      "epoch": 103.65333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002980642806240292,
+      "loss": 0.4692,
+      "step": 38870
+    },
+    {
+      "epoch": 103.68,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.000298063273668008,
+      "loss": 0.4604,
+      "step": 38880
+    },
+    {
+      "epoch": 103.70666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002980622664518488,
+      "loss": 0.4616,
+      "step": 38890
+    },
+    {
+      "epoch": 103.73333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002980612589755533,
+      "loss": 0.4807,
+      "step": 38900
+    },
+    {
+      "epoch": 103.76,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029806025123912337,
+      "loss": 0.4809,
+      "step": 38910
+    },
+    {
+      "epoch": 103.78666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029805924324256074,
+      "loss": 0.4796,
+      "step": 38920
+    },
+    {
+      "epoch": 103.81333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029805823498586716,
+      "loss": 0.4656,
+      "step": 38930
+    },
+    {
+      "epoch": 103.84,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029805722646904443,
+      "loss": 0.471,
+      "step": 38940
+    },
+    {
+      "epoch": 103.86666666666666,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002980562176920943,
+      "loss": 0.4702,
+      "step": 38950
+    },
+    {
+      "epoch": 103.89333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029805520865501853,
+      "loss": 0.462,
+      "step": 38960
+    },
+    {
+      "epoch": 103.92,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000298054199357819,
+      "loss": 0.472,
+      "step": 38970
+    },
+    {
+      "epoch": 103.94666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002980531898004973,
+      "loss": 0.4643,
+      "step": 38980
+    },
+    {
+      "epoch": 103.97333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002980521799830554,
+      "loss": 0.4761,
+      "step": 38990
+    },
+    {
+      "epoch": 104.0,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002980511699054949,
+      "loss": 0.4645,
+      "step": 39000
+    },
+    {
+      "epoch": 104.0,
+      "eval_loss": 0.4795394241809845,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7369,
+      "eval_samples_per_second": 1.49,
+      "eval_steps_per_second": 0.093,
+      "step": 39000
+    },
+    {
+      "epoch": 104.02666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029805015956781773,
+      "loss": 0.4843,
+      "step": 39010
+    },
+    {
+      "epoch": 104.05333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029804914897002554,
+      "loss": 0.4906,
+      "step": 39020
+    },
+    {
+      "epoch": 104.08,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002980481381121202,
+      "loss": 0.4774,
+      "step": 39030
+    },
+    {
+      "epoch": 104.10666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029804712699410334,
+      "loss": 0.4723,
+      "step": 39040
+    },
+    {
+      "epoch": 104.13333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002980461156159769,
+      "loss": 0.4736,
+      "step": 39050
+    },
+    {
+      "epoch": 104.16,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029804510397774263,
+      "loss": 0.4792,
+      "step": 39060
+    },
+    {
+      "epoch": 104.18666666666667,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002980440920794022,
+      "loss": 0.4707,
+      "step": 39070
+    },
+    {
+      "epoch": 104.21333333333334,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029804307992095744,
+      "loss": 0.465,
+      "step": 39080
+    },
+    {
+      "epoch": 104.24,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002980420675024102,
+      "loss": 0.4689,
+      "step": 39090
+    },
+    {
+      "epoch": 104.26666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002980410548237622,
+      "loss": 0.4735,
+      "step": 39100
+    },
+    {
+      "epoch": 104.29333333333334,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029804004188501527,
+      "loss": 0.4675,
+      "step": 39110
+    },
+    {
+      "epoch": 104.32,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002980390286861711,
+      "loss": 0.4709,
+      "step": 39120
+    },
+    {
+      "epoch": 104.34666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002980380152272315,
+      "loss": 0.4788,
+      "step": 39130
+    },
+    {
+      "epoch": 104.37333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029803700150819826,
+      "loss": 0.4754,
+      "step": 39140
+    },
+    {
+      "epoch": 104.4,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002980359875290732,
+      "loss": 0.4713,
+      "step": 39150
+    },
+    {
+      "epoch": 104.42666666666666,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.000298034973289858,
+      "loss": 0.4783,
+      "step": 39160
+    },
+    {
+      "epoch": 104.45333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029803395879055456,
+      "loss": 0.4797,
+      "step": 39170
+    },
+    {
+      "epoch": 104.48,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029803294403116465,
+      "loss": 0.4865,
+      "step": 39180
+    },
+    {
+      "epoch": 104.50666666666666,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029803192901169,
+      "loss": 0.4755,
+      "step": 39190
+    },
+    {
+      "epoch": 104.53333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029803091373213235,
+      "loss": 0.4699,
+      "step": 39200
+    },
+    {
+      "epoch": 104.56,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002980298981924936,
+      "loss": 0.4708,
+      "step": 39210
+    },
+    {
+      "epoch": 104.58666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002980288823927754,
+      "loss": 0.4661,
+      "step": 39220
+    },
+    {
+      "epoch": 104.61333333333333,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00029802786633297966,
+      "loss": 0.4527,
+      "step": 39230
+    },
+    {
+      "epoch": 104.64,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029802685001310814,
+      "loss": 0.4575,
+      "step": 39240
+    },
+    {
+      "epoch": 104.66666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029802583343316254,
+      "loss": 0.4732,
+      "step": 39250
+    },
+    {
+      "epoch": 104.69333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029802481659314475,
+      "loss": 0.4562,
+      "step": 39260
+    },
+    {
+      "epoch": 104.72,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002980237994930565,
+      "loss": 0.4731,
+      "step": 39270
+    },
+    {
+      "epoch": 104.74666666666667,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029802278213289957,
+      "loss": 0.4775,
+      "step": 39280
+    },
+    {
+      "epoch": 104.77333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002980217645126758,
+      "loss": 0.4846,
+      "step": 39290
+    },
+    {
+      "epoch": 104.8,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029802074663238696,
+      "loss": 0.4713,
+      "step": 39300
+    },
+    {
+      "epoch": 104.82666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002980197284920348,
+      "loss": 0.4662,
+      "step": 39310
+    },
+    {
+      "epoch": 104.85333333333334,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029801871009162113,
+      "loss": 0.4752,
+      "step": 39320
+    },
+    {
+      "epoch": 104.88,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029801769143114777,
+      "loss": 0.4635,
+      "step": 39330
+    },
+    {
+      "epoch": 104.90666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002980166725106165,
+      "loss": 0.4655,
+      "step": 39340
+    },
+    {
+      "epoch": 104.93333333333334,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.000298015653330029,
+      "loss": 0.4679,
+      "step": 39350
+    },
+    {
+      "epoch": 104.96,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029801463388938725,
+      "loss": 0.4651,
+      "step": 39360
+    },
+    {
+      "epoch": 104.98666666666666,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029801361418869293,
+      "loss": 0.4797,
+      "step": 39370
+    },
+    {
+      "epoch": 105.0,
+      "eval_loss": 0.4803178310394287,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7476,
+      "eval_samples_per_second": 1.641,
+      "eval_steps_per_second": 0.103,
+      "step": 39375
+    },
+    {
+      "epoch": 105.01333333333334,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0002980125942279478,
+      "loss": 0.4686,
+      "step": 39380
+    },
+    {
+      "epoch": 105.04,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002980115740071537,
+      "loss": 0.4933,
+      "step": 39390
+    },
+    {
+      "epoch": 105.06666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029801055352631243,
+      "loss": 0.48,
+      "step": 39400
+    },
+    {
+      "epoch": 105.09333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002980095327854258,
+      "loss": 0.4745,
+      "step": 39410
+    },
+    {
+      "epoch": 105.12,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002980085117844956,
+      "loss": 0.4716,
+      "step": 39420
+    },
+    {
+      "epoch": 105.14666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029800749052352353,
+      "loss": 0.4807,
+      "step": 39430
+    },
+    {
+      "epoch": 105.17333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029800646900251143,
+      "loss": 0.4713,
+      "step": 39440
+    },
+    {
+      "epoch": 105.2,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002980054472214612,
+      "loss": 0.469,
+      "step": 39450
+    },
+    {
+      "epoch": 105.22666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002980044251803745,
+      "loss": 0.4664,
+      "step": 39460
+    },
+    {
+      "epoch": 105.25333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029800340287925326,
+      "loss": 0.4668,
+      "step": 39470
+    },
+    {
+      "epoch": 105.28,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029800238031809913,
+      "loss": 0.4803,
+      "step": 39480
+    },
+    {
+      "epoch": 105.30666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000298001357496914,
+      "loss": 0.4608,
+      "step": 39490
+    },
+    {
+      "epoch": 105.33333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029800033441569965,
+      "loss": 0.4766,
+      "step": 39500
+    },
+    {
+      "epoch": 105.36,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002979993110744578,
+      "loss": 0.4783,
+      "step": 39510
+    },
+    {
+      "epoch": 105.38666666666667,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002979982874731904,
+      "loss": 0.4706,
+      "step": 39520
+    },
+    {
+      "epoch": 105.41333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029799726361189906,
+      "loss": 0.4752,
+      "step": 39530
+    },
+    {
+      "epoch": 105.44,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002979962394905858,
+      "loss": 0.4779,
+      "step": 39540
+    },
+    {
+      "epoch": 105.46666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029799521510925225,
+      "loss": 0.4871,
+      "step": 39550
+    },
+    {
+      "epoch": 105.49333333333334,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002979941904679003,
+      "loss": 0.4789,
+      "step": 39560
+    },
+    {
+      "epoch": 105.52,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029799316556653165,
+      "loss": 0.4727,
+      "step": 39570
+    },
+    {
+      "epoch": 105.54666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002979921404051482,
+      "loss": 0.47,
+      "step": 39580
+    },
+    {
+      "epoch": 105.57333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029799111498375163,
+      "loss": 0.4687,
+      "step": 39590
+    },
+    {
+      "epoch": 105.6,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029799008930234393,
+      "loss": 0.4626,
+      "step": 39600
+    },
+    {
+      "epoch": 105.62666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029798906336092677,
+      "loss": 0.45,
+      "step": 39610
+    },
+    {
+      "epoch": 105.65333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.000297988037159502,
+      "loss": 0.4699,
+      "step": 39620
+    },
+    {
+      "epoch": 105.68,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002979870106980713,
+      "loss": 0.4608,
+      "step": 39630
+    },
+    {
+      "epoch": 105.70666666666666,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002979859839766367,
+      "loss": 0.4608,
+      "step": 39640
+    },
+    {
+      "epoch": 105.73333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002979849569951998,
+      "loss": 0.4803,
+      "step": 39650
+    },
+    {
+      "epoch": 105.76,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002979839297537625,
+      "loss": 0.4818,
+      "step": 39660
+    },
+    {
+      "epoch": 105.78666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029798290225232664,
+      "loss": 0.4803,
+      "step": 39670
+    },
+    {
+      "epoch": 105.81333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.000297981874490894,
+      "loss": 0.4658,
+      "step": 39680
+    },
+    {
+      "epoch": 105.84,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029798084646946623,
+      "loss": 0.4714,
+      "step": 39690
+    },
+    {
+      "epoch": 105.86666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002979798181880454,
+      "loss": 0.47,
+      "step": 39700
+    },
+    {
+      "epoch": 105.89333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002979787896466331,
+      "loss": 0.4613,
+      "step": 39710
+    },
+    {
+      "epoch": 105.92,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029797776084523127,
+      "loss": 0.4722,
+      "step": 39720
+    },
+    {
+      "epoch": 105.94666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002979767317838416,
+      "loss": 0.464,
+      "step": 39730
+    },
+    {
+      "epoch": 105.97333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029797570246246604,
+      "loss": 0.4754,
+      "step": 39740
+    },
+    {
+      "epoch": 106.0,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029797467288110625,
+      "loss": 0.4648,
+      "step": 39750
+    },
+    {
+      "epoch": 106.0,
+      "eval_loss": 0.47966986894607544,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7977,
+      "eval_samples_per_second": 1.482,
+      "eval_steps_per_second": 0.093,
+      "step": 39750
+    },
+    {
+      "epoch": 106.02666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029797364303976417,
+      "loss": 0.4845,
+      "step": 39760
+    },
+    {
+      "epoch": 106.05333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029797261293844156,
+      "loss": 0.4906,
+      "step": 39770
+    },
+    {
+      "epoch": 106.08,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002979715825771402,
+      "loss": 0.4774,
+      "step": 39780
+    },
+    {
+      "epoch": 106.10666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002979705519558619,
+      "loss": 0.4722,
+      "step": 39790
+    },
+    {
+      "epoch": 106.13333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002979695210746085,
+      "loss": 0.4735,
+      "step": 39800
+    },
+    {
+      "epoch": 106.16,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029796848993338186,
+      "loss": 0.4786,
+      "step": 39810
+    },
+    {
+      "epoch": 106.18666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002979674585321837,
+      "loss": 0.4703,
+      "step": 39820
+    },
+    {
+      "epoch": 106.21333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029796642687101584,
+      "loss": 0.4649,
+      "step": 39830
+    },
+    {
+      "epoch": 106.24,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029796539494988017,
+      "loss": 0.4683,
+      "step": 39840
+    },
+    {
+      "epoch": 106.26666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029796436276877846,
+      "loss": 0.4738,
+      "step": 39850
+    },
+    {
+      "epoch": 106.29333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029796333032771244,
+      "loss": 0.4677,
+      "step": 39860
+    },
+    {
+      "epoch": 106.32,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029796229762668407,
+      "loss": 0.4702,
+      "step": 39870
+    },
+    {
+      "epoch": 106.34666666666666,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002979612646656951,
+      "loss": 0.4795,
+      "step": 39880
+    },
+    {
+      "epoch": 106.37333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002979602314447473,
+      "loss": 0.4748,
+      "step": 39890
+    },
+    {
+      "epoch": 106.4,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029795919796384257,
+      "loss": 0.4716,
+      "step": 39900
+    },
+    {
+      "epoch": 106.42666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002979581642229826,
+      "loss": 0.4779,
+      "step": 39910
+    },
+    {
+      "epoch": 106.45333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029795713022216937,
+      "loss": 0.4798,
+      "step": 39920
+    },
+    {
+      "epoch": 106.48,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002979560959614046,
+      "loss": 0.4864,
+      "step": 39930
+    },
+    {
+      "epoch": 106.50666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002979550614406901,
+      "loss": 0.4762,
+      "step": 39940
+    },
+    {
+      "epoch": 106.53333333333333,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002979540266600277,
+      "loss": 0.4695,
+      "step": 39950
+    },
+    {
+      "epoch": 106.56,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029795299161941926,
+      "loss": 0.4705,
+      "step": 39960
+    },
+    {
+      "epoch": 106.58666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029795195631886657,
+      "loss": 0.4662,
+      "step": 39970
+    },
+    {
+      "epoch": 106.61333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002979509207583714,
+      "loss": 0.4529,
+      "step": 39980
+    },
+    {
+      "epoch": 106.64,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029794988493793564,
+      "loss": 0.4574,
+      "step": 39990
+    },
+    {
+      "epoch": 106.66666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002979488488575611,
+      "loss": 0.4732,
+      "step": 40000
+    },
+    {
+      "epoch": 106.69333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029794781251724957,
+      "loss": 0.4556,
+      "step": 40010
+    },
+    {
+      "epoch": 106.72,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002979467759170029,
+      "loss": 0.4732,
+      "step": 40020
+    },
+    {
+      "epoch": 106.74666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029794573905682283,
+      "loss": 0.4771,
+      "step": 40030
+    },
+    {
+      "epoch": 106.77333333333333,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002979447019367113,
+      "loss": 0.485,
+      "step": 40040
+    },
+    {
+      "epoch": 106.8,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002979436645566701,
+      "loss": 0.4724,
+      "step": 40050
+    },
+    {
+      "epoch": 106.82666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029794262691670105,
+      "loss": 0.4667,
+      "step": 40060
+    },
+    {
+      "epoch": 106.85333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002979415890168059,
+      "loss": 0.4752,
+      "step": 40070
+    },
+    {
+      "epoch": 106.88,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.00029794055085698655,
+      "loss": 0.4635,
+      "step": 40080
+    },
+    {
+      "epoch": 106.90666666666667,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002979395124372448,
+      "loss": 0.466,
+      "step": 40090
+    },
+    {
+      "epoch": 106.93333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029793847375758246,
+      "loss": 0.4676,
+      "step": 40100
+    },
+    {
+      "epoch": 106.96,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029793743481800145,
+      "loss": 0.466,
+      "step": 40110
+    },
+    {
+      "epoch": 106.98666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029793639561850345,
+      "loss": 0.479,
+      "step": 40120
+    },
+    {
+      "epoch": 107.0,
+      "eval_loss": 0.48041215538978577,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0882,
+      "eval_samples_per_second": 1.586,
+      "eval_steps_per_second": 0.099,
+      "step": 40125
+    },
+    {
+      "epoch": 107.01333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002979353561590904,
+      "loss": 0.4689,
+      "step": 40130
+    },
+    {
+      "epoch": 107.04,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.000297934316439764,
+      "loss": 0.4929,
+      "step": 40140
+    },
+    {
+      "epoch": 107.06666666666666,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002979332764605262,
+      "loss": 0.4798,
+      "step": 40150
+    },
+    {
+      "epoch": 107.09333333333333,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002979322362213788,
+      "loss": 0.4746,
+      "step": 40160
+    },
+    {
+      "epoch": 107.12,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002979311957223236,
+      "loss": 0.4713,
+      "step": 40170
+    },
+    {
+      "epoch": 107.14666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002979301549633624,
+      "loss": 0.48,
+      "step": 40180
+    },
+    {
+      "epoch": 107.17333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002979291139444971,
+      "loss": 0.4706,
+      "step": 40190
+    },
+    {
+      "epoch": 107.2,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029792807266572954,
+      "loss": 0.469,
+      "step": 40200
+    },
+    {
+      "epoch": 107.22666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029792703112706145,
+      "loss": 0.4658,
+      "step": 40210
+    },
+    {
+      "epoch": 107.25333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029792598932849477,
+      "loss": 0.467,
+      "step": 40220
+    },
+    {
+      "epoch": 107.28,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029792494727003127,
+      "loss": 0.4804,
+      "step": 40230
+    },
+    {
+      "epoch": 107.30666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029792390495167275,
+      "loss": 0.4612,
+      "step": 40240
+    },
+    {
+      "epoch": 107.33333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002979228623734211,
+      "loss": 0.4765,
+      "step": 40250
+    },
+    {
+      "epoch": 107.36,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002979218195352781,
+      "loss": 0.4787,
+      "step": 40260
+    },
+    {
+      "epoch": 107.38666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029792077643724563,
+      "loss": 0.4702,
+      "step": 40270
+    },
+    {
+      "epoch": 107.41333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029791973307932555,
+      "loss": 0.4752,
+      "step": 40280
+    },
+    {
+      "epoch": 107.44,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0002979186894615196,
+      "loss": 0.4782,
+      "step": 40290
+    },
+    {
+      "epoch": 107.46666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002979176455838297,
+      "loss": 0.4866,
+      "step": 40300
+    },
+    {
+      "epoch": 107.49333333333334,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002979166014462576,
+      "loss": 0.4777,
+      "step": 40310
+    },
+    {
+      "epoch": 107.52,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0002979155570488052,
+      "loss": 0.4726,
+      "step": 40320
+    },
+    {
+      "epoch": 107.54666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029791451239147434,
+      "loss": 0.4697,
+      "step": 40330
+    },
+    {
+      "epoch": 107.57333333333334,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002979134674742668,
+      "loss": 0.4693,
+      "step": 40340
+    },
+    {
+      "epoch": 107.6,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002979124222971844,
+      "loss": 0.4629,
+      "step": 40350
+    },
+    {
+      "epoch": 107.62666666666667,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029791137686022915,
+      "loss": 0.4501,
+      "step": 40360
+    },
+    {
+      "epoch": 107.65333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002979103311634027,
+      "loss": 0.4696,
+      "step": 40370
+    },
+    {
+      "epoch": 107.68,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002979092852067069,
+      "loss": 0.461,
+      "step": 40380
+    },
+    {
+      "epoch": 107.70666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029790823899014375,
+      "loss": 0.4607,
+      "step": 40390
+    },
+    {
+      "epoch": 107.73333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029790719251371485,
+      "loss": 0.4806,
+      "step": 40400
+    },
+    {
+      "epoch": 107.76,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029790614577742227,
+      "loss": 0.4815,
+      "step": 40410
+    },
+    {
+      "epoch": 107.78666666666666,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002979050987812676,
+      "loss": 0.4802,
+      "step": 40420
+    },
+    {
+      "epoch": 107.81333333333333,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.00029790405152525296,
+      "loss": 0.4652,
+      "step": 40430
+    },
+    {
+      "epoch": 107.84,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029790300400938,
+      "loss": 0.471,
+      "step": 40440
+    },
+    {
+      "epoch": 107.86666666666666,
+      "grad_norm": 0.5,
+      "learning_rate": 0.00029790195623365063,
+      "loss": 0.4706,
+      "step": 40450
+    },
+    {
+      "epoch": 107.89333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029790090819806664,
+      "loss": 0.4613,
+      "step": 40460
+    },
+    {
+      "epoch": 107.92,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002978998599026299,
+      "loss": 0.4717,
+      "step": 40470
+    },
+    {
+      "epoch": 107.94666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029789881134734223,
+      "loss": 0.464,
+      "step": 40480
+    },
+    {
+      "epoch": 107.97333333333333,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029789776253220555,
+      "loss": 0.476,
+      "step": 40490
+    },
+    {
+      "epoch": 108.0,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002978967134572216,
+      "loss": 0.4653,
+      "step": 40500
+    },
+    {
+      "epoch": 108.0,
+      "eval_loss": 0.4787892997264862,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2779,
+      "eval_samples_per_second": 1.557,
+      "eval_steps_per_second": 0.097,
+      "step": 40500
+    },
+    {
+      "epoch": 108.02666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029789566412239234,
+      "loss": 0.4844,
+      "step": 40510
+    },
+    {
+      "epoch": 108.05333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002978946145277195,
+      "loss": 0.4897,
+      "step": 40520
+    },
+    {
+      "epoch": 108.08,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029789356467320503,
+      "loss": 0.4772,
+      "step": 40530
+    },
+    {
+      "epoch": 108.10666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002978925145588507,
+      "loss": 0.4727,
+      "step": 40540
+    },
+    {
+      "epoch": 108.13333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029789146418465827,
+      "loss": 0.4737,
+      "step": 40550
+    },
+    {
+      "epoch": 108.16,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029789041355062976,
+      "loss": 0.478,
+      "step": 40560
+    },
+    {
+      "epoch": 108.18666666666667,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00029788936265676694,
+      "loss": 0.4702,
+      "step": 40570
+    },
+    {
+      "epoch": 108.21333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029788831150307166,
+      "loss": 0.4647,
+      "step": 40580
+    },
+    {
+      "epoch": 108.24,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.00029788726008954576,
+      "loss": 0.469,
+      "step": 40590
+    },
+    {
+      "epoch": 108.26666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029788620841619107,
+      "loss": 0.4732,
+      "step": 40600
+    },
+    {
+      "epoch": 108.29333333333334,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002978851564830095,
+      "loss": 0.4676,
+      "step": 40610
+    },
+    {
+      "epoch": 108.32,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029788410429000284,
+      "loss": 0.4709,
+      "step": 40620
+    },
+    {
+      "epoch": 108.34666666666666,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000297883051837173,
+      "loss": 0.4781,
+      "step": 40630
+    },
+    {
+      "epoch": 108.37333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002978819991245217,
+      "loss": 0.4749,
+      "step": 40640
+    },
+    {
+      "epoch": 108.4,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029788094615205095,
+      "loss": 0.4721,
+      "step": 40650
+    },
+    {
+      "epoch": 108.42666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002978798929197625,
+      "loss": 0.4772,
+      "step": 40660
+    },
+    {
+      "epoch": 108.45333333333333,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0002978788394276582,
+      "loss": 0.4805,
+      "step": 40670
+    },
+    {
+      "epoch": 108.48,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029787778567573996,
+      "loss": 0.4864,
+      "step": 40680
+    },
+    {
+      "epoch": 108.50666666666666,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002978767316640096,
+      "loss": 0.4762,
+      "step": 40690
+    },
+    {
+      "epoch": 108.53333333333333,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029787567739246896,
+      "loss": 0.47,
+      "step": 40700
+    },
+    {
+      "epoch": 108.56,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002978746228611199,
+      "loss": 0.4702,
+      "step": 40710
+    },
+    {
+      "epoch": 108.58666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029787356806996426,
+      "loss": 0.4666,
+      "step": 40720
+    },
+    {
+      "epoch": 108.61333333333333,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0002978725130190039,
+      "loss": 0.4525,
+      "step": 40730
+    },
+    {
+      "epoch": 108.64,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029787145770824073,
+      "loss": 0.4577,
+      "step": 40740
+    },
+    {
+      "epoch": 108.66666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002978704021376765,
+      "loss": 0.4728,
+      "step": 40750
+    },
+    {
+      "epoch": 108.69333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029786934630731315,
+      "loss": 0.4557,
+      "step": 40760
+    },
+    {
+      "epoch": 108.72,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002978682902171525,
+      "loss": 0.4726,
+      "step": 40770
+    },
+    {
+      "epoch": 108.74666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029786723386719644,
+      "loss": 0.4775,
+      "step": 40780
+    },
+    {
+      "epoch": 108.77333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029786617725744677,
+      "loss": 0.4851,
+      "step": 40790
+    },
+    {
+      "epoch": 108.8,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002978651203879054,
+      "loss": 0.4717,
+      "step": 40800
+    },
+    {
+      "epoch": 108.82666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002978640632585741,
+      "loss": 0.4666,
+      "step": 40810
+    },
+    {
+      "epoch": 108.85333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029786300586945487,
+      "loss": 0.4755,
+      "step": 40820
+    },
+    {
+      "epoch": 108.88,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029786194822054946,
+      "loss": 0.4639,
+      "step": 40830
+    },
+    {
+      "epoch": 108.90666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002978608903118597,
+      "loss": 0.4662,
+      "step": 40840
+    },
+    {
+      "epoch": 108.93333333333334,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029785983214338754,
+      "loss": 0.4676,
+      "step": 40850
+    },
+    {
+      "epoch": 108.96,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002978587737151348,
+      "loss": 0.4663,
+      "step": 40860
+    },
+    {
+      "epoch": 108.98666666666666,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029785771502710334,
+      "loss": 0.4794,
+      "step": 40870
+    },
+    {
+      "epoch": 109.0,
+      "eval_loss": 0.47901442646980286,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8116,
+      "eval_samples_per_second": 1.631,
+      "eval_steps_per_second": 0.102,
+      "step": 40875
+    },
+    {
+      "epoch": 109.01333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.000297856656079295,
+      "loss": 0.4682,
+      "step": 40880
+    },
+    {
+      "epoch": 109.04,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002978555968717117,
+      "loss": 0.493,
+      "step": 40890
+    },
+    {
+      "epoch": 109.06666666666666,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029785453740435524,
+      "loss": 0.4806,
+      "step": 40900
+    },
+    {
+      "epoch": 109.09333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029785347767722747,
+      "loss": 0.4747,
+      "step": 40910
+    },
+    {
+      "epoch": 109.12,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029785241769033033,
+      "loss": 0.4716,
+      "step": 40920
+    },
+    {
+      "epoch": 109.14666666666666,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029785135744366567,
+      "loss": 0.48,
+      "step": 40930
+    },
+    {
+      "epoch": 109.17333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002978502969372353,
+      "loss": 0.4701,
+      "step": 40940
+    },
+    {
+      "epoch": 109.2,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029784923617104104,
+      "loss": 0.4691,
+      "step": 40950
+    },
+    {
+      "epoch": 109.22666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002978481751450849,
+      "loss": 0.4664,
+      "step": 40960
+    },
+    {
+      "epoch": 109.25333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002978471138593686,
+      "loss": 0.4665,
+      "step": 40970
+    },
+    {
+      "epoch": 109.28,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029784605231389404,
+      "loss": 0.4802,
+      "step": 40980
+    },
+    {
+      "epoch": 109.30666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029784499050866317,
+      "loss": 0.4616,
+      "step": 40990
+    },
+    {
+      "epoch": 109.33333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029784392844367784,
+      "loss": 0.4772,
+      "step": 41000
+    },
+    {
+      "epoch": 109.36,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029784286611893985,
+      "loss": 0.4785,
+      "step": 41010
+    },
+    {
+      "epoch": 109.38666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.000297841803534451,
+      "loss": 0.4702,
+      "step": 41020
+    },
+    {
+      "epoch": 109.41333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029784074069021333,
+      "loss": 0.4754,
+      "step": 41030
+    },
+    {
+      "epoch": 109.44,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029783967758622865,
+      "loss": 0.478,
+      "step": 41040
+    },
+    {
+      "epoch": 109.46666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029783861422249874,
+      "loss": 0.4874,
+      "step": 41050
+    },
+    {
+      "epoch": 109.49333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029783755059902553,
+      "loss": 0.4787,
+      "step": 41060
+    },
+    {
+      "epoch": 109.52,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002978364867158109,
+      "loss": 0.4719,
+      "step": 41070
+    },
+    {
+      "epoch": 109.54666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029783542257285674,
+      "loss": 0.4698,
+      "step": 41080
+    },
+    {
+      "epoch": 109.57333333333334,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029783435817016483,
+      "loss": 0.4685,
+      "step": 41090
+    },
+    {
+      "epoch": 109.6,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029783329350773716,
+      "loss": 0.4626,
+      "step": 41100
+    },
+    {
+      "epoch": 109.62666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002978322285855755,
+      "loss": 0.4493,
+      "step": 41110
+    },
+    {
+      "epoch": 109.65333333333334,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029783116340368173,
+      "loss": 0.4691,
+      "step": 41120
+    },
+    {
+      "epoch": 109.68,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002978300979620578,
+      "loss": 0.4601,
+      "step": 41130
+    },
+    {
+      "epoch": 109.70666666666666,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002978290322607055,
+      "loss": 0.4612,
+      "step": 41140
+    },
+    {
+      "epoch": 109.73333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029782796629962676,
+      "loss": 0.4806,
+      "step": 41150
+    },
+    {
+      "epoch": 109.76,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029782690007882344,
+      "loss": 0.4818,
+      "step": 41160
+    },
+    {
+      "epoch": 109.78666666666666,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029782583359829737,
+      "loss": 0.4797,
+      "step": 41170
+    },
+    {
+      "epoch": 109.81333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002978247668580505,
+      "loss": 0.4654,
+      "step": 41180
+    },
+    {
+      "epoch": 109.84,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002978236998580846,
+      "loss": 0.471,
+      "step": 41190
+    },
+    {
+      "epoch": 109.86666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002978226325984016,
+      "loss": 0.4706,
+      "step": 41200
+    },
+    {
+      "epoch": 109.89333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002978215650790034,
+      "loss": 0.4612,
+      "step": 41210
+    },
+    {
+      "epoch": 109.92,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029782049729989193,
+      "loss": 0.472,
+      "step": 41220
+    },
+    {
+      "epoch": 109.94666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002978194292610689,
+      "loss": 0.4642,
+      "step": 41230
+    },
+    {
+      "epoch": 109.97333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029781836096253627,
+      "loss": 0.4754,
+      "step": 41240
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029781729240429595,
+      "loss": 0.4642,
+      "step": 41250
+    },
+    {
+      "epoch": 110.0,
+      "eval_loss": 0.47956129908561707,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.0804,
+      "eval_samples_per_second": 1.444,
+      "eval_steps_per_second": 0.09,
+      "step": 41250
+    },
+    {
+      "epoch": 110.02666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002978162235863498,
+      "loss": 0.4841,
+      "step": 41260
+    },
+    {
+      "epoch": 110.05333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002978151545086997,
+      "loss": 0.4894,
+      "step": 41270
+    },
+    {
+      "epoch": 110.08,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002978140851713475,
+      "loss": 0.4773,
+      "step": 41280
+    },
+    {
+      "epoch": 110.10666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029781301557429507,
+      "loss": 0.4721,
+      "step": 41290
+    },
+    {
+      "epoch": 110.13333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029781194571754435,
+      "loss": 0.4734,
+      "step": 41300
+    },
+    {
+      "epoch": 110.16,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002978108756010972,
+      "loss": 0.4783,
+      "step": 41310
+    },
+    {
+      "epoch": 110.18666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029780980522495546,
+      "loss": 0.4705,
+      "step": 41320
+    },
+    {
+      "epoch": 110.21333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000297808734589121,
+      "loss": 0.4647,
+      "step": 41330
+    },
+    {
+      "epoch": 110.24,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002978076636935958,
+      "loss": 0.4684,
+      "step": 41340
+    },
+    {
+      "epoch": 110.26666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029780659253838166,
+      "loss": 0.473,
+      "step": 41350
+    },
+    {
+      "epoch": 110.29333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029780552112348047,
+      "loss": 0.4672,
+      "step": 41360
+    },
+    {
+      "epoch": 110.32,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029780444944889416,
+      "loss": 0.4712,
+      "step": 41370
+    },
+    {
+      "epoch": 110.34666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002978033775146245,
+      "loss": 0.4787,
+      "step": 41380
+    },
+    {
+      "epoch": 110.37333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002978023053206735,
+      "loss": 0.4751,
+      "step": 41390
+    },
+    {
+      "epoch": 110.4,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029780123286704297,
+      "loss": 0.4714,
+      "step": 41400
+    },
+    {
+      "epoch": 110.42666666666666,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002978001601537348,
+      "loss": 0.4778,
+      "step": 41410
+    },
+    {
+      "epoch": 110.45333333333333,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.00029779908718075097,
+      "loss": 0.4793,
+      "step": 41420
+    },
+    {
+      "epoch": 110.48,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002977980139480932,
+      "loss": 0.4864,
+      "step": 41430
+    },
+    {
+      "epoch": 110.50666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002977969404557635,
+      "loss": 0.4767,
+      "step": 41440
+    },
+    {
+      "epoch": 110.53333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002977958667037637,
+      "loss": 0.4697,
+      "step": 41450
+    },
+    {
+      "epoch": 110.56,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.00029779479269209576,
+      "loss": 0.4709,
+      "step": 41460
+    },
+    {
+      "epoch": 110.58666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029779371842076144,
+      "loss": 0.4664,
+      "step": 41470
+    },
+    {
+      "epoch": 110.61333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029779264388976275,
+      "loss": 0.452,
+      "step": 41480
+    },
+    {
+      "epoch": 110.64,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029779156909910155,
+      "loss": 0.4574,
+      "step": 41490
+    },
+    {
+      "epoch": 110.66666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002977904940487796,
+      "loss": 0.4732,
+      "step": 41500
+    },
+    {
+      "epoch": 110.69333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029778941873879897,
+      "loss": 0.4559,
+      "step": 41510
+    },
+    {
+      "epoch": 110.72,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002977883431691615,
+      "loss": 0.4738,
+      "step": 41520
+    },
+    {
+      "epoch": 110.74666666666667,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.000297787267339869,
+      "loss": 0.4776,
+      "step": 41530
+    },
+    {
+      "epoch": 110.77333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029778619125092343,
+      "loss": 0.4847,
+      "step": 41540
+    },
+    {
+      "epoch": 110.8,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029778511490232665,
+      "loss": 0.472,
+      "step": 41550
+    },
+    {
+      "epoch": 110.82666666666667,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.0002977840382940806,
+      "loss": 0.4653,
+      "step": 41560
+    },
+    {
+      "epoch": 110.85333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002977829614261871,
+      "loss": 0.4749,
+      "step": 41570
+    },
+    {
+      "epoch": 110.88,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002977818842986481,
+      "loss": 0.4636,
+      "step": 41580
+    },
+    {
+      "epoch": 110.90666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002977808069114654,
+      "loss": 0.466,
+      "step": 41590
+    },
+    {
+      "epoch": 110.93333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029777972926464105,
+      "loss": 0.4678,
+      "step": 41600
+    },
+    {
+      "epoch": 110.96,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002977786513581769,
+      "loss": 0.4655,
+      "step": 41610
+    },
+    {
+      "epoch": 110.98666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002977775731920747,
+      "loss": 0.4793,
+      "step": 41620
+    },
+    {
+      "epoch": 111.0,
+      "eval_loss": 0.47923967242240906,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9483,
+      "eval_samples_per_second": 1.608,
+      "eval_steps_per_second": 0.101,
+      "step": 41625
+    },
+    {
+      "epoch": 111.01333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029777649476633644,
+      "loss": 0.4685,
+      "step": 41630
+    },
+    {
+      "epoch": 111.04,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0002977754160809641,
+      "loss": 0.4923,
+      "step": 41640
+    },
+    {
+      "epoch": 111.06666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029777433713595945,
+      "loss": 0.4804,
+      "step": 41650
+    },
+    {
+      "epoch": 111.09333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002977732579313244,
+      "loss": 0.475,
+      "step": 41660
+    },
+    {
+      "epoch": 111.12,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002977721784670609,
+      "loss": 0.4712,
+      "step": 41670
+    },
+    {
+      "epoch": 111.14666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029777109874317087,
+      "loss": 0.4799,
+      "step": 41680
+    },
+    {
+      "epoch": 111.17333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002977700187596561,
+      "loss": 0.4701,
+      "step": 41690
+    },
+    {
+      "epoch": 111.2,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002977689385165186,
+      "loss": 0.4689,
+      "step": 41700
+    },
+    {
+      "epoch": 111.22666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002977678580137602,
+      "loss": 0.4666,
+      "step": 41710
+    },
+    {
+      "epoch": 111.25333333333333,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029776677725138276,
+      "loss": 0.4664,
+      "step": 41720
+    },
+    {
+      "epoch": 111.28,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002977656962293883,
+      "loss": 0.4792,
+      "step": 41730
+    },
+    {
+      "epoch": 111.30666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029776461494777863,
+      "loss": 0.4605,
+      "step": 41740
+    },
+    {
+      "epoch": 111.33333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002977635334065557,
+      "loss": 0.4765,
+      "step": 41750
+    },
+    {
+      "epoch": 111.36,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002977624516057213,
+      "loss": 0.4786,
+      "step": 41760
+    },
+    {
+      "epoch": 111.38666666666667,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002977613695452775,
+      "loss": 0.4706,
+      "step": 41770
+    },
+    {
+      "epoch": 111.41333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002977602872252261,
+      "loss": 0.4748,
+      "step": 41780
+    },
+    {
+      "epoch": 111.44,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029775920464556897,
+      "loss": 0.478,
+      "step": 41790
+    },
+    {
+      "epoch": 111.46666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029775812180630806,
+      "loss": 0.4873,
+      "step": 41800
+    },
+    {
+      "epoch": 111.49333333333334,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002977570387074453,
+      "loss": 0.4784,
+      "step": 41810
+    },
+    {
+      "epoch": 111.52,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029775595534898257,
+      "loss": 0.4735,
+      "step": 41820
+    },
+    {
+      "epoch": 111.54666666666667,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002977548717309218,
+      "loss": 0.4691,
+      "step": 41830
+    },
+    {
+      "epoch": 111.57333333333334,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029775378785326477,
+      "loss": 0.4689,
+      "step": 41840
+    },
+    {
+      "epoch": 111.6,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002977527037160135,
+      "loss": 0.4623,
+      "step": 41850
+    },
+    {
+      "epoch": 111.62666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002977516193191699,
+      "loss": 0.4491,
+      "step": 41860
+    },
+    {
+      "epoch": 111.65333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029775053466273585,
+      "loss": 0.4691,
+      "step": 41870
+    },
+    {
+      "epoch": 111.68,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002977494497467132,
+      "loss": 0.4605,
+      "step": 41880
+    },
+    {
+      "epoch": 111.70666666666666,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029774836457110396,
+      "loss": 0.4614,
+      "step": 41890
+    },
+    {
+      "epoch": 111.73333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029774727913591,
+      "loss": 0.4806,
+      "step": 41900
+    },
+    {
+      "epoch": 111.76,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029774619344113317,
+      "loss": 0.4818,
+      "step": 41910
+    },
+    {
+      "epoch": 111.78666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002977451074867754,
+      "loss": 0.4805,
+      "step": 41920
+    },
+    {
+      "epoch": 111.81333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002977440212728386,
+      "loss": 0.4646,
+      "step": 41930
+    },
+    {
+      "epoch": 111.84,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029774293479932474,
+      "loss": 0.4709,
+      "step": 41940
+    },
+    {
+      "epoch": 111.86666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002977418480662357,
+      "loss": 0.4701,
+      "step": 41950
+    },
+    {
+      "epoch": 111.89333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002977407610735733,
+      "loss": 0.4612,
+      "step": 41960
+    },
+    {
+      "epoch": 111.92,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002977396738213396,
+      "loss": 0.4714,
+      "step": 41970
+    },
+    {
+      "epoch": 111.94666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002977385863095364,
+      "loss": 0.4636,
+      "step": 41980
+    },
+    {
+      "epoch": 111.97333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002977374985381656,
+      "loss": 0.4752,
+      "step": 41990
+    },
+    {
+      "epoch": 112.0,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002977364105072292,
+      "loss": 0.465,
+      "step": 42000
+    },
+    {
+      "epoch": 112.0,
+      "eval_loss": 0.4801514148712158,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2003,
+      "eval_samples_per_second": 1.569,
+      "eval_steps_per_second": 0.098,
+      "step": 42000
+    },
+    {
+      "epoch": 112.02666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002977353222167291,
+      "loss": 0.4844,
+      "step": 42010
+    },
+    {
+      "epoch": 112.05333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029773423366666714,
+      "loss": 0.4898,
+      "step": 42020
+    },
+    {
+      "epoch": 112.08,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029773314485704525,
+      "loss": 0.4769,
+      "step": 42030
+    },
+    {
+      "epoch": 112.10666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002977320557878654,
+      "loss": 0.472,
+      "step": 42040
+    },
+    {
+      "epoch": 112.13333333333334,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002977309664591294,
+      "loss": 0.4743,
+      "step": 42050
+    },
+    {
+      "epoch": 112.16,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029772987687083933,
+      "loss": 0.4785,
+      "step": 42060
+    },
+    {
+      "epoch": 112.18666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029772878702299694,
+      "loss": 0.4702,
+      "step": 42070
+    },
+    {
+      "epoch": 112.21333333333334,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029772769691560424,
+      "loss": 0.465,
+      "step": 42080
+    },
+    {
+      "epoch": 112.24,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002977266065486631,
+      "loss": 0.4689,
+      "step": 42090
+    },
+    {
+      "epoch": 112.26666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029772551592217546,
+      "loss": 0.473,
+      "step": 42100
+    },
+    {
+      "epoch": 112.29333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002977244250361432,
+      "loss": 0.4669,
+      "step": 42110
+    },
+    {
+      "epoch": 112.32,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029772333389056825,
+      "loss": 0.4701,
+      "step": 42120
+    },
+    {
+      "epoch": 112.34666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029772224248545255,
+      "loss": 0.4787,
+      "step": 42130
+    },
+    {
+      "epoch": 112.37333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029772115082079805,
+      "loss": 0.4751,
+      "step": 42140
+    },
+    {
+      "epoch": 112.4,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.00029772005889660663,
+      "loss": 0.4709,
+      "step": 42150
+    },
+    {
+      "epoch": 112.42666666666666,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029771896671288014,
+      "loss": 0.4776,
+      "step": 42160
+    },
+    {
+      "epoch": 112.45333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029771787426962064,
+      "loss": 0.48,
+      "step": 42170
+    },
+    {
+      "epoch": 112.48,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002977167815668299,
+      "loss": 0.486,
+      "step": 42180
+    },
+    {
+      "epoch": 112.50666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029771568860450994,
+      "loss": 0.4755,
+      "step": 42190
+    },
+    {
+      "epoch": 112.53333333333333,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002977145953826627,
+      "loss": 0.4687,
+      "step": 42200
+    },
+    {
+      "epoch": 112.56,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029771350190128997,
+      "loss": 0.4704,
+      "step": 42210
+    },
+    {
+      "epoch": 112.58666666666667,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002977124081603938,
+      "loss": 0.4669,
+      "step": 42220
+    },
+    {
+      "epoch": 112.61333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029771131415997607,
+      "loss": 0.4526,
+      "step": 42230
+    },
+    {
+      "epoch": 112.64,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002977102199000387,
+      "loss": 0.4574,
+      "step": 42240
+    },
+    {
+      "epoch": 112.66666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002977091253805836,
+      "loss": 0.4725,
+      "step": 42250
+    },
+    {
+      "epoch": 112.69333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002977080306016127,
+      "loss": 0.4557,
+      "step": 42260
+    },
+    {
+      "epoch": 112.72,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002977069355631279,
+      "loss": 0.4731,
+      "step": 42270
+    },
+    {
+      "epoch": 112.74666666666667,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002977058402651312,
+      "loss": 0.4772,
+      "step": 42280
+    },
+    {
+      "epoch": 112.77333333333333,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0002977047447076245,
+      "loss": 0.4848,
+      "step": 42290
+    },
+    {
+      "epoch": 112.8,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002977036488906096,
+      "loss": 0.4706,
+      "step": 42300
+    },
+    {
+      "epoch": 112.82666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002977025528140886,
+      "loss": 0.4664,
+      "step": 42310
+    },
+    {
+      "epoch": 112.85333333333334,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029770145647806334,
+      "loss": 0.4748,
+      "step": 42320
+    },
+    {
+      "epoch": 112.88,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029770035988253574,
+      "loss": 0.4634,
+      "step": 42330
+    },
+    {
+      "epoch": 112.90666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029769926302750777,
+      "loss": 0.4654,
+      "step": 42340
+    },
+    {
+      "epoch": 112.93333333333334,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002976981659129813,
+      "loss": 0.4668,
+      "step": 42350
+    },
+    {
+      "epoch": 112.96,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029769706853895834,
+      "loss": 0.4655,
+      "step": 42360
+    },
+    {
+      "epoch": 112.98666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002976959709054407,
+      "loss": 0.4789,
+      "step": 42370
+    },
+    {
+      "epoch": 113.0,
+      "eval_loss": 0.4794241786003113,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5463,
+      "eval_samples_per_second": 1.517,
+      "eval_steps_per_second": 0.095,
+      "step": 42375
+    },
+    {
+      "epoch": 113.01333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029769487301243045,
+      "loss": 0.4678,
+      "step": 42380
+    },
+    {
+      "epoch": 113.04,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002976937748599294,
+      "loss": 0.4923,
+      "step": 42390
+    },
+    {
+      "epoch": 113.06666666666666,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0002976926764479395,
+      "loss": 0.4798,
+      "step": 42400
+    },
+    {
+      "epoch": 113.09333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002976915777764628,
+      "loss": 0.4748,
+      "step": 42410
+    },
+    {
+      "epoch": 113.12,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.000297690478845501,
+      "loss": 0.4715,
+      "step": 42420
+    },
+    {
+      "epoch": 113.14666666666666,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029768937965505627,
+      "loss": 0.48,
+      "step": 42430
+    },
+    {
+      "epoch": 113.17333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029768828020513037,
+      "loss": 0.4706,
+      "step": 42440
+    },
+    {
+      "epoch": 113.2,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029768718049572534,
+      "loss": 0.4686,
+      "step": 42450
+    },
+    {
+      "epoch": 113.22666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002976860805268431,
+      "loss": 0.4668,
+      "step": 42460
+    },
+    {
+      "epoch": 113.25333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029768498029848544,
+      "loss": 0.4664,
+      "step": 42470
+    },
+    {
+      "epoch": 113.28,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002976838798106545,
+      "loss": 0.4798,
+      "step": 42480
+    },
+    {
+      "epoch": 113.30666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002976827790633521,
+      "loss": 0.4604,
+      "step": 42490
+    },
+    {
+      "epoch": 113.33333333333333,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002976816780565802,
+      "loss": 0.477,
+      "step": 42500
+    },
+    {
+      "epoch": 113.36,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002976805767903407,
+      "loss": 0.4788,
+      "step": 42510
+    },
+    {
+      "epoch": 113.38666666666667,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029767947526463556,
+      "loss": 0.4705,
+      "step": 42520
+    },
+    {
+      "epoch": 113.41333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002976783734794667,
+      "loss": 0.4746,
+      "step": 42530
+    },
+    {
+      "epoch": 113.44,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029767727143483613,
+      "loss": 0.4773,
+      "step": 42540
+    },
+    {
+      "epoch": 113.46666666666667,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0002976761691307457,
+      "loss": 0.4871,
+      "step": 42550
+    },
+    {
+      "epoch": 113.49333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029767506656719735,
+      "loss": 0.4782,
+      "step": 42560
+    },
+    {
+      "epoch": 113.52,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002976739637441931,
+      "loss": 0.4726,
+      "step": 42570
+    },
+    {
+      "epoch": 113.54666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029767286066173477,
+      "loss": 0.4699,
+      "step": 42580
+    },
+    {
+      "epoch": 113.57333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002976717573198244,
+      "loss": 0.469,
+      "step": 42590
+    },
+    {
+      "epoch": 113.6,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029767065371846384,
+      "loss": 0.4624,
+      "step": 42600
+    },
+    {
+      "epoch": 113.62666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029766954985765515,
+      "loss": 0.449,
+      "step": 42610
+    },
+    {
+      "epoch": 113.65333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002976684457374001,
+      "loss": 0.4687,
+      "step": 42620
+    },
+    {
+      "epoch": 113.68,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029766734135770084,
+      "loss": 0.4605,
+      "step": 42630
+    },
+    {
+      "epoch": 113.70666666666666,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002976662367185592,
+      "loss": 0.4611,
+      "step": 42640
+    },
+    {
+      "epoch": 113.73333333333333,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000297665131819977,
+      "loss": 0.4801,
+      "step": 42650
+    },
+    {
+      "epoch": 113.76,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002976640266619564,
+      "loss": 0.4816,
+      "step": 42660
+    },
+    {
+      "epoch": 113.78666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002976629212444991,
+      "loss": 0.4797,
+      "step": 42670
+    },
+    {
+      "epoch": 113.81333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002976618155676073,
+      "loss": 0.4657,
+      "step": 42680
+    },
+    {
+      "epoch": 113.84,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002976607096312828,
+      "loss": 0.4704,
+      "step": 42690
+    },
+    {
+      "epoch": 113.86666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002976596034355276,
+      "loss": 0.4706,
+      "step": 42700
+    },
+    {
+      "epoch": 113.89333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029765849698034356,
+      "loss": 0.4607,
+      "step": 42710
+    },
+    {
+      "epoch": 113.92,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002976573902657327,
+      "loss": 0.4721,
+      "step": 42720
+    },
+    {
+      "epoch": 113.94666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029765628329169694,
+      "loss": 0.4637,
+      "step": 42730
+    },
+    {
+      "epoch": 113.97333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002976551760582382,
+      "loss": 0.4752,
+      "step": 42740
+    },
+    {
+      "epoch": 114.0,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002976540685653585,
+      "loss": 0.4651,
+      "step": 42750
+    },
+    {
+      "epoch": 114.0,
+      "eval_loss": 0.4796581268310547,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6668,
+      "eval_samples_per_second": 1.5,
+      "eval_steps_per_second": 0.094,
+      "step": 42750
+    },
+    {
+      "epoch": 114.02666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002976529608130597,
+      "loss": 0.4842,
+      "step": 42760
+    },
+    {
+      "epoch": 114.05333333333333,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029765185280134377,
+      "loss": 0.4899,
+      "step": 42770
+    },
+    {
+      "epoch": 114.08,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002976507445302127,
+      "loss": 0.4774,
+      "step": 42780
+    },
+    {
+      "epoch": 114.10666666666667,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029764963599966833,
+      "loss": 0.4719,
+      "step": 42790
+    },
+    {
+      "epoch": 114.13333333333334,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.00029764852720971276,
+      "loss": 0.4739,
+      "step": 42800
+    },
+    {
+      "epoch": 114.16,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002976474181603478,
+      "loss": 0.4781,
+      "step": 42810
+    },
+    {
+      "epoch": 114.18666666666667,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00029764630885157554,
+      "loss": 0.4698,
+      "step": 42820
+    },
+    {
+      "epoch": 114.21333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002976451992833978,
+      "loss": 0.4648,
+      "step": 42830
+    },
+    {
+      "epoch": 114.24,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002976440894558166,
+      "loss": 0.4698,
+      "step": 42840
+    },
+    {
+      "epoch": 114.26666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002976429793688339,
+      "loss": 0.4731,
+      "step": 42850
+    },
+    {
+      "epoch": 114.29333333333334,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002976418690224515,
+      "loss": 0.467,
+      "step": 42860
+    },
+    {
+      "epoch": 114.32,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002976407584166716,
+      "loss": 0.4703,
+      "step": 42870
+    },
+    {
+      "epoch": 114.34666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002976396475514959,
+      "loss": 0.4788,
+      "step": 42880
+    },
+    {
+      "epoch": 114.37333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029763853642692656,
+      "loss": 0.4743,
+      "step": 42890
+    },
+    {
+      "epoch": 114.4,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029763742504296546,
+      "loss": 0.4711,
+      "step": 42900
+    },
+    {
+      "epoch": 114.42666666666666,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.00029763631339961447,
+      "loss": 0.4771,
+      "step": 42910
+    },
+    {
+      "epoch": 114.45333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029763520149687564,
+      "loss": 0.4793,
+      "step": 42920
+    },
+    {
+      "epoch": 114.48,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0002976340893347509,
+      "loss": 0.4862,
+      "step": 42930
+    },
+    {
+      "epoch": 114.50666666666666,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002976329769132421,
+      "loss": 0.4753,
+      "step": 42940
+    },
+    {
+      "epoch": 114.53333333333333,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.00029763186423235145,
+      "loss": 0.4691,
+      "step": 42950
+    },
+    {
+      "epoch": 114.56,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029763075129208064,
+      "loss": 0.4707,
+      "step": 42960
+    },
+    {
+      "epoch": 114.58666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002976296380924318,
+      "loss": 0.4663,
+      "step": 42970
+    },
+    {
+      "epoch": 114.61333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002976285246334068,
+      "loss": 0.4525,
+      "step": 42980
+    },
+    {
+      "epoch": 114.64,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029762741091500755,
+      "loss": 0.4572,
+      "step": 42990
+    },
+    {
+      "epoch": 114.66666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029762629693723614,
+      "loss": 0.4724,
+      "step": 43000
+    },
+    {
+      "epoch": 114.69333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029762518270009446,
+      "loss": 0.4558,
+      "step": 43010
+    },
+    {
+      "epoch": 114.72,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029762406820358446,
+      "loss": 0.473,
+      "step": 43020
+    },
+    {
+      "epoch": 114.74666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002976229534477081,
+      "loss": 0.4767,
+      "step": 43030
+    },
+    {
+      "epoch": 114.77333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002976218384324673,
+      "loss": 0.4848,
+      "step": 43040
+    },
+    {
+      "epoch": 114.8,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002976207231578641,
+      "loss": 0.4718,
+      "step": 43050
+    },
+    {
+      "epoch": 114.82666666666667,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002976196076239004,
+      "loss": 0.4654,
+      "step": 43060
+    },
+    {
+      "epoch": 114.85333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002976184918305782,
+      "loss": 0.4749,
+      "step": 43070
+    },
+    {
+      "epoch": 114.88,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029761737577789943,
+      "loss": 0.463,
+      "step": 43080
+    },
+    {
+      "epoch": 114.90666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029761625946586606,
+      "loss": 0.4658,
+      "step": 43090
+    },
+    {
+      "epoch": 114.93333333333334,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00029761514289448004,
+      "loss": 0.4678,
+      "step": 43100
+    },
+    {
+      "epoch": 114.96,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002976140260637434,
+      "loss": 0.4658,
+      "step": 43110
+    },
+    {
+      "epoch": 114.98666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.000297612908973658,
+      "loss": 0.4793,
+      "step": 43120
+    },
+    {
+      "epoch": 115.0,
+      "eval_loss": 0.4808712601661682,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2067,
+      "eval_samples_per_second": 1.568,
+      "eval_steps_per_second": 0.098,
+      "step": 43125
+    },
+    {
+      "epoch": 115.01333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002976117916242259,
+      "loss": 0.4681,
+      "step": 43130
+    },
+    {
+      "epoch": 115.04,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029761067401544894,
+      "loss": 0.4929,
+      "step": 43140
+    },
+    {
+      "epoch": 115.06666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002976095561473292,
+      "loss": 0.48,
+      "step": 43150
+    },
+    {
+      "epoch": 115.09333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002976084380198686,
+      "loss": 0.4749,
+      "step": 43160
+    },
+    {
+      "epoch": 115.12,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002976073196330691,
+      "loss": 0.4716,
+      "step": 43170
+    },
+    {
+      "epoch": 115.14666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029760620098693273,
+      "loss": 0.48,
+      "step": 43180
+    },
+    {
+      "epoch": 115.17333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002976050820814613,
+      "loss": 0.4709,
+      "step": 43190
+    },
+    {
+      "epoch": 115.2,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002976039629166569,
+      "loss": 0.469,
+      "step": 43200
+    },
+    {
+      "epoch": 115.22666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002976028434925215,
+      "loss": 0.4668,
+      "step": 43210
+    },
+    {
+      "epoch": 115.25333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.000297601723809057,
+      "loss": 0.4669,
+      "step": 43220
+    },
+    {
+      "epoch": 115.28,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029760060386626545,
+      "loss": 0.4802,
+      "step": 43230
+    },
+    {
+      "epoch": 115.30666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029759948366414867,
+      "loss": 0.461,
+      "step": 43240
+    },
+    {
+      "epoch": 115.33333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002975983632027088,
+      "loss": 0.4762,
+      "step": 43250
+    },
+    {
+      "epoch": 115.36,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029759724248194775,
+      "loss": 0.4794,
+      "step": 43260
+    },
+    {
+      "epoch": 115.38666666666667,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029759612150186745,
+      "loss": 0.4701,
+      "step": 43270
+    },
+    {
+      "epoch": 115.41333333333333,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002975950002624699,
+      "loss": 0.4749,
+      "step": 43280
+    },
+    {
+      "epoch": 115.44,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029759387876375704,
+      "loss": 0.478,
+      "step": 43290
+    },
+    {
+      "epoch": 115.46666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002975927570057309,
+      "loss": 0.488,
+      "step": 43300
+    },
+    {
+      "epoch": 115.49333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002975916349883934,
+      "loss": 0.4784,
+      "step": 43310
+    },
+    {
+      "epoch": 115.52,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002975905127117465,
+      "loss": 0.4729,
+      "step": 43320
+    },
+    {
+      "epoch": 115.54666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029758939017579224,
+      "loss": 0.4695,
+      "step": 43330
+    },
+    {
+      "epoch": 115.57333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029758826738053253,
+      "loss": 0.4689,
+      "step": 43340
+    },
+    {
+      "epoch": 115.6,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002975871443259694,
+      "loss": 0.4629,
+      "step": 43350
+    },
+    {
+      "epoch": 115.62666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029758602101210473,
+      "loss": 0.4496,
+      "step": 43360
+    },
+    {
+      "epoch": 115.65333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002975848974389406,
+      "loss": 0.4695,
+      "step": 43370
+    },
+    {
+      "epoch": 115.68,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002975837736064789,
+      "loss": 0.4606,
+      "step": 43380
+    },
+    {
+      "epoch": 115.70666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029758264951472164,
+      "loss": 0.4603,
+      "step": 43390
+    },
+    {
+      "epoch": 115.73333333333333,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.00029758152516367085,
+      "loss": 0.4803,
+      "step": 43400
+    },
+    {
+      "epoch": 115.76,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002975804005533284,
+      "loss": 0.4816,
+      "step": 43410
+    },
+    {
+      "epoch": 115.78666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002975792756836963,
+      "loss": 0.48,
+      "step": 43420
+    },
+    {
+      "epoch": 115.81333333333333,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0002975781505547766,
+      "loss": 0.4653,
+      "step": 43430
+    },
+    {
+      "epoch": 115.84,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002975770251665712,
+      "loss": 0.4708,
+      "step": 43440
+    },
+    {
+      "epoch": 115.86666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000297575899519082,
+      "loss": 0.4703,
+      "step": 43450
+    },
+    {
+      "epoch": 115.89333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002975747736123112,
+      "loss": 0.4607,
+      "step": 43460
+    },
+    {
+      "epoch": 115.92,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002975736474462606,
+      "loss": 0.4714,
+      "step": 43470
+    },
+    {
+      "epoch": 115.94666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029757252102093225,
+      "loss": 0.4646,
+      "step": 43480
+    },
+    {
+      "epoch": 115.97333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002975713943363281,
+      "loss": 0.4753,
+      "step": 43490
+    },
+    {
+      "epoch": 116.0,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029757026739245007,
+      "loss": 0.4636,
+      "step": 43500
+    },
+    {
+      "epoch": 116.0,
+      "eval_loss": 0.4808787405490875,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7649,
+      "eval_samples_per_second": 1.639,
+      "eval_steps_per_second": 0.102,
+      "step": 43500
+    },
+    {
+      "epoch": 116.02666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002975691401893003,
+      "loss": 0.4842,
+      "step": 43510
+    },
+    {
+      "epoch": 116.05333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029756801272688065,
+      "loss": 0.4897,
+      "step": 43520
+    },
+    {
+      "epoch": 116.08,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029756688500519313,
+      "loss": 0.4768,
+      "step": 43530
+    },
+    {
+      "epoch": 116.10666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029756575702423975,
+      "loss": 0.4726,
+      "step": 43540
+    },
+    {
+      "epoch": 116.13333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002975646287840224,
+      "loss": 0.4738,
+      "step": 43550
+    },
+    {
+      "epoch": 116.16,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002975635002845432,
+      "loss": 0.478,
+      "step": 43560
+    },
+    {
+      "epoch": 116.18666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029756237152580396,
+      "loss": 0.47,
+      "step": 43570
+    },
+    {
+      "epoch": 116.21333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029756124250780686,
+      "loss": 0.4647,
+      "step": 43580
+    },
+    {
+      "epoch": 116.24,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002975601132305537,
+      "loss": 0.468,
+      "step": 43590
+    },
+    {
+      "epoch": 116.26666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029755898369404664,
+      "loss": 0.4731,
+      "step": 43600
+    },
+    {
+      "epoch": 116.29333333333334,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002975578538982875,
+      "loss": 0.4676,
+      "step": 43610
+    },
+    {
+      "epoch": 116.32,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002975567238432784,
+      "loss": 0.4704,
+      "step": 43620
+    },
+    {
+      "epoch": 116.34666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002975555935290212,
+      "loss": 0.4787,
+      "step": 43630
+    },
+    {
+      "epoch": 116.37333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000297554462955518,
+      "loss": 0.4744,
+      "step": 43640
+    },
+    {
+      "epoch": 116.4,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029755333212277073,
+      "loss": 0.4714,
+      "step": 43650
+    },
+    {
+      "epoch": 116.42666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029755220103078135,
+      "loss": 0.4774,
+      "step": 43660
+    },
+    {
+      "epoch": 116.45333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002975510696795519,
+      "loss": 0.48,
+      "step": 43670
+    },
+    {
+      "epoch": 116.48,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029754993806908436,
+      "loss": 0.4857,
+      "step": 43680
+    },
+    {
+      "epoch": 116.50666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002975488061993807,
+      "loss": 0.4761,
+      "step": 43690
+    },
+    {
+      "epoch": 116.53333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002975476740704429,
+      "loss": 0.4692,
+      "step": 43700
+    },
+    {
+      "epoch": 116.56,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029754654168227297,
+      "loss": 0.4703,
+      "step": 43710
+    },
+    {
+      "epoch": 116.58666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002975454090348729,
+      "loss": 0.4665,
+      "step": 43720
+    },
+    {
+      "epoch": 116.61333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029754427612824463,
+      "loss": 0.4525,
+      "step": 43730
+    },
+    {
+      "epoch": 116.64,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029754314296239024,
+      "loss": 0.4574,
+      "step": 43740
+    },
+    {
+      "epoch": 116.66666666666667,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002975420095373117,
+      "loss": 0.4729,
+      "step": 43750
+    },
+    {
+      "epoch": 116.69333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002975408758530109,
+      "loss": 0.4562,
+      "step": 43760
+    },
+    {
+      "epoch": 116.72,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029753974190948994,
+      "loss": 0.4727,
+      "step": 43770
+    },
+    {
+      "epoch": 116.74666666666667,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.00029753860770675083,
+      "loss": 0.4767,
+      "step": 43780
+    },
+    {
+      "epoch": 116.77333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029753747324479546,
+      "loss": 0.485,
+      "step": 43790
+    },
+    {
+      "epoch": 116.8,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002975363385236259,
+      "loss": 0.4714,
+      "step": 43800
+    },
+    {
+      "epoch": 116.82666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002975352035432441,
+      "loss": 0.4661,
+      "step": 43810
+    },
+    {
+      "epoch": 116.85333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002975340683036521,
+      "loss": 0.4746,
+      "step": 43820
+    },
+    {
+      "epoch": 116.88,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029753293280485184,
+      "loss": 0.4624,
+      "step": 43830
+    },
+    {
+      "epoch": 116.90666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002975317970468453,
+      "loss": 0.4657,
+      "step": 43840
+    },
+    {
+      "epoch": 116.93333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029753066102963463,
+      "loss": 0.4672,
+      "step": 43850
+    },
+    {
+      "epoch": 116.96,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029752952475322166,
+      "loss": 0.466,
+      "step": 43860
+    },
+    {
+      "epoch": 116.98666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002975283882176084,
+      "loss": 0.4789,
+      "step": 43870
+    },
+    {
+      "epoch": 117.0,
+      "eval_loss": 0.4790605902671814,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8144,
+      "eval_samples_per_second": 1.63,
+      "eval_steps_per_second": 0.102,
+      "step": 43875
+    },
+    {
+      "epoch": 117.01333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029752725142279693,
+      "loss": 0.4681,
+      "step": 43880
+    },
+    {
+      "epoch": 117.04,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029752611436878923,
+      "loss": 0.4925,
+      "step": 43890
+    },
+    {
+      "epoch": 117.06666666666666,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.0002975249770555872,
+      "loss": 0.4798,
+      "step": 43900
+    },
+    {
+      "epoch": 117.09333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000297523839483193,
+      "loss": 0.4755,
+      "step": 43910
+    },
+    {
+      "epoch": 117.12,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002975227016516085,
+      "loss": 0.471,
+      "step": 43920
+    },
+    {
+      "epoch": 117.14666666666666,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002975215635608357,
+      "loss": 0.4792,
+      "step": 43930
+    },
+    {
+      "epoch": 117.17333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029752042521087667,
+      "loss": 0.4708,
+      "step": 43940
+    },
+    {
+      "epoch": 117.2,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002975192866017334,
+      "loss": 0.4687,
+      "step": 43950
+    },
+    {
+      "epoch": 117.22666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002975181477334079,
+      "loss": 0.4661,
+      "step": 43960
+    },
+    {
+      "epoch": 117.25333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029751700860590203,
+      "loss": 0.4662,
+      "step": 43970
+    },
+    {
+      "epoch": 117.28,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029751586921921797,
+      "loss": 0.4794,
+      "step": 43980
+    },
+    {
+      "epoch": 117.30666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029751472957335764,
+      "loss": 0.4615,
+      "step": 43990
+    },
+    {
+      "epoch": 117.33333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002975135896683231,
+      "loss": 0.477,
+      "step": 44000
+    },
+    {
+      "epoch": 117.36,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029751244950411627,
+      "loss": 0.4783,
+      "step": 44010
+    },
+    {
+      "epoch": 117.38666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029751130908073917,
+      "loss": 0.4705,
+      "step": 44020
+    },
+    {
+      "epoch": 117.41333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029751016839819384,
+      "loss": 0.4746,
+      "step": 44030
+    },
+    {
+      "epoch": 117.44,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029750902745648224,
+      "loss": 0.4778,
+      "step": 44040
+    },
+    {
+      "epoch": 117.46666666666667,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029750788625560647,
+      "loss": 0.4873,
+      "step": 44050
+    },
+    {
+      "epoch": 117.49333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029750674479556843,
+      "loss": 0.4787,
+      "step": 44060
+    },
+    {
+      "epoch": 117.52,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029750560307637014,
+      "loss": 0.4729,
+      "step": 44070
+    },
+    {
+      "epoch": 117.54666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029750446109801364,
+      "loss": 0.4697,
+      "step": 44080
+    },
+    {
+      "epoch": 117.57333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029750331886050095,
+      "loss": 0.4696,
+      "step": 44090
+    },
+    {
+      "epoch": 117.6,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029750217636383407,
+      "loss": 0.4623,
+      "step": 44100
+    },
+    {
+      "epoch": 117.62666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029750103360801495,
+      "loss": 0.4496,
+      "step": 44110
+    },
+    {
+      "epoch": 117.65333333333334,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002974998905930457,
+      "loss": 0.4692,
+      "step": 44120
+    },
+    {
+      "epoch": 117.68,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002974987473189282,
+      "loss": 0.4602,
+      "step": 44130
+    },
+    {
+      "epoch": 117.70666666666666,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.00029749760378566456,
+      "loss": 0.4603,
+      "step": 44140
+    },
+    {
+      "epoch": 117.73333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002974964599932567,
+      "loss": 0.4803,
+      "step": 44150
+    },
+    {
+      "epoch": 117.76,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.00029749531594170677,
+      "loss": 0.4807,
+      "step": 44160
+    },
+    {
+      "epoch": 117.78666666666666,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002974941716310166,
+      "loss": 0.4799,
+      "step": 44170
+    },
+    {
+      "epoch": 117.81333333333333,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002974930270611884,
+      "loss": 0.4655,
+      "step": 44180
+    },
+    {
+      "epoch": 117.84,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000297491882232224,
+      "loss": 0.4709,
+      "step": 44190
+    },
+    {
+      "epoch": 117.86666666666666,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002974907371441255,
+      "loss": 0.47,
+      "step": 44200
+    },
+    {
+      "epoch": 117.89333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002974895917968949,
+      "loss": 0.4611,
+      "step": 44210
+    },
+    {
+      "epoch": 117.92,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0002974884461905342,
+      "loss": 0.4718,
+      "step": 44220
+    },
+    {
+      "epoch": 117.94666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029748730032504547,
+      "loss": 0.4637,
+      "step": 44230
+    },
+    {
+      "epoch": 117.97333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029748615420043065,
+      "loss": 0.4752,
+      "step": 44240
+    },
+    {
+      "epoch": 118.0,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029748500781669174,
+      "loss": 0.4638,
+      "step": 44250
+    },
+    {
+      "epoch": 118.0,
+      "eval_loss": 0.4791935682296753,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4583,
+      "eval_samples_per_second": 1.53,
+      "eval_steps_per_second": 0.096,
+      "step": 44250
+    },
+    {
+      "epoch": 118.02666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029748386117383084,
+      "loss": 0.4837,
+      "step": 44260
+    },
+    {
+      "epoch": 118.05333333333333,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029748271427184995,
+      "loss": 0.4899,
+      "step": 44270
+    },
+    {
+      "epoch": 118.08,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029748156711075093,
+      "loss": 0.4766,
+      "step": 44280
+    },
+    {
+      "epoch": 118.10666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029748041969053605,
+      "loss": 0.472,
+      "step": 44290
+    },
+    {
+      "epoch": 118.13333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029747927201120714,
+      "loss": 0.4732,
+      "step": 44300
+    },
+    {
+      "epoch": 118.16,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002974781240727663,
+      "loss": 0.4783,
+      "step": 44310
+    },
+    {
+      "epoch": 118.18666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029747697587521546,
+      "loss": 0.4708,
+      "step": 44320
+    },
+    {
+      "epoch": 118.21333333333334,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029747582741855675,
+      "loss": 0.4646,
+      "step": 44330
+    },
+    {
+      "epoch": 118.24,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002974746787027921,
+      "loss": 0.4689,
+      "step": 44340
+    },
+    {
+      "epoch": 118.26666666666667,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002974735297279236,
+      "loss": 0.4728,
+      "step": 44350
+    },
+    {
+      "epoch": 118.29333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002974723804939532,
+      "loss": 0.4664,
+      "step": 44360
+    },
+    {
+      "epoch": 118.32,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029747123100088297,
+      "loss": 0.4707,
+      "step": 44370
+    },
+    {
+      "epoch": 118.34666666666666,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029747008124871493,
+      "loss": 0.4788,
+      "step": 44380
+    },
+    {
+      "epoch": 118.37333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029746893123745104,
+      "loss": 0.474,
+      "step": 44390
+    },
+    {
+      "epoch": 118.4,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029746778096709334,
+      "loss": 0.4713,
+      "step": 44400
+    },
+    {
+      "epoch": 118.42666666666666,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002974666304376439,
+      "loss": 0.4774,
+      "step": 44410
+    },
+    {
+      "epoch": 118.45333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029746547964910473,
+      "loss": 0.4794,
+      "step": 44420
+    },
+    {
+      "epoch": 118.48,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029746432860147784,
+      "loss": 0.4853,
+      "step": 44430
+    },
+    {
+      "epoch": 118.50666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002974631772947652,
+      "loss": 0.4755,
+      "step": 44440
+    },
+    {
+      "epoch": 118.53333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029746202572896894,
+      "loss": 0.4693,
+      "step": 44450
+    },
+    {
+      "epoch": 118.56,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029746087390409095,
+      "loss": 0.4702,
+      "step": 44460
+    },
+    {
+      "epoch": 118.58666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029745972182013337,
+      "loss": 0.4665,
+      "step": 44470
+    },
+    {
+      "epoch": 118.61333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002974585694770982,
+      "loss": 0.4527,
+      "step": 44480
+    },
+    {
+      "epoch": 118.64,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002974574168749874,
+      "loss": 0.4576,
+      "step": 44490
+    },
+    {
+      "epoch": 118.66666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029745626401380304,
+      "loss": 0.4724,
+      "step": 44500
+    },
+    {
+      "epoch": 118.69333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002974551108935472,
+      "loss": 0.455,
+      "step": 44510
+    },
+    {
+      "epoch": 118.72,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029745395751422176,
+      "loss": 0.4728,
+      "step": 44520
+    },
+    {
+      "epoch": 118.74666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029745280387582894,
+      "loss": 0.4765,
+      "step": 44530
+    },
+    {
+      "epoch": 118.77333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002974516499783706,
+      "loss": 0.4842,
+      "step": 44540
+    },
+    {
+      "epoch": 118.8,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029745049582184885,
+      "loss": 0.4714,
+      "step": 44550
+    },
+    {
+      "epoch": 118.82666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002974493414062657,
+      "loss": 0.466,
+      "step": 44560
+    },
+    {
+      "epoch": 118.85333333333334,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002974481867316232,
+      "loss": 0.4749,
+      "step": 44570
+    },
+    {
+      "epoch": 118.88,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029744703179792334,
+      "loss": 0.4631,
+      "step": 44580
+    },
+    {
+      "epoch": 118.90666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029744587660516814,
+      "loss": 0.4656,
+      "step": 44590
+    },
+    {
+      "epoch": 118.93333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002974447211533597,
+      "loss": 0.467,
+      "step": 44600
+    },
+    {
+      "epoch": 118.96,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029744356544249994,
+      "loss": 0.4655,
+      "step": 44610
+    },
+    {
+      "epoch": 118.98666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029744240947259106,
+      "loss": 0.4791,
+      "step": 44620
+    },
+    {
+      "epoch": 119.0,
+      "eval_loss": 0.47937506437301636,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9759,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 0.1,
+      "step": 44625
+    },
+    {
+      "epoch": 119.01333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002974412532436349,
+      "loss": 0.4683,
+      "step": 44630
+    },
+    {
+      "epoch": 119.04,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002974400967556336,
+      "loss": 0.4924,
+      "step": 44640
+    },
+    {
+      "epoch": 119.06666666666666,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002974389400085892,
+      "loss": 0.4798,
+      "step": 44650
+    },
+    {
+      "epoch": 119.09333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029743778300250366,
+      "loss": 0.4742,
+      "step": 44660
+    },
+    {
+      "epoch": 119.12,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029743662573737906,
+      "loss": 0.4712,
+      "step": 44670
+    },
+    {
+      "epoch": 119.14666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029743546821321747,
+      "loss": 0.4796,
+      "step": 44680
+    },
+    {
+      "epoch": 119.17333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029743431043002084,
+      "loss": 0.4706,
+      "step": 44690
+    },
+    {
+      "epoch": 119.2,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029743315238779124,
+      "loss": 0.4691,
+      "step": 44700
+    },
+    {
+      "epoch": 119.22666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002974319940865308,
+      "loss": 0.4663,
+      "step": 44710
+    },
+    {
+      "epoch": 119.25333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002974308355262414,
+      "loss": 0.4663,
+      "step": 44720
+    },
+    {
+      "epoch": 119.28,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029742967670692513,
+      "loss": 0.4796,
+      "step": 44730
+    },
+    {
+      "epoch": 119.30666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029742851762858406,
+      "loss": 0.4615,
+      "step": 44740
+    },
+    {
+      "epoch": 119.33333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002974273582912202,
+      "loss": 0.4771,
+      "step": 44750
+    },
+    {
+      "epoch": 119.36,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029742619869483554,
+      "loss": 0.478,
+      "step": 44760
+    },
+    {
+      "epoch": 119.38666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029742503883943225,
+      "loss": 0.4705,
+      "step": 44770
+    },
+    {
+      "epoch": 119.41333333333333,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029742387872501224,
+      "loss": 0.475,
+      "step": 44780
+    },
+    {
+      "epoch": 119.44,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029742271835157764,
+      "loss": 0.4775,
+      "step": 44790
+    },
+    {
+      "epoch": 119.46666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002974215577191304,
+      "loss": 0.4862,
+      "step": 44800
+    },
+    {
+      "epoch": 119.49333333333334,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029742039682767256,
+      "loss": 0.4782,
+      "step": 44810
+    },
+    {
+      "epoch": 119.52,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00029741923567720626,
+      "loss": 0.4724,
+      "step": 44820
+    },
+    {
+      "epoch": 119.54666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002974180742677335,
+      "loss": 0.4687,
+      "step": 44830
+    },
+    {
+      "epoch": 119.57333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029741691259925626,
+      "loss": 0.4681,
+      "step": 44840
+    },
+    {
+      "epoch": 119.6,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002974157506717767,
+      "loss": 0.4623,
+      "step": 44850
+    },
+    {
+      "epoch": 119.62666666666667,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002974145884852967,
+      "loss": 0.4492,
+      "step": 44860
+    },
+    {
+      "epoch": 119.65333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002974134260398184,
+      "loss": 0.4691,
+      "step": 44870
+    },
+    {
+      "epoch": 119.68,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002974122633353438,
+      "loss": 0.4607,
+      "step": 44880
+    },
+    {
+      "epoch": 119.70666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029741110037187507,
+      "loss": 0.4608,
+      "step": 44890
+    },
+    {
+      "epoch": 119.73333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002974099371494141,
+      "loss": 0.4801,
+      "step": 44900
+    },
+    {
+      "epoch": 119.76,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029740877366796296,
+      "loss": 0.4814,
+      "step": 44910
+    },
+    {
+      "epoch": 119.78666666666666,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002974076099275238,
+      "loss": 0.4795,
+      "step": 44920
+    },
+    {
+      "epoch": 119.81333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002974064459280985,
+      "loss": 0.4651,
+      "step": 44930
+    },
+    {
+      "epoch": 119.84,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029740528166968924,
+      "loss": 0.4708,
+      "step": 44940
+    },
+    {
+      "epoch": 119.86666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029740411715229804,
+      "loss": 0.4697,
+      "step": 44950
+    },
+    {
+      "epoch": 119.89333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029740295237592687,
+      "loss": 0.4613,
+      "step": 44960
+    },
+    {
+      "epoch": 119.92,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029740178734057786,
+      "loss": 0.4714,
+      "step": 44970
+    },
+    {
+      "epoch": 119.94666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000297400622046253,
+      "loss": 0.4638,
+      "step": 44980
+    },
+    {
+      "epoch": 119.97333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029739945649295444,
+      "loss": 0.4755,
+      "step": 44990
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002973982906806841,
+      "loss": 0.4641,
+      "step": 45000
+    },
+    {
+      "epoch": 120.0,
+      "eval_loss": 0.4796341061592102,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.2339,
+      "eval_samples_per_second": 1.424,
+      "eval_steps_per_second": 0.089,
+      "step": 45000
+    },
+    {
+      "epoch": 120.02666666666667,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.00029739712460944405,
+      "loss": 0.4851,
+      "step": 45010
+    },
+    {
+      "epoch": 120.05333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029739595827923645,
+      "loss": 0.4904,
+      "step": 45020
+    },
+    {
+      "epoch": 120.08,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002973947916900632,
+      "loss": 0.4766,
+      "step": 45030
+    },
+    {
+      "epoch": 120.10666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002973936248419264,
+      "loss": 0.4719,
+      "step": 45040
+    },
+    {
+      "epoch": 120.13333333333334,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002973924577348282,
+      "loss": 0.4733,
+      "step": 45050
+    },
+    {
+      "epoch": 120.16,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002973912903687705,
+      "loss": 0.4774,
+      "step": 45060
+    },
+    {
+      "epoch": 120.18666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002973901227437555,
+      "loss": 0.4702,
+      "step": 45070
+    },
+    {
+      "epoch": 120.21333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002973889548597851,
+      "loss": 0.4644,
+      "step": 45080
+    },
+    {
+      "epoch": 120.24,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029738778671686145,
+      "loss": 0.4692,
+      "step": 45090
+    },
+    {
+      "epoch": 120.26666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002973866183149866,
+      "loss": 0.4728,
+      "step": 45100
+    },
+    {
+      "epoch": 120.29333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029738544965416254,
+      "loss": 0.4675,
+      "step": 45110
+    },
+    {
+      "epoch": 120.32,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029738428073439134,
+      "loss": 0.4706,
+      "step": 45120
+    },
+    {
+      "epoch": 120.34666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002973831115556751,
+      "loss": 0.4782,
+      "step": 45130
+    },
+    {
+      "epoch": 120.37333333333333,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029738194211801586,
+      "loss": 0.4746,
+      "step": 45140
+    },
+    {
+      "epoch": 120.4,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002973807724214157,
+      "loss": 0.4714,
+      "step": 45150
+    },
+    {
+      "epoch": 120.42666666666666,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.00029737960246587657,
+      "loss": 0.4782,
+      "step": 45160
+    },
+    {
+      "epoch": 120.45333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029737843225140063,
+      "loss": 0.4793,
+      "step": 45170
+    },
+    {
+      "epoch": 120.48,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029737726177798995,
+      "loss": 0.4853,
+      "step": 45180
+    },
+    {
+      "epoch": 120.50666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029737609104564647,
+      "loss": 0.4748,
+      "step": 45190
+    },
+    {
+      "epoch": 120.53333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002973749200543723,
+      "loss": 0.4686,
+      "step": 45200
+    },
+    {
+      "epoch": 120.56,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002973737488041696,
+      "loss": 0.4704,
+      "step": 45210
+    },
+    {
+      "epoch": 120.58666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029737257729504023,
+      "loss": 0.466,
+      "step": 45220
+    },
+    {
+      "epoch": 120.61333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002973714055269864,
+      "loss": 0.4523,
+      "step": 45230
+    },
+    {
+      "epoch": 120.64,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029737023350001016,
+      "loss": 0.4571,
+      "step": 45240
+    },
+    {
+      "epoch": 120.66666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002973690612141135,
+      "loss": 0.4724,
+      "step": 45250
+    },
+    {
+      "epoch": 120.69333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002973678886692985,
+      "loss": 0.4564,
+      "step": 45260
+    },
+    {
+      "epoch": 120.72,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002973667158655673,
+      "loss": 0.4723,
+      "step": 45270
+    },
+    {
+      "epoch": 120.74666666666667,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002973655428029218,
+      "loss": 0.4769,
+      "step": 45280
+    },
+    {
+      "epoch": 120.77333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002973643694813643,
+      "loss": 0.4835,
+      "step": 45290
+    },
+    {
+      "epoch": 120.8,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002973631959008966,
+      "loss": 0.4716,
+      "step": 45300
+    },
+    {
+      "epoch": 120.82666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.000297362022061521,
+      "loss": 0.4661,
+      "step": 45310
+    },
+    {
+      "epoch": 120.85333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029736084796323927,
+      "loss": 0.4749,
+      "step": 45320
+    },
+    {
+      "epoch": 120.88,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029735967360605377,
+      "loss": 0.4629,
+      "step": 45330
+    },
+    {
+      "epoch": 120.90666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002973584989899664,
+      "loss": 0.4658,
+      "step": 45340
+    },
+    {
+      "epoch": 120.93333333333334,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029735732411497926,
+      "loss": 0.4665,
+      "step": 45350
+    },
+    {
+      "epoch": 120.96,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002973561489810944,
+      "loss": 0.4652,
+      "step": 45360
+    },
+    {
+      "epoch": 120.98666666666666,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0002973549735883139,
+      "loss": 0.4785,
+      "step": 45370
+    },
+    {
+      "epoch": 121.0,
+      "eval_loss": 0.4802832305431366,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0946,
+      "eval_samples_per_second": 1.585,
+      "eval_steps_per_second": 0.099,
+      "step": 45375
+    },
+    {
+      "epoch": 121.01333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002973537979366399,
+      "loss": 0.4687,
+      "step": 45380
+    },
+    {
+      "epoch": 121.04,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029735262202607433,
+      "loss": 0.4921,
+      "step": 45390
+    },
+    {
+      "epoch": 121.06666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029735144585661937,
+      "loss": 0.4797,
+      "step": 45400
+    },
+    {
+      "epoch": 121.09333333333333,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029735026942827693,
+      "loss": 0.4746,
+      "step": 45410
+    },
+    {
+      "epoch": 121.12,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002973490927410493,
+      "loss": 0.4712,
+      "step": 45420
+    },
+    {
+      "epoch": 121.14666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002973479157949384,
+      "loss": 0.4799,
+      "step": 45430
+    },
+    {
+      "epoch": 121.17333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002973467385899463,
+      "loss": 0.4701,
+      "step": 45440
+    },
+    {
+      "epoch": 121.2,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002973455611260751,
+      "loss": 0.468,
+      "step": 45450
+    },
+    {
+      "epoch": 121.22666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029734438340332683,
+      "loss": 0.4665,
+      "step": 45460
+    },
+    {
+      "epoch": 121.25333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002973432054217036,
+      "loss": 0.4663,
+      "step": 45470
+    },
+    {
+      "epoch": 121.28,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002973420271812075,
+      "loss": 0.4798,
+      "step": 45480
+    },
+    {
+      "epoch": 121.30666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002973408486818406,
+      "loss": 0.461,
+      "step": 45490
+    },
+    {
+      "epoch": 121.33333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002973396699236049,
+      "loss": 0.4765,
+      "step": 45500
+    },
+    {
+      "epoch": 121.36,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002973384909065026,
+      "loss": 0.4781,
+      "step": 45510
+    },
+    {
+      "epoch": 121.38666666666667,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029733731163053556,
+      "loss": 0.4701,
+      "step": 45520
+    },
+    {
+      "epoch": 121.41333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029733613209570605,
+      "loss": 0.4742,
+      "step": 45530
+    },
+    {
+      "epoch": 121.44,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029733495230201606,
+      "loss": 0.4778,
+      "step": 45540
+    },
+    {
+      "epoch": 121.46666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029733377224946765,
+      "loss": 0.4865,
+      "step": 45550
+    },
+    {
+      "epoch": 121.49333333333334,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002973325919380629,
+      "loss": 0.4776,
+      "step": 45560
+    },
+    {
+      "epoch": 121.52,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000297331411367804,
+      "loss": 0.4727,
+      "step": 45570
+    },
+    {
+      "epoch": 121.54666666666667,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002973302305386928,
+      "loss": 0.4689,
+      "step": 45580
+    },
+    {
+      "epoch": 121.57333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002973290494507316,
+      "loss": 0.4684,
+      "step": 45590
+    },
+    {
+      "epoch": 121.6,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002973278681039223,
+      "loss": 0.4624,
+      "step": 45600
+    },
+    {
+      "epoch": 121.62666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002973266864982671,
+      "loss": 0.4492,
+      "step": 45610
+    },
+    {
+      "epoch": 121.65333333333334,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029732550463376796,
+      "loss": 0.4688,
+      "step": 45620
+    },
+    {
+      "epoch": 121.68,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029732432251042707,
+      "loss": 0.4599,
+      "step": 45630
+    },
+    {
+      "epoch": 121.70666666666666,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.00029732314012824646,
+      "loss": 0.4605,
+      "step": 45640
+    },
+    {
+      "epoch": 121.73333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002973219574872282,
+      "loss": 0.4801,
+      "step": 45650
+    },
+    {
+      "epoch": 121.76,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002973207745873744,
+      "loss": 0.4807,
+      "step": 45660
+    },
+    {
+      "epoch": 121.78666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029731959142868705,
+      "loss": 0.4798,
+      "step": 45670
+    },
+    {
+      "epoch": 121.81333333333333,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029731840801116834,
+      "loss": 0.4652,
+      "step": 45680
+    },
+    {
+      "epoch": 121.84,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002973172243348203,
+      "loss": 0.4706,
+      "step": 45690
+    },
+    {
+      "epoch": 121.86666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.000297316040399645,
+      "loss": 0.4699,
+      "step": 45700
+    },
+    {
+      "epoch": 121.89333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002973148562056445,
+      "loss": 0.4611,
+      "step": 45710
+    },
+    {
+      "epoch": 121.92,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002973136717528209,
+      "loss": 0.471,
+      "step": 45720
+    },
+    {
+      "epoch": 121.94666666666667,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029731248704117637,
+      "loss": 0.464,
+      "step": 45730
+    },
+    {
+      "epoch": 121.97333333333333,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029731130207071287,
+      "loss": 0.4748,
+      "step": 45740
+    },
+    {
+      "epoch": 122.0,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029731011684143254,
+      "loss": 0.4637,
+      "step": 45750
+    },
+    {
+      "epoch": 122.0,
+      "eval_loss": 0.48008567094802856,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.8847,
+      "eval_samples_per_second": 1.47,
+      "eval_steps_per_second": 0.092,
+      "step": 45750
+    },
+    {
+      "epoch": 122.02666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002973089313533374,
+      "loss": 0.4841,
+      "step": 45760
+    },
+    {
+      "epoch": 122.05333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002973077456064296,
+      "loss": 0.4894,
+      "step": 45770
+    },
+    {
+      "epoch": 122.08,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002973065596007112,
+      "loss": 0.477,
+      "step": 45780
+    },
+    {
+      "epoch": 122.10666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029730537333618433,
+      "loss": 0.4715,
+      "step": 45790
+    },
+    {
+      "epoch": 122.13333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000297304186812851,
+      "loss": 0.473,
+      "step": 45800
+    },
+    {
+      "epoch": 122.16,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029730300003071334,
+      "loss": 0.4781,
+      "step": 45810
+    },
+    {
+      "epoch": 122.18666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002973018129897734,
+      "loss": 0.47,
+      "step": 45820
+    },
+    {
+      "epoch": 122.21333333333334,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00029730062569003333,
+      "loss": 0.4648,
+      "step": 45830
+    },
+    {
+      "epoch": 122.24,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002972994381314951,
+      "loss": 0.4683,
+      "step": 45840
+    },
+    {
+      "epoch": 122.26666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029729825031416093,
+      "loss": 0.4729,
+      "step": 45850
+    },
+    {
+      "epoch": 122.29333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029729706223803284,
+      "loss": 0.4668,
+      "step": 45860
+    },
+    {
+      "epoch": 122.32,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002972958739031129,
+      "loss": 0.4707,
+      "step": 45870
+    },
+    {
+      "epoch": 122.34666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002972946853094032,
+      "loss": 0.4779,
+      "step": 45880
+    },
+    {
+      "epoch": 122.37333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002972934964569059,
+      "loss": 0.4743,
+      "step": 45890
+    },
+    {
+      "epoch": 122.4,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.000297292307345623,
+      "loss": 0.4713,
+      "step": 45900
+    },
+    {
+      "epoch": 122.42666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029729111797555666,
+      "loss": 0.4775,
+      "step": 45910
+    },
+    {
+      "epoch": 122.45333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002972899283467089,
+      "loss": 0.4787,
+      "step": 45920
+    },
+    {
+      "epoch": 122.48,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029728873845908185,
+      "loss": 0.4861,
+      "step": 45930
+    },
+    {
+      "epoch": 122.50666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002972875483126776,
+      "loss": 0.4742,
+      "step": 45940
+    },
+    {
+      "epoch": 122.53333333333333,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0002972863579074983,
+      "loss": 0.4687,
+      "step": 45950
+    },
+    {
+      "epoch": 122.56,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029728516724354594,
+      "loss": 0.4706,
+      "step": 45960
+    },
+    {
+      "epoch": 122.58666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002972839763208226,
+      "loss": 0.4655,
+      "step": 45970
+    },
+    {
+      "epoch": 122.61333333333333,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002972827851393305,
+      "loss": 0.4526,
+      "step": 45980
+    },
+    {
+      "epoch": 122.64,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029728159369907166,
+      "loss": 0.4568,
+      "step": 45990
+    },
+    {
+      "epoch": 122.66666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029728040200004813,
+      "loss": 0.4731,
+      "step": 46000
+    },
+    {
+      "epoch": 122.69333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002972792100422621,
+      "loss": 0.4549,
+      "step": 46010
+    },
+    {
+      "epoch": 122.72,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002972780178257155,
+      "loss": 0.4724,
+      "step": 46020
+    },
+    {
+      "epoch": 122.74666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002972768253504106,
+      "loss": 0.4772,
+      "step": 46030
+    },
+    {
+      "epoch": 122.77333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002972756326163494,
+      "loss": 0.484,
+      "step": 46040
+    },
+    {
+      "epoch": 122.8,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029727443962353407,
+      "loss": 0.4716,
+      "step": 46050
+    },
+    {
+      "epoch": 122.82666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029727324637196664,
+      "loss": 0.4653,
+      "step": 46060
+    },
+    {
+      "epoch": 122.85333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002972720528616492,
+      "loss": 0.4748,
+      "step": 46070
+    },
+    {
+      "epoch": 122.88,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002972708590925839,
+      "loss": 0.4626,
+      "step": 46080
+    },
+    {
+      "epoch": 122.90666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002972696650647728,
+      "loss": 0.4653,
+      "step": 46090
+    },
+    {
+      "epoch": 122.93333333333334,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.000297268470778218,
+      "loss": 0.4676,
+      "step": 46100
+    },
+    {
+      "epoch": 122.96,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029726727623292167,
+      "loss": 0.4654,
+      "step": 46110
+    },
+    {
+      "epoch": 122.98666666666666,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002972660814288858,
+      "loss": 0.4792,
+      "step": 46120
+    },
+    {
+      "epoch": 123.0,
+      "eval_loss": 0.47855737805366516,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2518,
+      "eval_samples_per_second": 1.561,
+      "eval_steps_per_second": 0.098,
+      "step": 46125
+    },
+    {
+      "epoch": 123.01333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002972648863661125,
+      "loss": 0.4678,
+      "step": 46130
+    },
+    {
+      "epoch": 123.04,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029726369104460395,
+      "loss": 0.4923,
+      "step": 46140
+    },
+    {
+      "epoch": 123.06666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029726249546436215,
+      "loss": 0.4795,
+      "step": 46150
+    },
+    {
+      "epoch": 123.09333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029726129962538933,
+      "loss": 0.4745,
+      "step": 46160
+    },
+    {
+      "epoch": 123.12,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002972601035276875,
+      "loss": 0.4709,
+      "step": 46170
+    },
+    {
+      "epoch": 123.14666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029725890717125875,
+      "loss": 0.4801,
+      "step": 46180
+    },
+    {
+      "epoch": 123.17333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002972577105561052,
+      "loss": 0.47,
+      "step": 46190
+    },
+    {
+      "epoch": 123.2,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.000297256513682229,
+      "loss": 0.4686,
+      "step": 46200
+    },
+    {
+      "epoch": 123.22666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029725531654963217,
+      "loss": 0.4666,
+      "step": 46210
+    },
+    {
+      "epoch": 123.25333333333333,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002972541191583169,
+      "loss": 0.4659,
+      "step": 46220
+    },
+    {
+      "epoch": 123.28,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002972529215082852,
+      "loss": 0.4796,
+      "step": 46230
+    },
+    {
+      "epoch": 123.30666666666667,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002972517235995393,
+      "loss": 0.4601,
+      "step": 46240
+    },
+    {
+      "epoch": 123.33333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002972505254320812,
+      "loss": 0.4762,
+      "step": 46250
+    },
+    {
+      "epoch": 123.36,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029724932700591304,
+      "loss": 0.4785,
+      "step": 46260
+    },
+    {
+      "epoch": 123.38666666666667,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029724812832103686,
+      "loss": 0.4694,
+      "step": 46270
+    },
+    {
+      "epoch": 123.41333333333333,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029724692937745486,
+      "loss": 0.4741,
+      "step": 46280
+    },
+    {
+      "epoch": 123.44,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029724573017516915,
+      "loss": 0.4774,
+      "step": 46290
+    },
+    {
+      "epoch": 123.46666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029724453071418175,
+      "loss": 0.4869,
+      "step": 46300
+    },
+    {
+      "epoch": 123.49333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002972433309944949,
+      "loss": 0.4782,
+      "step": 46310
+    },
+    {
+      "epoch": 123.52,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029724213101611053,
+      "loss": 0.4726,
+      "step": 46320
+    },
+    {
+      "epoch": 123.54666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002972409307790309,
+      "loss": 0.4695,
+      "step": 46330
+    },
+    {
+      "epoch": 123.57333333333334,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029723973028325805,
+      "loss": 0.4688,
+      "step": 46340
+    },
+    {
+      "epoch": 123.6,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002972385295287941,
+      "loss": 0.4626,
+      "step": 46350
+    },
+    {
+      "epoch": 123.62666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002972373285156411,
+      "loss": 0.4492,
+      "step": 46360
+    },
+    {
+      "epoch": 123.65333333333334,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029723612724380136,
+      "loss": 0.4686,
+      "step": 46370
+    },
+    {
+      "epoch": 123.68,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029723492571327677,
+      "loss": 0.4599,
+      "step": 46380
+    },
+    {
+      "epoch": 123.70666666666666,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002972337239240695,
+      "loss": 0.4616,
+      "step": 46390
+    },
+    {
+      "epoch": 123.73333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029723252187618173,
+      "loss": 0.4805,
+      "step": 46400
+    },
+    {
+      "epoch": 123.76,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002972313195696155,
+      "loss": 0.481,
+      "step": 46410
+    },
+    {
+      "epoch": 123.78666666666666,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.000297230117004373,
+      "loss": 0.4796,
+      "step": 46420
+    },
+    {
+      "epoch": 123.81333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029722891418045624,
+      "loss": 0.4647,
+      "step": 46430
+    },
+    {
+      "epoch": 123.84,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002972277110978674,
+      "loss": 0.4706,
+      "step": 46440
+    },
+    {
+      "epoch": 123.86666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029722650775660856,
+      "loss": 0.4697,
+      "step": 46450
+    },
+    {
+      "epoch": 123.89333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002972253041566819,
+      "loss": 0.4609,
+      "step": 46460
+    },
+    {
+      "epoch": 123.92,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002972241002980894,
+      "loss": 0.4714,
+      "step": 46470
+    },
+    {
+      "epoch": 123.94666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029722289618083335,
+      "loss": 0.4639,
+      "step": 46480
+    },
+    {
+      "epoch": 123.97333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002972216918049158,
+      "loss": 0.4759,
+      "step": 46490
+    },
+    {
+      "epoch": 124.0,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029722048717033874,
+      "loss": 0.4644,
+      "step": 46500
+    },
+    {
+      "epoch": 124.0,
+      "eval_loss": 0.478996604681015,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2562,
+      "eval_samples_per_second": 1.56,
+      "eval_steps_per_second": 0.098,
+      "step": 46500
+    },
+    {
+      "epoch": 124.02666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029721928227710446,
+      "loss": 0.4836,
+      "step": 46510
+    },
+    {
+      "epoch": 124.05333333333333,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000297218077125215,
+      "loss": 0.4899,
+      "step": 46520
+    },
+    {
+      "epoch": 124.08,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029721687171467245,
+      "loss": 0.476,
+      "step": 46530
+    },
+    {
+      "epoch": 124.10666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000297215666045479,
+      "loss": 0.4721,
+      "step": 46540
+    },
+    {
+      "epoch": 124.13333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002972144601176367,
+      "loss": 0.4734,
+      "step": 46550
+    },
+    {
+      "epoch": 124.16,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029721325393114776,
+      "loss": 0.4774,
+      "step": 46560
+    },
+    {
+      "epoch": 124.18666666666667,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002972120474860142,
+      "loss": 0.4699,
+      "step": 46570
+    },
+    {
+      "epoch": 124.21333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029721084078223817,
+      "loss": 0.4647,
+      "step": 46580
+    },
+    {
+      "epoch": 124.24,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002972096338198218,
+      "loss": 0.4693,
+      "step": 46590
+    },
+    {
+      "epoch": 124.26666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002972084265987672,
+      "loss": 0.4729,
+      "step": 46600
+    },
+    {
+      "epoch": 124.29333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002972072191190765,
+      "loss": 0.4669,
+      "step": 46610
+    },
+    {
+      "epoch": 124.32,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029720601138075186,
+      "loss": 0.4697,
+      "step": 46620
+    },
+    {
+      "epoch": 124.34666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002972048033837954,
+      "loss": 0.4783,
+      "step": 46630
+    },
+    {
+      "epoch": 124.37333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002972035951282091,
+      "loss": 0.4745,
+      "step": 46640
+    },
+    {
+      "epoch": 124.4,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029720238661399526,
+      "loss": 0.4708,
+      "step": 46650
+    },
+    {
+      "epoch": 124.42666666666666,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002972011778411559,
+      "loss": 0.4779,
+      "step": 46660
+    },
+    {
+      "epoch": 124.45333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002971999688096932,
+      "loss": 0.4792,
+      "step": 46670
+    },
+    {
+      "epoch": 124.48,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002971987595196093,
+      "loss": 0.4856,
+      "step": 46680
+    },
+    {
+      "epoch": 124.50666666666666,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002971975499709062,
+      "loss": 0.4754,
+      "step": 46690
+    },
+    {
+      "epoch": 124.53333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029719634016358616,
+      "loss": 0.4688,
+      "step": 46700
+    },
+    {
+      "epoch": 124.56,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029719513009765123,
+      "loss": 0.4697,
+      "step": 46710
+    },
+    {
+      "epoch": 124.58666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029719391977310356,
+      "loss": 0.4657,
+      "step": 46720
+    },
+    {
+      "epoch": 124.61333333333333,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.00029719270918994526,
+      "loss": 0.4521,
+      "step": 46730
+    },
+    {
+      "epoch": 124.64,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029719149834817857,
+      "loss": 0.4568,
+      "step": 46740
+    },
+    {
+      "epoch": 124.66666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002971902872478054,
+      "loss": 0.4727,
+      "step": 46750
+    },
+    {
+      "epoch": 124.69333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002971890758888281,
+      "loss": 0.4552,
+      "step": 46760
+    },
+    {
+      "epoch": 124.72,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002971878642712486,
+      "loss": 0.4725,
+      "step": 46770
+    },
+    {
+      "epoch": 124.74666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029718665239506924,
+      "loss": 0.4767,
+      "step": 46780
+    },
+    {
+      "epoch": 124.77333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000297185440260292,
+      "loss": 0.4842,
+      "step": 46790
+    },
+    {
+      "epoch": 124.8,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.000297184227866919,
+      "loss": 0.4713,
+      "step": 46800
+    },
+    {
+      "epoch": 124.82666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002971830152149524,
+      "loss": 0.4664,
+      "step": 46810
+    },
+    {
+      "epoch": 124.85333333333334,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.0002971818023043944,
+      "loss": 0.4745,
+      "step": 46820
+    },
+    {
+      "epoch": 124.88,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002971805891352471,
+      "loss": 0.4628,
+      "step": 46830
+    },
+    {
+      "epoch": 124.90666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002971793757075126,
+      "loss": 0.4662,
+      "step": 46840
+    },
+    {
+      "epoch": 124.93333333333334,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.000297178162021193,
+      "loss": 0.4669,
+      "step": 46850
+    },
+    {
+      "epoch": 124.96,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029717694807629045,
+      "loss": 0.4652,
+      "step": 46860
+    },
+    {
+      "epoch": 124.98666666666666,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002971757338728072,
+      "loss": 0.4793,
+      "step": 46870
+    },
+    {
+      "epoch": 125.0,
+      "eval_loss": 0.4781440794467926,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.8307,
+      "eval_samples_per_second": 1.477,
+      "eval_steps_per_second": 0.092,
+      "step": 46875
+    },
+    {
+      "epoch": 125.01333333333334,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029717451941074525,
+      "loss": 0.4684,
+      "step": 46880
+    },
+    {
+      "epoch": 125.04,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029717330469010677,
+      "loss": 0.4925,
+      "step": 46890
+    },
+    {
+      "epoch": 125.06666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002971720897108939,
+      "loss": 0.4796,
+      "step": 46900
+    },
+    {
+      "epoch": 125.09333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029717087447310876,
+      "loss": 0.4745,
+      "step": 46910
+    },
+    {
+      "epoch": 125.12,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029716965897675344,
+      "loss": 0.4713,
+      "step": 46920
+    },
+    {
+      "epoch": 125.14666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002971684432218302,
+      "loss": 0.4797,
+      "step": 46930
+    },
+    {
+      "epoch": 125.17333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029716722720834115,
+      "loss": 0.4707,
+      "step": 46940
+    },
+    {
+      "epoch": 125.2,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002971660109362883,
+      "loss": 0.4685,
+      "step": 46950
+    },
+    {
+      "epoch": 125.22666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029716479440567394,
+      "loss": 0.4667,
+      "step": 46960
+    },
+    {
+      "epoch": 125.25333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029716357761650007,
+      "loss": 0.4658,
+      "step": 46970
+    },
+    {
+      "epoch": 125.28,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0002971623605687689,
+      "loss": 0.4799,
+      "step": 46980
+    },
+    {
+      "epoch": 125.30666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029716114326248266,
+      "loss": 0.4605,
+      "step": 46990
+    },
+    {
+      "epoch": 125.33333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029715992569764334,
+      "loss": 0.4769,
+      "step": 47000
+    },
+    {
+      "epoch": 125.36,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029715870787425314,
+      "loss": 0.4779,
+      "step": 47010
+    },
+    {
+      "epoch": 125.38666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029715748979231417,
+      "loss": 0.4707,
+      "step": 47020
+    },
+    {
+      "epoch": 125.41333333333333,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002971562714518286,
+      "loss": 0.4745,
+      "step": 47030
+    },
+    {
+      "epoch": 125.44,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0002971550528527986,
+      "loss": 0.4773,
+      "step": 47040
+    },
+    {
+      "epoch": 125.46666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002971538339952262,
+      "loss": 0.4864,
+      "step": 47050
+    },
+    {
+      "epoch": 125.49333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002971526148791137,
+      "loss": 0.4778,
+      "step": 47060
+    },
+    {
+      "epoch": 125.52,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002971513955044631,
+      "loss": 0.4728,
+      "step": 47070
+    },
+    {
+      "epoch": 125.54666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002971501758712766,
+      "loss": 0.4692,
+      "step": 47080
+    },
+    {
+      "epoch": 125.57333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002971489559795564,
+      "loss": 0.468,
+      "step": 47090
+    },
+    {
+      "epoch": 125.6,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029714773582930454,
+      "loss": 0.4618,
+      "step": 47100
+    },
+    {
+      "epoch": 125.62666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029714651542052324,
+      "loss": 0.4493,
+      "step": 47110
+    },
+    {
+      "epoch": 125.65333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002971452947532146,
+      "loss": 0.4687,
+      "step": 47120
+    },
+    {
+      "epoch": 125.68,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002971440738273808,
+      "loss": 0.4604,
+      "step": 47130
+    },
+    {
+      "epoch": 125.70666666666666,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029714285264302386,
+      "loss": 0.461,
+      "step": 47140
+    },
+    {
+      "epoch": 125.73333333333333,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002971416312001462,
+      "loss": 0.4794,
+      "step": 47150
+    },
+    {
+      "epoch": 125.76,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002971404094987497,
+      "loss": 0.4803,
+      "step": 47160
+    },
+    {
+      "epoch": 125.78666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002971391875388366,
+      "loss": 0.4795,
+      "step": 47170
+    },
+    {
+      "epoch": 125.81333333333333,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.00029713796532040905,
+      "loss": 0.4643,
+      "step": 47180
+    },
+    {
+      "epoch": 125.84,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002971367428434692,
+      "loss": 0.4705,
+      "step": 47190
+    },
+    {
+      "epoch": 125.86666666666666,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002971355201080192,
+      "loss": 0.4699,
+      "step": 47200
+    },
+    {
+      "epoch": 125.89333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002971342971140612,
+      "loss": 0.4611,
+      "step": 47210
+    },
+    {
+      "epoch": 125.92,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029713307386159733,
+      "loss": 0.4711,
+      "step": 47220
+    },
+    {
+      "epoch": 125.94666666666667,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029713185035062977,
+      "loss": 0.4638,
+      "step": 47230
+    },
+    {
+      "epoch": 125.97333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002971306265811606,
+      "loss": 0.4753,
+      "step": 47240
+    },
+    {
+      "epoch": 126.0,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029712940255319204,
+      "loss": 0.4644,
+      "step": 47250
+    },
+    {
+      "epoch": 126.0,
+      "eval_loss": 0.4798469841480255,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3496,
+      "eval_samples_per_second": 1.546,
+      "eval_steps_per_second": 0.097,
+      "step": 47250
+    },
+    {
+      "epoch": 126.02666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029712817826672627,
+      "loss": 0.4832,
+      "step": 47260
+    },
+    {
+      "epoch": 126.05333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029712695372176534,
+      "loss": 0.4898,
+      "step": 47270
+    },
+    {
+      "epoch": 126.08,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002971257289183115,
+      "loss": 0.4771,
+      "step": 47280
+    },
+    {
+      "epoch": 126.10666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002971245038563668,
+      "loss": 0.4716,
+      "step": 47290
+    },
+    {
+      "epoch": 126.13333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002971232785359335,
+      "loss": 0.4734,
+      "step": 47300
+    },
+    {
+      "epoch": 126.16,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002971220529570137,
+      "loss": 0.4778,
+      "step": 47310
+    },
+    {
+      "epoch": 126.18666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002971208271196095,
+      "loss": 0.4695,
+      "step": 47320
+    },
+    {
+      "epoch": 126.21333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002971196010237231,
+      "loss": 0.4644,
+      "step": 47330
+    },
+    {
+      "epoch": 126.24,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029711837466935676,
+      "loss": 0.4687,
+      "step": 47340
+    },
+    {
+      "epoch": 126.26666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002971171480565125,
+      "loss": 0.4723,
+      "step": 47350
+    },
+    {
+      "epoch": 126.29333333333334,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.00029711592118519245,
+      "loss": 0.4671,
+      "step": 47360
+    },
+    {
+      "epoch": 126.32,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002971146940553989,
+      "loss": 0.4704,
+      "step": 47370
+    },
+    {
+      "epoch": 126.34666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029711346666713386,
+      "loss": 0.4785,
+      "step": 47380
+    },
+    {
+      "epoch": 126.37333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002971122390203996,
+      "loss": 0.4742,
+      "step": 47390
+    },
+    {
+      "epoch": 126.4,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029711101111519825,
+      "loss": 0.4702,
+      "step": 47400
+    },
+    {
+      "epoch": 126.42666666666666,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002971097829515319,
+      "loss": 0.4773,
+      "step": 47410
+    },
+    {
+      "epoch": 126.45333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002971085545294028,
+      "loss": 0.4792,
+      "step": 47420
+    },
+    {
+      "epoch": 126.48,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029710732584881306,
+      "loss": 0.4853,
+      "step": 47430
+    },
+    {
+      "epoch": 126.50666666666666,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0002971060969097649,
+      "loss": 0.475,
+      "step": 47440
+    },
+    {
+      "epoch": 126.53333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029710486771226035,
+      "loss": 0.4694,
+      "step": 47450
+    },
+    {
+      "epoch": 126.56,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002971036382563017,
+      "loss": 0.4704,
+      "step": 47460
+    },
+    {
+      "epoch": 126.58666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000297102408541891,
+      "loss": 0.4666,
+      "step": 47470
+    },
+    {
+      "epoch": 126.61333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002971011785690305,
+      "loss": 0.4518,
+      "step": 47480
+    },
+    {
+      "epoch": 126.64,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029709994833772237,
+      "loss": 0.4578,
+      "step": 47490
+    },
+    {
+      "epoch": 126.66666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029709871784796866,
+      "loss": 0.4728,
+      "step": 47500
+    },
+    {
+      "epoch": 126.69333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029709748709977167,
+      "loss": 0.4551,
+      "step": 47510
+    },
+    {
+      "epoch": 126.72,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029709625609313346,
+      "loss": 0.4729,
+      "step": 47520
+    },
+    {
+      "epoch": 126.74666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00029709502482805626,
+      "loss": 0.4774,
+      "step": 47530
+    },
+    {
+      "epoch": 126.77333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029709379330454213,
+      "loss": 0.4844,
+      "step": 47540
+    },
+    {
+      "epoch": 126.8,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029709256152259334,
+      "loss": 0.4708,
+      "step": 47550
+    },
+    {
+      "epoch": 126.82666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.000297091329482212,
+      "loss": 0.4653,
+      "step": 47560
+    },
+    {
+      "epoch": 126.85333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029709009718340034,
+      "loss": 0.4742,
+      "step": 47570
+    },
+    {
+      "epoch": 126.88,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002970888646261605,
+      "loss": 0.4627,
+      "step": 47580
+    },
+    {
+      "epoch": 126.90666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002970876318104945,
+      "loss": 0.4648,
+      "step": 47590
+    },
+    {
+      "epoch": 126.93333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029708639873640474,
+      "loss": 0.4666,
+      "step": 47600
+    },
+    {
+      "epoch": 126.96,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029708516540389327,
+      "loss": 0.4647,
+      "step": 47610
+    },
+    {
+      "epoch": 126.98666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002970839318129622,
+      "loss": 0.4781,
+      "step": 47620
+    },
+    {
+      "epoch": 127.0,
+      "eval_loss": 0.47838637232780457,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6113,
+      "eval_samples_per_second": 1.508,
+      "eval_steps_per_second": 0.094,
+      "step": 47625
+    },
+    {
+      "epoch": 127.01333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002970826979636138,
+      "loss": 0.4677,
+      "step": 47630
+    },
+    {
+      "epoch": 127.04,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002970814638558502,
+      "loss": 0.4926,
+      "step": 47640
+    },
+    {
+      "epoch": 127.06666666666666,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00029708022948967354,
+      "loss": 0.4799,
+      "step": 47650
+    },
+    {
+      "epoch": 127.09333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002970789948650861,
+      "loss": 0.4745,
+      "step": 47660
+    },
+    {
+      "epoch": 127.12,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002970777599820899,
+      "loss": 0.4704,
+      "step": 47670
+    },
+    {
+      "epoch": 127.14666666666666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002970765248406871,
+      "loss": 0.4794,
+      "step": 47680
+    },
+    {
+      "epoch": 127.17333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029707528944088006,
+      "loss": 0.4701,
+      "step": 47690
+    },
+    {
+      "epoch": 127.2,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002970740537826708,
+      "loss": 0.4684,
+      "step": 47700
+    },
+    {
+      "epoch": 127.22666666666667,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0002970728178660615,
+      "loss": 0.4663,
+      "step": 47710
+    },
+    {
+      "epoch": 127.25333333333333,
+      "grad_norm": 0.5625,
+      "learning_rate": 0.00029707158169105436,
+      "loss": 0.4663,
+      "step": 47720
+    },
+    {
+      "epoch": 127.28,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029707034525765154,
+      "loss": 0.4801,
+      "step": 47730
+    },
+    {
+      "epoch": 127.30666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029706910856585526,
+      "loss": 0.4606,
+      "step": 47740
+    },
+    {
+      "epoch": 127.33333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029706787161566764,
+      "loss": 0.4764,
+      "step": 47750
+    },
+    {
+      "epoch": 127.36,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029706663440709084,
+      "loss": 0.4788,
+      "step": 47760
+    },
+    {
+      "epoch": 127.38666666666667,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0002970653969401271,
+      "loss": 0.4702,
+      "step": 47770
+    },
+    {
+      "epoch": 127.41333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029706415921477856,
+      "loss": 0.4746,
+      "step": 47780
+    },
+    {
+      "epoch": 127.44,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029706292123104735,
+      "loss": 0.4769,
+      "step": 47790
+    },
+    {
+      "epoch": 127.46666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002970616829889357,
+      "loss": 0.4867,
+      "step": 47800
+    },
+    {
+      "epoch": 127.49333333333334,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029706044448844576,
+      "loss": 0.478,
+      "step": 47810
+    },
+    {
+      "epoch": 127.52,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002970592057295797,
+      "loss": 0.4725,
+      "step": 47820
+    },
+    {
+      "epoch": 127.54666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002970579667123398,
+      "loss": 0.4688,
+      "step": 47830
+    },
+    {
+      "epoch": 127.57333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002970567274367281,
+      "loss": 0.4684,
+      "step": 47840
+    },
+    {
+      "epoch": 127.6,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002970554879027468,
+      "loss": 0.4613,
+      "step": 47850
+    },
+    {
+      "epoch": 127.62666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002970542481103981,
+      "loss": 0.4488,
+      "step": 47860
+    },
+    {
+      "epoch": 127.65333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029705300805968424,
+      "loss": 0.4684,
+      "step": 47870
+    },
+    {
+      "epoch": 127.68,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002970517677506073,
+      "loss": 0.4601,
+      "step": 47880
+    },
+    {
+      "epoch": 127.70666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002970505271831695,
+      "loss": 0.4603,
+      "step": 47890
+    },
+    {
+      "epoch": 127.73333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029704928635737306,
+      "loss": 0.4793,
+      "step": 47900
+    },
+    {
+      "epoch": 127.76,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002970480452732201,
+      "loss": 0.4805,
+      "step": 47910
+    },
+    {
+      "epoch": 127.78666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002970468039307128,
+      "loss": 0.4791,
+      "step": 47920
+    },
+    {
+      "epoch": 127.81333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002970455623298533,
+      "loss": 0.4643,
+      "step": 47930
+    },
+    {
+      "epoch": 127.84,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00029704432047064394,
+      "loss": 0.4696,
+      "step": 47940
+    },
+    {
+      "epoch": 127.86666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002970430783530868,
+      "loss": 0.47,
+      "step": 47950
+    },
+    {
+      "epoch": 127.89333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002970418359771841,
+      "loss": 0.4608,
+      "step": 47960
+    },
+    {
+      "epoch": 127.92,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029704059334293787,
+      "loss": 0.4716,
+      "step": 47970
+    },
+    {
+      "epoch": 127.94666666666667,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029703935045035046,
+      "loss": 0.4634,
+      "step": 47980
+    },
+    {
+      "epoch": 127.97333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.000297038107299424,
+      "loss": 0.4752,
+      "step": 47990
+    },
+    {
+      "epoch": 128.0,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002970368638901607,
+      "loss": 0.4643,
+      "step": 48000
+    },
+    {
+      "epoch": 128.0,
+      "eval_loss": 0.47970858216285706,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9496,
+      "eval_samples_per_second": 1.608,
+      "eval_steps_per_second": 0.101,
+      "step": 48000
+    },
+    {
+      "epoch": 128.02666666666667,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0002970356202225628,
+      "loss": 0.4841,
+      "step": 48010
+    },
+    {
+      "epoch": 128.05333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002970343762966323,
+      "loss": 0.4894,
+      "step": 48020
+    },
+    {
+      "epoch": 128.08,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029703313211237154,
+      "loss": 0.4765,
+      "step": 48030
+    },
+    {
+      "epoch": 128.10666666666665,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002970318876697827,
+      "loss": 0.4722,
+      "step": 48040
+    },
+    {
+      "epoch": 128.13333333333333,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029703064296886787,
+      "loss": 0.4733,
+      "step": 48050
+    },
+    {
+      "epoch": 128.16,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002970293980096293,
+      "loss": 0.4777,
+      "step": 48060
+    },
+    {
+      "epoch": 128.18666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002970281527920692,
+      "loss": 0.4689,
+      "step": 48070
+    },
+    {
+      "epoch": 128.21333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002970269073161897,
+      "loss": 0.4643,
+      "step": 48080
+    },
+    {
+      "epoch": 128.24,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029702566158199304,
+      "loss": 0.4688,
+      "step": 48090
+    },
+    {
+      "epoch": 128.26666666666668,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002970244155894814,
+      "loss": 0.4729,
+      "step": 48100
+    },
+    {
+      "epoch": 128.29333333333332,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029702316933865697,
+      "loss": 0.4669,
+      "step": 48110
+    },
+    {
+      "epoch": 128.32,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029702192282952185,
+      "loss": 0.4695,
+      "step": 48120
+    },
+    {
+      "epoch": 128.34666666666666,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029702067606207833,
+      "loss": 0.4785,
+      "step": 48130
+    },
+    {
+      "epoch": 128.37333333333333,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00029701942903632864,
+      "loss": 0.4739,
+      "step": 48140
+    },
+    {
+      "epoch": 128.4,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0002970181817522749,
+      "loss": 0.471,
+      "step": 48150
+    },
+    {
+      "epoch": 128.42666666666668,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0002970169342099192,
+      "loss": 0.4775,
+      "step": 48160
+    },
+    {
+      "epoch": 128.45333333333335,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.000297015686409264,
+      "loss": 0.4797,
+      "step": 48170
+    },
+    {
+      "epoch": 128.48,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002970144383503112,
+      "loss": 0.4853,
+      "step": 48180
+    },
+    {
+      "epoch": 128.50666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029701319003306323,
+      "loss": 0.4752,
+      "step": 48190
+    },
+    {
+      "epoch": 128.53333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002970119414575221,
+      "loss": 0.4686,
+      "step": 48200
+    },
+    {
+      "epoch": 128.56,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002970106926236901,
+      "loss": 0.4697,
+      "step": 48210
+    },
+    {
+      "epoch": 128.58666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002970094435315694,
+      "loss": 0.4653,
+      "step": 48220
+    },
+    {
+      "epoch": 128.61333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002970081941811622,
+      "loss": 0.4517,
+      "step": 48230
+    },
+    {
+      "epoch": 128.64,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029700694457247075,
+      "loss": 0.4569,
+      "step": 48240
+    },
+    {
+      "epoch": 128.66666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029700569470549716,
+      "loss": 0.472,
+      "step": 48250
+    },
+    {
+      "epoch": 128.69333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002970044445802437,
+      "loss": 0.4552,
+      "step": 48260
+    },
+    {
+      "epoch": 128.72,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002970031941967124,
+      "loss": 0.472,
+      "step": 48270
+    },
+    {
+      "epoch": 128.74666666666667,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002970019435549057,
+      "loss": 0.4763,
+      "step": 48280
+    },
+    {
+      "epoch": 128.77333333333334,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002970006926548256,
+      "loss": 0.4842,
+      "step": 48290
+    },
+    {
+      "epoch": 128.8,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029699944149647446,
+      "loss": 0.4708,
+      "step": 48300
+    },
+    {
+      "epoch": 128.82666666666665,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029699819007985434,
+      "loss": 0.4657,
+      "step": 48310
+    },
+    {
+      "epoch": 128.85333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002969969384049675,
+      "loss": 0.4748,
+      "step": 48320
+    },
+    {
+      "epoch": 128.88,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002969956864718161,
+      "loss": 0.4618,
+      "step": 48330
+    },
+    {
+      "epoch": 128.90666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029699443428040246,
+      "loss": 0.4652,
+      "step": 48340
+    },
+    {
+      "epoch": 128.93333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002969931818307286,
+      "loss": 0.4673,
+      "step": 48350
+    },
+    {
+      "epoch": 128.96,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002969919291227969,
+      "loss": 0.4648,
+      "step": 48360
+    },
+    {
+      "epoch": 128.98666666666668,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002969906761566094,
+      "loss": 0.4788,
+      "step": 48370
+    },
+    {
+      "epoch": 129.0,
+      "eval_loss": 0.47966912388801575,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9567,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 0.1,
+      "step": 48375
+    },
+    {
+      "epoch": 129.01333333333332,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002969894229321684,
+      "loss": 0.4686,
+      "step": 48380
+    },
+    {
+      "epoch": 129.04,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002969881694494761,
+      "loss": 0.4921,
+      "step": 48390
+    },
+    {
+      "epoch": 129.06666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002969869157085346,
+      "loss": 0.4794,
+      "step": 48400
+    },
+    {
+      "epoch": 129.09333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029698566170934627,
+      "loss": 0.4736,
+      "step": 48410
+    },
+    {
+      "epoch": 129.12,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029698440745191317,
+      "loss": 0.471,
+      "step": 48420
+    },
+    {
+      "epoch": 129.14666666666668,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002969831529362376,
+      "loss": 0.479,
+      "step": 48430
+    },
+    {
+      "epoch": 129.17333333333335,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002969818981623217,
+      "loss": 0.4695,
+      "step": 48440
+    },
+    {
+      "epoch": 129.2,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002969806431301677,
+      "loss": 0.4678,
+      "step": 48450
+    },
+    {
+      "epoch": 129.22666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029697938783977777,
+      "loss": 0.466,
+      "step": 48460
+    },
+    {
+      "epoch": 129.25333333333333,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0002969781322911542,
+      "loss": 0.4658,
+      "step": 48470
+    },
+    {
+      "epoch": 129.28,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002969768764842991,
+      "loss": 0.4796,
+      "step": 48480
+    },
+    {
+      "epoch": 129.30666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029697562041921475,
+      "loss": 0.4613,
+      "step": 48490
+    },
+    {
+      "epoch": 129.33333333333334,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002969743640959033,
+      "loss": 0.4767,
+      "step": 48500
+    },
+    {
+      "epoch": 129.36,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.000296973107514367,
+      "loss": 0.4786,
+      "step": 48510
+    },
+    {
+      "epoch": 129.38666666666666,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029697185067460805,
+      "loss": 0.4697,
+      "step": 48520
+    },
+    {
+      "epoch": 129.41333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002969705935766287,
+      "loss": 0.4744,
+      "step": 48530
+    },
+    {
+      "epoch": 129.44,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.000296969336220431,
+      "loss": 0.4768,
+      "step": 48540
+    },
+    {
+      "epoch": 129.46666666666667,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.0002969680786060173,
+      "loss": 0.4873,
+      "step": 48550
+    },
+    {
+      "epoch": 129.49333333333334,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029696682073338985,
+      "loss": 0.4782,
+      "step": 48560
+    },
+    {
+      "epoch": 129.52,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002969655626025507,
+      "loss": 0.4721,
+      "step": 48570
+    },
+    {
+      "epoch": 129.54666666666665,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002969643042135022,
+      "loss": 0.4689,
+      "step": 48580
+    },
+    {
+      "epoch": 129.57333333333332,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002969630455662465,
+      "loss": 0.4687,
+      "step": 48590
+    },
+    {
+      "epoch": 129.6,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002969617866607858,
+      "loss": 0.4625,
+      "step": 48600
+    },
+    {
+      "epoch": 129.62666666666667,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.00029696052749712235,
+      "loss": 0.4495,
+      "step": 48610
+    },
+    {
+      "epoch": 129.65333333333334,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029695926807525835,
+      "loss": 0.4686,
+      "step": 48620
+    },
+    {
+      "epoch": 129.68,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029695800839519594,
+      "loss": 0.46,
+      "step": 48630
+    },
+    {
+      "epoch": 129.70666666666668,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002969567484569375,
+      "loss": 0.4605,
+      "step": 48640
+    },
+    {
+      "epoch": 129.73333333333332,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002969554882604851,
+      "loss": 0.4797,
+      "step": 48650
+    },
+    {
+      "epoch": 129.76,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000296954227805841,
+      "loss": 0.4812,
+      "step": 48660
+    },
+    {
+      "epoch": 129.78666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002969529670930074,
+      "loss": 0.4795,
+      "step": 48670
+    },
+    {
+      "epoch": 129.81333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002969517061219865,
+      "loss": 0.4639,
+      "step": 48680
+    },
+    {
+      "epoch": 129.84,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002969504448927806,
+      "loss": 0.4704,
+      "step": 48690
+    },
+    {
+      "epoch": 129.86666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029694918340539186,
+      "loss": 0.4689,
+      "step": 48700
+    },
+    {
+      "epoch": 129.89333333333335,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002969479216598224,
+      "loss": 0.4601,
+      "step": 48710
+    },
+    {
+      "epoch": 129.92,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029694665965607464,
+      "loss": 0.4713,
+      "step": 48720
+    },
+    {
+      "epoch": 129.94666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029694539739415064,
+      "loss": 0.463,
+      "step": 48730
+    },
+    {
+      "epoch": 129.97333333333333,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0002969441348740527,
+      "loss": 0.4752,
+      "step": 48740
+    },
+    {
+      "epoch": 130.0,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.000296942872095783,
+      "loss": 0.4645,
+      "step": 48750
+    },
+    {
+      "epoch": 130.0,
+      "eval_loss": 0.47943049669265747,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9185,
+      "eval_samples_per_second": 1.613,
+      "eval_steps_per_second": 0.101,
+      "step": 48750
+    },
+    {
+      "epoch": 130.02666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002969416090593437,
+      "loss": 0.4839,
+      "step": 48760
+    },
+    {
+      "epoch": 130.05333333333334,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029694034576473714,
+      "loss": 0.4896,
+      "step": 48770
+    },
+    {
+      "epoch": 130.08,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029693908221196545,
+      "loss": 0.4754,
+      "step": 48780
+    },
+    {
+      "epoch": 130.10666666666665,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002969378184010309,
+      "loss": 0.4716,
+      "step": 48790
+    },
+    {
+      "epoch": 130.13333333333333,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002969365543319357,
+      "loss": 0.4728,
+      "step": 48800
+    },
+    {
+      "epoch": 130.16,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.00029693529000468207,
+      "loss": 0.477,
+      "step": 48810
+    },
+    {
+      "epoch": 130.18666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002969340254192722,
+      "loss": 0.4699,
+      "step": 48820
+    },
+    {
+      "epoch": 130.21333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029693276057570834,
+      "loss": 0.4641,
+      "step": 48830
+    },
+    {
+      "epoch": 130.24,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029693149547399273,
+      "loss": 0.4682,
+      "step": 48840
+    },
+    {
+      "epoch": 130.26666666666668,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029693023011412756,
+      "loss": 0.472,
+      "step": 48850
+    },
+    {
+      "epoch": 130.29333333333332,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002969289644961151,
+      "loss": 0.4671,
+      "step": 48860
+    },
+    {
+      "epoch": 130.32,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.00029692769861995746,
+      "loss": 0.4702,
+      "step": 48870
+    },
+    {
+      "epoch": 130.34666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.000296926432485657,
+      "loss": 0.4789,
+      "step": 48880
+    },
+    {
+      "epoch": 130.37333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029692516609321586,
+      "loss": 0.4748,
+      "step": 48890
+    },
+    {
+      "epoch": 130.4,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002969238994426363,
+      "loss": 0.471,
+      "step": 48900
+    },
+    {
+      "epoch": 130.42666666666668,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029692263253392056,
+      "loss": 0.4774,
+      "step": 48910
+    },
+    {
+      "epoch": 130.45333333333335,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002969213653670708,
+      "loss": 0.4787,
+      "step": 48920
+    },
+    {
+      "epoch": 130.48,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029692009794208933,
+      "loss": 0.4856,
+      "step": 48930
+    },
+    {
+      "epoch": 130.50666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029691883025897834,
+      "loss": 0.4752,
+      "step": 48940
+    },
+    {
+      "epoch": 130.53333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029691756231774,
+      "loss": 0.4688,
+      "step": 48950
+    },
+    {
+      "epoch": 130.56,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002969162941183766,
+      "loss": 0.4698,
+      "step": 48960
+    },
+    {
+      "epoch": 130.58666666666667,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00029691502566089043,
+      "loss": 0.4654,
+      "step": 48970
+    },
+    {
+      "epoch": 130.61333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002969137569452836,
+      "loss": 0.4517,
+      "step": 48980
+    },
+    {
+      "epoch": 130.64,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002969124879715584,
+      "loss": 0.457,
+      "step": 48990
+    },
+    {
+      "epoch": 130.66666666666666,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.000296911218739717,
+      "loss": 0.4729,
+      "step": 49000
+    },
+    {
+      "epoch": 130.69333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002969099492497617,
+      "loss": 0.4555,
+      "step": 49010
+    },
+    {
+      "epoch": 130.72,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029690867950169474,
+      "loss": 0.4714,
+      "step": 49020
+    },
+    {
+      "epoch": 130.74666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029690740949551834,
+      "loss": 0.4767,
+      "step": 49030
+    },
+    {
+      "epoch": 130.77333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002969061392312346,
+      "loss": 0.4839,
+      "step": 49040
+    },
+    {
+      "epoch": 130.8,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002969048687088459,
+      "loss": 0.4714,
+      "step": 49050
+    },
+    {
+      "epoch": 130.82666666666665,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002969035979283545,
+      "loss": 0.4653,
+      "step": 49060
+    },
+    {
+      "epoch": 130.85333333333332,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029690232688976253,
+      "loss": 0.4741,
+      "step": 49070
+    },
+    {
+      "epoch": 130.88,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029690105559307227,
+      "loss": 0.4627,
+      "step": 49080
+    },
+    {
+      "epoch": 130.90666666666667,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002968997840382859,
+      "loss": 0.4654,
+      "step": 49090
+    },
+    {
+      "epoch": 130.93333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029689851222540573,
+      "loss": 0.4663,
+      "step": 49100
+    },
+    {
+      "epoch": 130.96,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029689724015443395,
+      "loss": 0.465,
+      "step": 49110
+    },
+    {
+      "epoch": 130.98666666666668,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002968959678253728,
+      "loss": 0.4788,
+      "step": 49120
+    },
+    {
+      "epoch": 131.0,
+      "eval_loss": 0.47895506024360657,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9459,
+      "eval_samples_per_second": 1.462,
+      "eval_steps_per_second": 0.091,
+      "step": 49125
+    },
+    {
+      "epoch": 131.01333333333332,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029689469523822454,
+      "loss": 0.4677,
+      "step": 49130
+    },
+    {
+      "epoch": 131.04,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002968934223929914,
+      "loss": 0.4924,
+      "step": 49140
+    },
+    {
+      "epoch": 131.06666666666666,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029689214928967555,
+      "loss": 0.4797,
+      "step": 49150
+    },
+    {
+      "epoch": 131.09333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002968908759282793,
+      "loss": 0.4743,
+      "step": 49160
+    },
+    {
+      "epoch": 131.12,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029688960230880493,
+      "loss": 0.4702,
+      "step": 49170
+    },
+    {
+      "epoch": 131.14666666666668,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002968883284312545,
+      "loss": 0.4795,
+      "step": 49180
+    },
+    {
+      "epoch": 131.17333333333335,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002968870542956305,
+      "loss": 0.4697,
+      "step": 49190
+    },
+    {
+      "epoch": 131.2,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002968857799019349,
+      "loss": 0.4685,
+      "step": 49200
+    },
+    {
+      "epoch": 131.22666666666666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002968845052501702,
+      "loss": 0.4661,
+      "step": 49210
+    },
+    {
+      "epoch": 131.25333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002968832303403384,
+      "loss": 0.4657,
+      "step": 49220
+    },
+    {
+      "epoch": 131.28,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002968819551724419,
+      "loss": 0.48,
+      "step": 49230
+    },
+    {
+      "epoch": 131.30666666666667,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.00029688067974648286,
+      "loss": 0.4606,
+      "step": 49240
+    },
+    {
+      "epoch": 131.33333333333334,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0002968794040624636,
+      "loss": 0.4766,
+      "step": 49250
+    },
+    {
+      "epoch": 131.36,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029687812812038625,
+      "loss": 0.4782,
+      "step": 49260
+    },
+    {
+      "epoch": 131.38666666666666,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029687685192025315,
+      "loss": 0.4695,
+      "step": 49270
+    },
+    {
+      "epoch": 131.41333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002968755754620665,
+      "loss": 0.4743,
+      "step": 49280
+    },
+    {
+      "epoch": 131.44,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029687429874582856,
+      "loss": 0.4774,
+      "step": 49290
+    },
+    {
+      "epoch": 131.46666666666667,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.00029687302177154156,
+      "loss": 0.4868,
+      "step": 49300
+    },
+    {
+      "epoch": 131.49333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002968717445392077,
+      "loss": 0.4775,
+      "step": 49310
+    },
+    {
+      "epoch": 131.52,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002968704670488294,
+      "loss": 0.4724,
+      "step": 49320
+    },
+    {
+      "epoch": 131.54666666666665,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029686918930040863,
+      "loss": 0.4699,
+      "step": 49330
+    },
+    {
+      "epoch": 131.57333333333332,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002968679112939479,
+      "loss": 0.4682,
+      "step": 49340
+    },
+    {
+      "epoch": 131.6,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029686663302944924,
+      "loss": 0.4619,
+      "step": 49350
+    },
+    {
+      "epoch": 131.62666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029686535450691503,
+      "loss": 0.4493,
+      "step": 49360
+    },
+    {
+      "epoch": 131.65333333333334,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0002968640757263475,
+      "loss": 0.468,
+      "step": 49370
+    },
+    {
+      "epoch": 131.68,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002968627966877488,
+      "loss": 0.4603,
+      "step": 49380
+    },
+    {
+      "epoch": 131.70666666666668,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029686151739112136,
+      "loss": 0.4607,
+      "step": 49390
+    },
+    {
+      "epoch": 131.73333333333332,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029686023783646724,
+      "loss": 0.4792,
+      "step": 49400
+    },
+    {
+      "epoch": 131.76,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029685895802378877,
+      "loss": 0.4805,
+      "step": 49410
+    },
+    {
+      "epoch": 131.78666666666666,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029685767795308823,
+      "loss": 0.479,
+      "step": 49420
+    },
+    {
+      "epoch": 131.81333333333333,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002968563976243678,
+      "loss": 0.4645,
+      "step": 49430
+    },
+    {
+      "epoch": 131.84,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029685511703762975,
+      "loss": 0.4707,
+      "step": 49440
+    },
+    {
+      "epoch": 131.86666666666667,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002968538361928764,
+      "loss": 0.47,
+      "step": 49450
+    },
+    {
+      "epoch": 131.89333333333335,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002968525550901099,
+      "loss": 0.4606,
+      "step": 49460
+    },
+    {
+      "epoch": 131.92,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029685127372933254,
+      "loss": 0.471,
+      "step": 49470
+    },
+    {
+      "epoch": 131.94666666666666,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002968499921105466,
+      "loss": 0.4636,
+      "step": 49480
+    },
+    {
+      "epoch": 131.97333333333333,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002968487102337543,
+      "loss": 0.4754,
+      "step": 49490
+    },
+    {
+      "epoch": 132.0,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0002968474280989579,
+      "loss": 0.4642,
+      "step": 49500
+    },
+    {
+      "epoch": 132.0,
+      "eval_loss": 0.4787776470184326,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.4216,
+      "eval_samples_per_second": 1.401,
+      "eval_steps_per_second": 0.088,
+      "step": 49500
+    },
+    {
+      "epoch": 132.02666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029684614570615967,
+      "loss": 0.4831,
+      "step": 49510
+    },
+    {
+      "epoch": 132.05333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029684486305536187,
+      "loss": 0.4898,
+      "step": 49520
+    },
+    {
+      "epoch": 132.08,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029684358014656667,
+      "loss": 0.4761,
+      "step": 49530
+    },
+    {
+      "epoch": 132.10666666666665,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0002968422969797764,
+      "loss": 0.4725,
+      "step": 49540
+    },
+    {
+      "epoch": 132.13333333333333,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029684101355499334,
+      "loss": 0.4729,
+      "step": 49550
+    },
+    {
+      "epoch": 132.16,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029683972987221965,
+      "loss": 0.4776,
+      "step": 49560
+    },
+    {
+      "epoch": 132.18666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002968384459314576,
+      "loss": 0.4695,
+      "step": 49570
+    },
+    {
+      "epoch": 132.21333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029683716173270957,
+      "loss": 0.4646,
+      "step": 49580
+    },
+    {
+      "epoch": 132.24,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002968358772759777,
+      "loss": 0.4676,
+      "step": 49590
+    },
+    {
+      "epoch": 132.26666666666668,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029683459256126427,
+      "loss": 0.4726,
+      "step": 49600
+    },
+    {
+      "epoch": 132.29333333333332,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029683330758857157,
+      "loss": 0.4667,
+      "step": 49610
+    },
+    {
+      "epoch": 132.32,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0002968320223579018,
+      "loss": 0.4697,
+      "step": 49620
+    },
+    {
+      "epoch": 132.34666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002968307368692572,
+      "loss": 0.4782,
+      "step": 49630
+    },
+    {
+      "epoch": 132.37333333333333,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.00029682945112264014,
+      "loss": 0.4745,
+      "step": 49640
+    },
+    {
+      "epoch": 132.4,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002968281651180528,
+      "loss": 0.4713,
+      "step": 49650
+    },
+    {
+      "epoch": 132.42666666666668,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002968268788554975,
+      "loss": 0.4767,
+      "step": 49660
+    },
+    {
+      "epoch": 132.45333333333335,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029682559233497644,
+      "loss": 0.4792,
+      "step": 49670
+    },
+    {
+      "epoch": 132.48,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029682430555649184,
+      "loss": 0.4857,
+      "step": 49680
+    },
+    {
+      "epoch": 132.50666666666666,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002968230185200461,
+      "loss": 0.4744,
+      "step": 49690
+    },
+    {
+      "epoch": 132.53333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029682173122564135,
+      "loss": 0.469,
+      "step": 49700
+    },
+    {
+      "epoch": 132.56,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002968204436732799,
+      "loss": 0.4702,
+      "step": 49710
+    },
+    {
+      "epoch": 132.58666666666667,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029681915586296406,
+      "loss": 0.4661,
+      "step": 49720
+    },
+    {
+      "epoch": 132.61333333333334,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.000296817867794696,
+      "loss": 0.4518,
+      "step": 49730
+    },
+    {
+      "epoch": 132.64,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.000296816579468478,
+      "loss": 0.4575,
+      "step": 49740
+    },
+    {
+      "epoch": 132.66666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002968152908843124,
+      "loss": 0.4721,
+      "step": 49750
+    },
+    {
+      "epoch": 132.69333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029681400204220143,
+      "loss": 0.4554,
+      "step": 49760
+    },
+    {
+      "epoch": 132.72,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002968127129421473,
+      "loss": 0.472,
+      "step": 49770
+    },
+    {
+      "epoch": 132.74666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029681142358415236,
+      "loss": 0.4757,
+      "step": 49780
+    },
+    {
+      "epoch": 132.77333333333334,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002968101339682188,
+      "loss": 0.4841,
+      "step": 49790
+    },
+    {
+      "epoch": 132.8,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029680884409434886,
+      "loss": 0.4705,
+      "step": 49800
+    },
+    {
+      "epoch": 132.82666666666665,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029680755396254495,
+      "loss": 0.466,
+      "step": 49810
+    },
+    {
+      "epoch": 132.85333333333332,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002968062635728092,
+      "loss": 0.4745,
+      "step": 49820
+    },
+    {
+      "epoch": 132.88,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029680497292514395,
+      "loss": 0.4631,
+      "step": 49830
+    },
+    {
+      "epoch": 132.90666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029680368201955145,
+      "loss": 0.4654,
+      "step": 49840
+    },
+    {
+      "epoch": 132.93333333333334,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029680239085603393,
+      "loss": 0.467,
+      "step": 49850
+    },
+    {
+      "epoch": 132.96,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002968010994345937,
+      "loss": 0.4645,
+      "step": 49860
+    },
+    {
+      "epoch": 132.98666666666668,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000296799807755233,
+      "loss": 0.4786,
+      "step": 49870
+    },
+    {
+      "epoch": 133.0,
+      "eval_loss": 0.4792354702949524,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5036,
+      "eval_samples_per_second": 1.523,
+      "eval_steps_per_second": 0.095,
+      "step": 49875
+    },
+    {
+      "epoch": 133.01333333333332,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002967985158179542,
+      "loss": 0.4681,
+      "step": 49880
+    },
+    {
+      "epoch": 133.04,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029679722362275943,
+      "loss": 0.4921,
+      "step": 49890
+    },
+    {
+      "epoch": 133.06666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.000296795931169651,
+      "loss": 0.4797,
+      "step": 49900
+    },
+    {
+      "epoch": 133.09333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029679463845863125,
+      "loss": 0.4746,
+      "step": 49910
+    },
+    {
+      "epoch": 133.12,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002967933454897024,
+      "loss": 0.4712,
+      "step": 49920
+    },
+    {
+      "epoch": 133.14666666666668,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029679205226286666,
+      "loss": 0.4798,
+      "step": 49930
+    },
+    {
+      "epoch": 133.17333333333335,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002967907587781264,
+      "loss": 0.4697,
+      "step": 49940
+    },
+    {
+      "epoch": 133.2,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029678946503548386,
+      "loss": 0.468,
+      "step": 49950
+    },
+    {
+      "epoch": 133.22666666666666,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029678817103494134,
+      "loss": 0.4664,
+      "step": 49960
+    },
+    {
+      "epoch": 133.25333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.000296786876776501,
+      "loss": 0.4654,
+      "step": 49970
+    },
+    {
+      "epoch": 133.28,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029678558226016525,
+      "loss": 0.4786,
+      "step": 49980
+    },
+    {
+      "epoch": 133.30666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029678428748593634,
+      "loss": 0.4603,
+      "step": 49990
+    },
+    {
+      "epoch": 133.33333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002967829924538165,
+      "loss": 0.4756,
+      "step": 50000
+    },
+    {
+      "epoch": 133.36,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000296781697163808,
+      "loss": 0.4776,
+      "step": 50010
+    },
+    {
+      "epoch": 133.38666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002967804016159131,
+      "loss": 0.4695,
+      "step": 50020
+    },
+    {
+      "epoch": 133.41333333333333,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0002967791058101342,
+      "loss": 0.4748,
+      "step": 50030
+    },
+    {
+      "epoch": 133.44,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029677780974647345,
+      "loss": 0.4773,
+      "step": 50040
+    },
+    {
+      "epoch": 133.46666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029677651342493316,
+      "loss": 0.4864,
+      "step": 50050
+    },
+    {
+      "epoch": 133.49333333333334,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029677521684551565,
+      "loss": 0.4777,
+      "step": 50060
+    },
+    {
+      "epoch": 133.52,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029677392000822315,
+      "loss": 0.4722,
+      "step": 50070
+    },
+    {
+      "epoch": 133.54666666666665,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029677262291305794,
+      "loss": 0.4701,
+      "step": 50080
+    },
+    {
+      "epoch": 133.57333333333332,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002967713255600223,
+      "loss": 0.4683,
+      "step": 50090
+    },
+    {
+      "epoch": 133.6,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029677002794911857,
+      "loss": 0.4618,
+      "step": 50100
+    },
+    {
+      "epoch": 133.62666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029676873008034887,
+      "loss": 0.4487,
+      "step": 50110
+    },
+    {
+      "epoch": 133.65333333333334,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002967674319537157,
+      "loss": 0.468,
+      "step": 50120
+    },
+    {
+      "epoch": 133.68,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0002967661335692212,
+      "loss": 0.4605,
+      "step": 50130
+    },
+    {
+      "epoch": 133.70666666666668,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029676483492686764,
+      "loss": 0.4606,
+      "step": 50140
+    },
+    {
+      "epoch": 133.73333333333332,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029676353602665743,
+      "loss": 0.4794,
+      "step": 50150
+    },
+    {
+      "epoch": 133.76,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029676223686859267,
+      "loss": 0.4806,
+      "step": 50160
+    },
+    {
+      "epoch": 133.78666666666666,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029676093745267584,
+      "loss": 0.4786,
+      "step": 50170
+    },
+    {
+      "epoch": 133.81333333333333,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.000296759637778909,
+      "loss": 0.4646,
+      "step": 50180
+    },
+    {
+      "epoch": 133.84,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0002967583378472947,
+      "loss": 0.471,
+      "step": 50190
+    },
+    {
+      "epoch": 133.86666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029675703765783494,
+      "loss": 0.4692,
+      "step": 50200
+    },
+    {
+      "epoch": 133.89333333333335,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002967557372105322,
+      "loss": 0.4609,
+      "step": 50210
+    },
+    {
+      "epoch": 133.92,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029675443650538873,
+      "loss": 0.4713,
+      "step": 50220
+    },
+    {
+      "epoch": 133.94666666666666,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002967531355424068,
+      "loss": 0.4639,
+      "step": 50230
+    },
+    {
+      "epoch": 133.97333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029675183432158864,
+      "loss": 0.4759,
+      "step": 50240
+    },
+    {
+      "epoch": 134.0,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0002967505328429366,
+      "loss": 0.4638,
+      "step": 50250
+    },
+    {
+      "epoch": 134.0,
+      "eval_loss": 0.47991687059402466,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8196,
+      "eval_samples_per_second": 1.629,
+      "eval_steps_per_second": 0.102,
+      "step": 50250
+    },
+    {
+      "epoch": 134.02666666666667,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029674923110645295,
+      "loss": 0.4837,
+      "step": 50260
+    },
+    {
+      "epoch": 134.05333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029674792911214,
+      "loss": 0.4895,
+      "step": 50270
+    },
+    {
+      "epoch": 134.08,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029674662685999996,
+      "loss": 0.4755,
+      "step": 50280
+    },
+    {
+      "epoch": 134.10666666666665,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002967453243500353,
+      "loss": 0.4715,
+      "step": 50290
+    },
+    {
+      "epoch": 134.13333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000296744021582248,
+      "loss": 0.4738,
+      "step": 50300
+    },
+    {
+      "epoch": 134.16,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002967427185566407,
+      "loss": 0.4776,
+      "step": 50310
+    },
+    {
+      "epoch": 134.18666666666667,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029674141527321544,
+      "loss": 0.4694,
+      "step": 50320
+    },
+    {
+      "epoch": 134.21333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002967401117319746,
+      "loss": 0.4645,
+      "step": 50330
+    },
+    {
+      "epoch": 134.24,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029673880793292044,
+      "loss": 0.4684,
+      "step": 50340
+    },
+    {
+      "epoch": 134.26666666666668,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029673750387605534,
+      "loss": 0.4731,
+      "step": 50350
+    },
+    {
+      "epoch": 134.29333333333332,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029673619956138144,
+      "loss": 0.467,
+      "step": 50360
+    },
+    {
+      "epoch": 134.32,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029673489498890113,
+      "loss": 0.4706,
+      "step": 50370
+    },
+    {
+      "epoch": 134.34666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002967335901586167,
+      "loss": 0.4784,
+      "step": 50380
+    },
+    {
+      "epoch": 134.37333333333333,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.00029673228507053046,
+      "loss": 0.4742,
+      "step": 50390
+    },
+    {
+      "epoch": 134.4,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029673097972464466,
+      "loss": 0.4707,
+      "step": 50400
+    },
+    {
+      "epoch": 134.42666666666668,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002967296741209616,
+      "loss": 0.4772,
+      "step": 50410
+    },
+    {
+      "epoch": 134.45333333333335,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029672836825948357,
+      "loss": 0.4792,
+      "step": 50420
+    },
+    {
+      "epoch": 134.48,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002967270621402129,
+      "loss": 0.4849,
+      "step": 50430
+    },
+    {
+      "epoch": 134.50666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029672575576315184,
+      "loss": 0.4752,
+      "step": 50440
+    },
+    {
+      "epoch": 134.53333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002967244491283027,
+      "loss": 0.4688,
+      "step": 50450
+    },
+    {
+      "epoch": 134.56,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029672314223566773,
+      "loss": 0.47,
+      "step": 50460
+    },
+    {
+      "epoch": 134.58666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002967218350852494,
+      "loss": 0.4659,
+      "step": 50470
+    },
+    {
+      "epoch": 134.61333333333334,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0002967205276770498,
+      "loss": 0.4511,
+      "step": 50480
+    },
+    {
+      "epoch": 134.64,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029671922001107124,
+      "loss": 0.457,
+      "step": 50490
+    },
+    {
+      "epoch": 134.66666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002967179120873162,
+      "loss": 0.472,
+      "step": 50500
+    },
+    {
+      "epoch": 134.69333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002967166039057868,
+      "loss": 0.4553,
+      "step": 50510
+    },
+    {
+      "epoch": 134.72,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029671529546648545,
+      "loss": 0.4721,
+      "step": 50520
+    },
+    {
+      "epoch": 134.74666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002967139867694144,
+      "loss": 0.4771,
+      "step": 50530
+    },
+    {
+      "epoch": 134.77333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029671267781457587,
+      "loss": 0.484,
+      "step": 50540
+    },
+    {
+      "epoch": 134.8,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002967113686019723,
+      "loss": 0.4713,
+      "step": 50550
+    },
+    {
+      "epoch": 134.82666666666665,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029671005913160597,
+      "loss": 0.4659,
+      "step": 50560
+    },
+    {
+      "epoch": 134.85333333333332,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029670874940347907,
+      "loss": 0.4745,
+      "step": 50570
+    },
+    {
+      "epoch": 134.88,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.000296707439417594,
+      "loss": 0.4623,
+      "step": 50580
+    },
+    {
+      "epoch": 134.90666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029670612917395304,
+      "loss": 0.4649,
+      "step": 50590
+    },
+    {
+      "epoch": 134.93333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002967048186725585,
+      "loss": 0.4675,
+      "step": 50600
+    },
+    {
+      "epoch": 134.96,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029670350791341266,
+      "loss": 0.4651,
+      "step": 50610
+    },
+    {
+      "epoch": 134.98666666666668,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029670219689651784,
+      "loss": 0.4785,
+      "step": 50620
+    },
+    {
+      "epoch": 135.0,
+      "eval_loss": 0.48060470819473267,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9735,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 0.1,
+      "step": 50625
+    },
+    {
+      "epoch": 135.01333333333332,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002967008856218763,
+      "loss": 0.4674,
+      "step": 50630
+    },
+    {
+      "epoch": 135.04,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002966995740894904,
+      "loss": 0.4918,
+      "step": 50640
+    },
+    {
+      "epoch": 135.06666666666666,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029669826229936243,
+      "loss": 0.4794,
+      "step": 50650
+    },
+    {
+      "epoch": 135.09333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029669695025149467,
+      "loss": 0.4739,
+      "step": 50660
+    },
+    {
+      "epoch": 135.12,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029669563794588945,
+      "loss": 0.471,
+      "step": 50670
+    },
+    {
+      "epoch": 135.14666666666668,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029669432538254905,
+      "loss": 0.4787,
+      "step": 50680
+    },
+    {
+      "epoch": 135.17333333333335,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029669301256147584,
+      "loss": 0.4702,
+      "step": 50690
+    },
+    {
+      "epoch": 135.2,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029669169948267206,
+      "loss": 0.4689,
+      "step": 50700
+    },
+    {
+      "epoch": 135.22666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029669038614614003,
+      "loss": 0.4662,
+      "step": 50710
+    },
+    {
+      "epoch": 135.25333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002966890725518821,
+      "loss": 0.4657,
+      "step": 50720
+    },
+    {
+      "epoch": 135.28,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002966877586999005,
+      "loss": 0.4801,
+      "step": 50730
+    },
+    {
+      "epoch": 135.30666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029668644459019755,
+      "loss": 0.4604,
+      "step": 50740
+    },
+    {
+      "epoch": 135.33333333333334,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002966851302227757,
+      "loss": 0.4755,
+      "step": 50750
+    },
+    {
+      "epoch": 135.36,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029668381559763707,
+      "loss": 0.4788,
+      "step": 50760
+    },
+    {
+      "epoch": 135.38666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002966825007147841,
+      "loss": 0.469,
+      "step": 50770
+    },
+    {
+      "epoch": 135.41333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.000296681185574219,
+      "loss": 0.4744,
+      "step": 50780
+    },
+    {
+      "epoch": 135.44,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002966798701759441,
+      "loss": 0.477,
+      "step": 50790
+    },
+    {
+      "epoch": 135.46666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029667855451996183,
+      "loss": 0.4861,
+      "step": 50800
+    },
+    {
+      "epoch": 135.49333333333334,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029667723860627435,
+      "loss": 0.4775,
+      "step": 50810
+    },
+    {
+      "epoch": 135.52,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029667592243488405,
+      "loss": 0.4721,
+      "step": 50820
+    },
+    {
+      "epoch": 135.54666666666665,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029667460600579326,
+      "loss": 0.4694,
+      "step": 50830
+    },
+    {
+      "epoch": 135.57333333333332,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002966732893190042,
+      "loss": 0.4681,
+      "step": 50840
+    },
+    {
+      "epoch": 135.6,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002966719723745193,
+      "loss": 0.4617,
+      "step": 50850
+    },
+    {
+      "epoch": 135.62666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029667065517234077,
+      "loss": 0.4491,
+      "step": 50860
+    },
+    {
+      "epoch": 135.65333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.000296669337712471,
+      "loss": 0.469,
+      "step": 50870
+    },
+    {
+      "epoch": 135.68,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002966680199949123,
+      "loss": 0.4603,
+      "step": 50880
+    },
+    {
+      "epoch": 135.70666666666668,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002966667020196669,
+      "loss": 0.4611,
+      "step": 50890
+    },
+    {
+      "epoch": 135.73333333333332,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029666538378673725,
+      "loss": 0.4793,
+      "step": 50900
+    },
+    {
+      "epoch": 135.76,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002966640652961255,
+      "loss": 0.481,
+      "step": 50910
+    },
+    {
+      "epoch": 135.78666666666666,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002966627465478342,
+      "loss": 0.4792,
+      "step": 50920
+    },
+    {
+      "epoch": 135.81333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002966614275418654,
+      "loss": 0.4643,
+      "step": 50930
+    },
+    {
+      "epoch": 135.84,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002966601082782216,
+      "loss": 0.4706,
+      "step": 50940
+    },
+    {
+      "epoch": 135.86666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029665878875690505,
+      "loss": 0.4698,
+      "step": 50950
+    },
+    {
+      "epoch": 135.89333333333335,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002966574689779181,
+      "loss": 0.4609,
+      "step": 50960
+    },
+    {
+      "epoch": 135.92,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.000296656148941263,
+      "loss": 0.4711,
+      "step": 50970
+    },
+    {
+      "epoch": 135.94666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029665482864694217,
+      "loss": 0.4637,
+      "step": 50980
+    },
+    {
+      "epoch": 135.97333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029665350809495784,
+      "loss": 0.4755,
+      "step": 50990
+    },
+    {
+      "epoch": 136.0,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029665218728531237,
+      "loss": 0.4643,
+      "step": 51000
+    },
+    {
+      "epoch": 136.0,
+      "eval_loss": 0.4795243740081787,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3714,
+      "eval_samples_per_second": 1.543,
+      "eval_steps_per_second": 0.096,
+      "step": 51000
+    },
+    {
+      "epoch": 136.02666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002966508662180081,
+      "loss": 0.4833,
+      "step": 51010
+    },
+    {
+      "epoch": 136.05333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002966495448930473,
+      "loss": 0.4893,
+      "step": 51020
+    },
+    {
+      "epoch": 136.08,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002966482233104323,
+      "loss": 0.4764,
+      "step": 51030
+    },
+    {
+      "epoch": 136.10666666666665,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.00029664690147016557,
+      "loss": 0.4718,
+      "step": 51040
+    },
+    {
+      "epoch": 136.13333333333333,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002966455793722492,
+      "loss": 0.4726,
+      "step": 51050
+    },
+    {
+      "epoch": 136.16,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029664425701668565,
+      "loss": 0.4775,
+      "step": 51060
+    },
+    {
+      "epoch": 136.18666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002966429344034772,
+      "loss": 0.4692,
+      "step": 51070
+    },
+    {
+      "epoch": 136.21333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002966416115326262,
+      "loss": 0.4637,
+      "step": 51080
+    },
+    {
+      "epoch": 136.24,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002966402884041349,
+      "loss": 0.4677,
+      "step": 51090
+    },
+    {
+      "epoch": 136.26666666666668,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002966389650180058,
+      "loss": 0.4721,
+      "step": 51100
+    },
+    {
+      "epoch": 136.29333333333332,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.000296637641374241,
+      "loss": 0.4667,
+      "step": 51110
+    },
+    {
+      "epoch": 136.32,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029663631747284296,
+      "loss": 0.4704,
+      "step": 51120
+    },
+    {
+      "epoch": 136.34666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029663499331381395,
+      "loss": 0.4782,
+      "step": 51130
+    },
+    {
+      "epoch": 136.37333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029663366889715645,
+      "loss": 0.4737,
+      "step": 51140
+    },
+    {
+      "epoch": 136.4,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029663234422287255,
+      "loss": 0.4703,
+      "step": 51150
+    },
+    {
+      "epoch": 136.42666666666668,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029663101929096477,
+      "loss": 0.4766,
+      "step": 51160
+    },
+    {
+      "epoch": 136.45333333333335,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029662969410143526,
+      "loss": 0.4793,
+      "step": 51170
+    },
+    {
+      "epoch": 136.48,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.0002966283686542865,
+      "loss": 0.4862,
+      "step": 51180
+    },
+    {
+      "epoch": 136.50666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002966270429495208,
+      "loss": 0.475,
+      "step": 51190
+    },
+    {
+      "epoch": 136.53333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029662571698714046,
+      "loss": 0.4683,
+      "step": 51200
+    },
+    {
+      "epoch": 136.56,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029662439076714775,
+      "loss": 0.4704,
+      "step": 51210
+    },
+    {
+      "epoch": 136.58666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002966230642895451,
+      "loss": 0.4661,
+      "step": 51220
+    },
+    {
+      "epoch": 136.61333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002966217375543348,
+      "loss": 0.4518,
+      "step": 51230
+    },
+    {
+      "epoch": 136.64,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0002966204105615191,
+      "loss": 0.4572,
+      "step": 51240
+    },
+    {
+      "epoch": 136.66666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029661908331110045,
+      "loss": 0.4721,
+      "step": 51250
+    },
+    {
+      "epoch": 136.69333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002966177558030812,
+      "loss": 0.4556,
+      "step": 51260
+    },
+    {
+      "epoch": 136.72,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002966164280374635,
+      "loss": 0.4725,
+      "step": 51270
+    },
+    {
+      "epoch": 136.74666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029661510001424994,
+      "loss": 0.4768,
+      "step": 51280
+    },
+    {
+      "epoch": 136.77333333333334,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029661377173344265,
+      "loss": 0.4839,
+      "step": 51290
+    },
+    {
+      "epoch": 136.8,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.000296612443195044,
+      "loss": 0.471,
+      "step": 51300
+    },
+    {
+      "epoch": 136.82666666666665,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029661111439905643,
+      "loss": 0.4659,
+      "step": 51310
+    },
+    {
+      "epoch": 136.85333333333332,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002966097853454822,
+      "loss": 0.4752,
+      "step": 51320
+    },
+    {
+      "epoch": 136.88,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029660845603432355,
+      "loss": 0.4628,
+      "step": 51330
+    },
+    {
+      "epoch": 136.90666666666667,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.000296607126465583,
+      "loss": 0.4644,
+      "step": 51340
+    },
+    {
+      "epoch": 136.93333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029660579663926283,
+      "loss": 0.4672,
+      "step": 51350
+    },
+    {
+      "epoch": 136.96,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029660446655536526,
+      "loss": 0.4656,
+      "step": 51360
+    },
+    {
+      "epoch": 136.98666666666668,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029660313621389275,
+      "loss": 0.4786,
+      "step": 51370
+    },
+    {
+      "epoch": 137.0,
+      "eval_loss": 0.48131263256073,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7597,
+      "eval_samples_per_second": 1.487,
+      "eval_steps_per_second": 0.093,
+      "step": 51375
+    },
+    {
+      "epoch": 137.01333333333332,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002966018056148476,
+      "loss": 0.4687,
+      "step": 51380
+    },
+    {
+      "epoch": 137.04,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002966004747582321,
+      "loss": 0.4916,
+      "step": 51390
+    },
+    {
+      "epoch": 137.06666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002965991436440487,
+      "loss": 0.4795,
+      "step": 51400
+    },
+    {
+      "epoch": 137.09333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002965978122722996,
+      "loss": 0.4739,
+      "step": 51410
+    },
+    {
+      "epoch": 137.12,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029659648064298726,
+      "loss": 0.4704,
+      "step": 51420
+    },
+    {
+      "epoch": 137.14666666666668,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029659514875611396,
+      "loss": 0.4798,
+      "step": 51430
+    },
+    {
+      "epoch": 137.17333333333335,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029659381661168203,
+      "loss": 0.4698,
+      "step": 51440
+    },
+    {
+      "epoch": 137.2,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002965924842096939,
+      "loss": 0.468,
+      "step": 51450
+    },
+    {
+      "epoch": 137.22666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002965911515501518,
+      "loss": 0.4659,
+      "step": 51460
+    },
+    {
+      "epoch": 137.25333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029658981863305814,
+      "loss": 0.4653,
+      "step": 51470
+    },
+    {
+      "epoch": 137.28,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002965884854584152,
+      "loss": 0.4789,
+      "step": 51480
+    },
+    {
+      "epoch": 137.30666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029658715202622533,
+      "loss": 0.4608,
+      "step": 51490
+    },
+    {
+      "epoch": 137.33333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029658581833649094,
+      "loss": 0.4757,
+      "step": 51500
+    },
+    {
+      "epoch": 137.36,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029658448438921436,
+      "loss": 0.4782,
+      "step": 51510
+    },
+    {
+      "epoch": 137.38666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002965831501843979,
+      "loss": 0.4689,
+      "step": 51520
+    },
+    {
+      "epoch": 137.41333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002965818157220439,
+      "loss": 0.4748,
+      "step": 51530
+    },
+    {
+      "epoch": 137.44,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029658048100215473,
+      "loss": 0.4771,
+      "step": 51540
+    },
+    {
+      "epoch": 137.46666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002965791460247327,
+      "loss": 0.4872,
+      "step": 51550
+    },
+    {
+      "epoch": 137.49333333333334,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029657781078978017,
+      "loss": 0.478,
+      "step": 51560
+    },
+    {
+      "epoch": 137.52,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002965764752972995,
+      "loss": 0.4714,
+      "step": 51570
+    },
+    {
+      "epoch": 137.54666666666665,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002965751395472931,
+      "loss": 0.4692,
+      "step": 51580
+    },
+    {
+      "epoch": 137.57333333333332,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029657380353976316,
+      "loss": 0.4688,
+      "step": 51590
+    },
+    {
+      "epoch": 137.6,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029657246727471213,
+      "loss": 0.4619,
+      "step": 51600
+    },
+    {
+      "epoch": 137.62666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002965711307521424,
+      "loss": 0.4492,
+      "step": 51610
+    },
+    {
+      "epoch": 137.65333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002965697939720562,
+      "loss": 0.4685,
+      "step": 51620
+    },
+    {
+      "epoch": 137.68,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029656845693445597,
+      "loss": 0.4599,
+      "step": 51630
+    },
+    {
+      "epoch": 137.70666666666668,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029656711963934397,
+      "loss": 0.46,
+      "step": 51640
+    },
+    {
+      "epoch": 137.73333333333332,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029656578208672263,
+      "loss": 0.4792,
+      "step": 51650
+    },
+    {
+      "epoch": 137.76,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002965644442765943,
+      "loss": 0.4801,
+      "step": 51660
+    },
+    {
+      "epoch": 137.78666666666666,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029656310620896134,
+      "loss": 0.4792,
+      "step": 51670
+    },
+    {
+      "epoch": 137.81333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.000296561767883826,
+      "loss": 0.4638,
+      "step": 51680
+    },
+    {
+      "epoch": 137.84,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029656042930119075,
+      "loss": 0.4706,
+      "step": 51690
+    },
+    {
+      "epoch": 137.86666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029655909046105787,
+      "loss": 0.4694,
+      "step": 51700
+    },
+    {
+      "epoch": 137.89333333333335,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002965577513634297,
+      "loss": 0.4606,
+      "step": 51710
+    },
+    {
+      "epoch": 137.92,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002965564120083087,
+      "loss": 0.4704,
+      "step": 51720
+    },
+    {
+      "epoch": 137.94666666666666,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029655507239569707,
+      "loss": 0.4627,
+      "step": 51730
+    },
+    {
+      "epoch": 137.97333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029655373252559733,
+      "loss": 0.4747,
+      "step": 51740
+    },
+    {
+      "epoch": 138.0,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002965523923980117,
+      "loss": 0.4634,
+      "step": 51750
+    },
+    {
+      "epoch": 138.0,
+      "eval_loss": 0.47989213466644287,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2738,
+      "eval_samples_per_second": 1.557,
+      "eval_steps_per_second": 0.097,
+      "step": 51750
+    },
+    {
+      "epoch": 138.02666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002965510520129426,
+      "loss": 0.4841,
+      "step": 51760
+    },
+    {
+      "epoch": 138.05333333333334,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.00029654971137039234,
+      "loss": 0.4889,
+      "step": 51770
+    },
+    {
+      "epoch": 138.08,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002965483704703633,
+      "loss": 0.4767,
+      "step": 51780
+    },
+    {
+      "epoch": 138.10666666666665,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002965470293128579,
+      "loss": 0.4717,
+      "step": 51790
+    },
+    {
+      "epoch": 138.13333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029654568789787836,
+      "loss": 0.4731,
+      "step": 51800
+    },
+    {
+      "epoch": 138.16,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029654434622542717,
+      "loss": 0.4769,
+      "step": 51810
+    },
+    {
+      "epoch": 138.18666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002965430042955066,
+      "loss": 0.469,
+      "step": 51820
+    },
+    {
+      "epoch": 138.21333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029654166210811903,
+      "loss": 0.4644,
+      "step": 51830
+    },
+    {
+      "epoch": 138.24,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002965403196632668,
+      "loss": 0.4684,
+      "step": 51840
+    },
+    {
+      "epoch": 138.26666666666668,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029653897696095235,
+      "loss": 0.473,
+      "step": 51850
+    },
+    {
+      "epoch": 138.29333333333332,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.000296537634001178,
+      "loss": 0.466,
+      "step": 51860
+    },
+    {
+      "epoch": 138.32,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.000296536290783946,
+      "loss": 0.4706,
+      "step": 51870
+    },
+    {
+      "epoch": 138.34666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029653494730925886,
+      "loss": 0.4785,
+      "step": 51880
+    },
+    {
+      "epoch": 138.37333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029653360357711887,
+      "loss": 0.4745,
+      "step": 51890
+    },
+    {
+      "epoch": 138.4,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029653225958752835,
+      "loss": 0.471,
+      "step": 51900
+    },
+    {
+      "epoch": 138.42666666666668,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002965309153404898,
+      "loss": 0.4771,
+      "step": 51910
+    },
+    {
+      "epoch": 138.45333333333335,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002965295708360055,
+      "loss": 0.4787,
+      "step": 51920
+    },
+    {
+      "epoch": 138.48,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029652822607407776,
+      "loss": 0.486,
+      "step": 51930
+    },
+    {
+      "epoch": 138.50666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000296526881054709,
+      "loss": 0.4745,
+      "step": 51940
+    },
+    {
+      "epoch": 138.53333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029652553577790155,
+      "loss": 0.4689,
+      "step": 51950
+    },
+    {
+      "epoch": 138.56,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002965241902436579,
+      "loss": 0.4704,
+      "step": 51960
+    },
+    {
+      "epoch": 138.58666666666667,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002965228444519802,
+      "loss": 0.4655,
+      "step": 51970
+    },
+    {
+      "epoch": 138.61333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029652149840287095,
+      "loss": 0.4512,
+      "step": 51980
+    },
+    {
+      "epoch": 138.64,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002965201520963325,
+      "loss": 0.4568,
+      "step": 51990
+    },
+    {
+      "epoch": 138.66666666666666,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002965188055323672,
+      "loss": 0.4718,
+      "step": 52000
+    },
+    {
+      "epoch": 138.69333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029651745871097745,
+      "loss": 0.4547,
+      "step": 52010
+    },
+    {
+      "epoch": 138.72,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029651611163216557,
+      "loss": 0.4725,
+      "step": 52020
+    },
+    {
+      "epoch": 138.74666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029651476429593394,
+      "loss": 0.4767,
+      "step": 52030
+    },
+    {
+      "epoch": 138.77333333333334,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029651341670228497,
+      "loss": 0.4832,
+      "step": 52040
+    },
+    {
+      "epoch": 138.8,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029651206885122097,
+      "loss": 0.471,
+      "step": 52050
+    },
+    {
+      "epoch": 138.82666666666665,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002965107207427443,
+      "loss": 0.4659,
+      "step": 52060
+    },
+    {
+      "epoch": 138.85333333333332,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.00029650937237685735,
+      "loss": 0.4744,
+      "step": 52070
+    },
+    {
+      "epoch": 138.88,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029650802375356255,
+      "loss": 0.4626,
+      "step": 52080
+    },
+    {
+      "epoch": 138.90666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029650667487286217,
+      "loss": 0.465,
+      "step": 52090
+    },
+    {
+      "epoch": 138.93333333333334,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.00029650532573475863,
+      "loss": 0.467,
+      "step": 52100
+    },
+    {
+      "epoch": 138.96,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002965039763392543,
+      "loss": 0.4643,
+      "step": 52110
+    },
+    {
+      "epoch": 138.98666666666668,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002965026266863516,
+      "loss": 0.4791,
+      "step": 52120
+    },
+    {
+      "epoch": 139.0,
+      "eval_loss": 0.47840365767478943,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.325,
+      "eval_samples_per_second": 1.55,
+      "eval_steps_per_second": 0.097,
+      "step": 52125
+    },
+    {
+      "epoch": 139.01333333333332,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002965012767760528,
+      "loss": 0.4676,
+      "step": 52130
+    },
+    {
+      "epoch": 139.04,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002964999266083603,
+      "loss": 0.4924,
+      "step": 52140
+    },
+    {
+      "epoch": 139.06666666666666,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002964985761832765,
+      "loss": 0.4791,
+      "step": 52150
+    },
+    {
+      "epoch": 139.09333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029649722550080374,
+      "loss": 0.4739,
+      "step": 52160
+    },
+    {
+      "epoch": 139.12,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029649587456094447,
+      "loss": 0.4702,
+      "step": 52170
+    },
+    {
+      "epoch": 139.14666666666668,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000296494523363701,
+      "loss": 0.4791,
+      "step": 52180
+    },
+    {
+      "epoch": 139.17333333333335,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002964931719090757,
+      "loss": 0.4698,
+      "step": 52190
+    },
+    {
+      "epoch": 139.2,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.000296491820197071,
+      "loss": 0.4683,
+      "step": 52200
+    },
+    {
+      "epoch": 139.22666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002964904682276892,
+      "loss": 0.4656,
+      "step": 52210
+    },
+    {
+      "epoch": 139.25333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002964891160009327,
+      "loss": 0.4663,
+      "step": 52220
+    },
+    {
+      "epoch": 139.28,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0002964877635168039,
+      "loss": 0.48,
+      "step": 52230
+    },
+    {
+      "epoch": 139.30666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002964864107753051,
+      "loss": 0.4602,
+      "step": 52240
+    },
+    {
+      "epoch": 139.33333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002964850577764388,
+      "loss": 0.4762,
+      "step": 52250
+    },
+    {
+      "epoch": 139.36,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002964837045202073,
+      "loss": 0.4771,
+      "step": 52260
+    },
+    {
+      "epoch": 139.38666666666666,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029648235100661303,
+      "loss": 0.4695,
+      "step": 52270
+    },
+    {
+      "epoch": 139.41333333333333,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.00029648099723565826,
+      "loss": 0.4741,
+      "step": 52280
+    },
+    {
+      "epoch": 139.44,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.00029647964320734545,
+      "loss": 0.4775,
+      "step": 52290
+    },
+    {
+      "epoch": 139.46666666666667,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029647828892167703,
+      "loss": 0.4868,
+      "step": 52300
+    },
+    {
+      "epoch": 139.49333333333334,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.0002964769343786553,
+      "loss": 0.4776,
+      "step": 52310
+    },
+    {
+      "epoch": 139.52,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.00029647557957828265,
+      "loss": 0.4722,
+      "step": 52320
+    },
+    {
+      "epoch": 139.54666666666665,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029647422452056145,
+      "loss": 0.4691,
+      "step": 52330
+    },
+    {
+      "epoch": 139.57333333333332,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002964728692054941,
+      "loss": 0.4681,
+      "step": 52340
+    },
+    {
+      "epoch": 139.6,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.000296471513633083,
+      "loss": 0.4618,
+      "step": 52350
+    },
+    {
+      "epoch": 139.62666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002964701578033305,
+      "loss": 0.4491,
+      "step": 52360
+    },
+    {
+      "epoch": 139.65333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029646880171623895,
+      "loss": 0.4688,
+      "step": 52370
+    },
+    {
+      "epoch": 139.68,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029646744537181085,
+      "loss": 0.4603,
+      "step": 52380
+    },
+    {
+      "epoch": 139.70666666666668,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029646608877004844,
+      "loss": 0.4603,
+      "step": 52390
+    },
+    {
+      "epoch": 139.73333333333332,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029646473191095423,
+      "loss": 0.4796,
+      "step": 52400
+    },
+    {
+      "epoch": 139.76,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029646337479453055,
+      "loss": 0.4807,
+      "step": 52410
+    },
+    {
+      "epoch": 139.78666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002964620174207797,
+      "loss": 0.4792,
+      "step": 52420
+    },
+    {
+      "epoch": 139.81333333333333,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002964606597897042,
+      "loss": 0.4641,
+      "step": 52430
+    },
+    {
+      "epoch": 139.84,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.00029645930190130637,
+      "loss": 0.4705,
+      "step": 52440
+    },
+    {
+      "epoch": 139.86666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002964579437555886,
+      "loss": 0.4694,
+      "step": 52450
+    },
+    {
+      "epoch": 139.89333333333335,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029645658535255333,
+      "loss": 0.4606,
+      "step": 52460
+    },
+    {
+      "epoch": 139.92,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029645522669220287,
+      "loss": 0.4712,
+      "step": 52470
+    },
+    {
+      "epoch": 139.94666666666666,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029645386777453963,
+      "loss": 0.4637,
+      "step": 52480
+    },
+    {
+      "epoch": 139.97333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.000296452508599566,
+      "loss": 0.4746,
+      "step": 52490
+    },
+    {
+      "epoch": 140.0,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029645114916728436,
+      "loss": 0.4636,
+      "step": 52500
+    },
+    {
+      "epoch": 140.0,
+      "eval_loss": 0.4796655476093292,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.272,
+      "eval_samples_per_second": 1.558,
+      "eval_steps_per_second": 0.097,
+      "step": 52500
+    },
+    {
+      "epoch": 140.02666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002964497894776972,
+      "loss": 0.4833,
+      "step": 52510
+    },
+    {
+      "epoch": 140.05333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002964484295308067,
+      "loss": 0.4887,
+      "step": 52520
+    },
+    {
+      "epoch": 140.08,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029644706932661544,
+      "loss": 0.4763,
+      "step": 52530
+    },
+    {
+      "epoch": 140.10666666666665,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002964457088651257,
+      "loss": 0.4715,
+      "step": 52540
+    },
+    {
+      "epoch": 140.13333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002964443481463399,
+      "loss": 0.4732,
+      "step": 52550
+    },
+    {
+      "epoch": 140.16,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029644298717026047,
+      "loss": 0.4769,
+      "step": 52560
+    },
+    {
+      "epoch": 140.18666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029644162593688977,
+      "loss": 0.469,
+      "step": 52570
+    },
+    {
+      "epoch": 140.21333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002964402644462302,
+      "loss": 0.4644,
+      "step": 52580
+    },
+    {
+      "epoch": 140.24,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002964389026982841,
+      "loss": 0.4683,
+      "step": 52590
+    },
+    {
+      "epoch": 140.26666666666668,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029643754069305395,
+      "loss": 0.4718,
+      "step": 52600
+    },
+    {
+      "epoch": 140.29333333333332,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029643617843054206,
+      "loss": 0.4664,
+      "step": 52610
+    },
+    {
+      "epoch": 140.32,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002964348159107509,
+      "loss": 0.47,
+      "step": 52620
+    },
+    {
+      "epoch": 140.34666666666666,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029643345313368284,
+      "loss": 0.4782,
+      "step": 52630
+    },
+    {
+      "epoch": 140.37333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002964320900993402,
+      "loss": 0.4739,
+      "step": 52640
+    },
+    {
+      "epoch": 140.4,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029643072680772553,
+      "loss": 0.4709,
+      "step": 52650
+    },
+    {
+      "epoch": 140.42666666666668,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029642936325884105,
+      "loss": 0.4764,
+      "step": 52660
+    },
+    {
+      "epoch": 140.45333333333335,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029642799945268927,
+      "loss": 0.4789,
+      "step": 52670
+    },
+    {
+      "epoch": 140.48,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002964266353892725,
+      "loss": 0.4854,
+      "step": 52680
+    },
+    {
+      "epoch": 140.50666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029642527106859325,
+      "loss": 0.4751,
+      "step": 52690
+    },
+    {
+      "epoch": 140.53333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029642390649065384,
+      "loss": 0.4682,
+      "step": 52700
+    },
+    {
+      "epoch": 140.56,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029642254165545667,
+      "loss": 0.4695,
+      "step": 52710
+    },
+    {
+      "epoch": 140.58666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002964211765630042,
+      "loss": 0.4653,
+      "step": 52720
+    },
+    {
+      "epoch": 140.61333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002964198112132987,
+      "loss": 0.4515,
+      "step": 52730
+    },
+    {
+      "epoch": 140.64,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029641844560634274,
+      "loss": 0.4572,
+      "step": 52740
+    },
+    {
+      "epoch": 140.66666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029641707974213856,
+      "loss": 0.4722,
+      "step": 52750
+    },
+    {
+      "epoch": 140.69333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029641571362068865,
+      "loss": 0.4553,
+      "step": 52760
+    },
+    {
+      "epoch": 140.72,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002964143472419954,
+      "loss": 0.4722,
+      "step": 52770
+    },
+    {
+      "epoch": 140.74666666666667,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029641298060606116,
+      "loss": 0.4764,
+      "step": 52780
+    },
+    {
+      "epoch": 140.77333333333334,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.0002964116137128884,
+      "loss": 0.4843,
+      "step": 52790
+    },
+    {
+      "epoch": 140.8,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002964102465624795,
+      "loss": 0.4708,
+      "step": 52800
+    },
+    {
+      "epoch": 140.82666666666665,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029640887915483683,
+      "loss": 0.4655,
+      "step": 52810
+    },
+    {
+      "epoch": 140.85333333333332,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029640751148996285,
+      "loss": 0.4745,
+      "step": 52820
+    },
+    {
+      "epoch": 140.88,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002964061435678599,
+      "loss": 0.4619,
+      "step": 52830
+    },
+    {
+      "epoch": 140.90666666666667,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002964047753885304,
+      "loss": 0.4648,
+      "step": 52840
+    },
+    {
+      "epoch": 140.93333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002964034069519768,
+      "loss": 0.4664,
+      "step": 52850
+    },
+    {
+      "epoch": 140.96,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002964020382582015,
+      "loss": 0.4644,
+      "step": 52860
+    },
+    {
+      "epoch": 140.98666666666668,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002964006693072068,
+      "loss": 0.4791,
+      "step": 52870
+    },
+    {
+      "epoch": 141.0,
+      "eval_loss": 0.4809083044528961,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.875,
+      "eval_samples_per_second": 1.62,
+      "eval_steps_per_second": 0.101,
+      "step": 52875
+    },
+    {
+      "epoch": 141.01333333333332,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002963993000989952,
+      "loss": 0.4674,
+      "step": 52880
+    },
+    {
+      "epoch": 141.04,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029639793063356907,
+      "loss": 0.4921,
+      "step": 52890
+    },
+    {
+      "epoch": 141.06666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029639656091093086,
+      "loss": 0.4794,
+      "step": 52900
+    },
+    {
+      "epoch": 141.09333333333333,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0002963951909310829,
+      "loss": 0.4747,
+      "step": 52910
+    },
+    {
+      "epoch": 141.12,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029639382069402776,
+      "loss": 0.471,
+      "step": 52920
+    },
+    {
+      "epoch": 141.14666666666668,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029639245019976763,
+      "loss": 0.4793,
+      "step": 52930
+    },
+    {
+      "epoch": 141.17333333333335,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002963910794483051,
+      "loss": 0.4698,
+      "step": 52940
+    },
+    {
+      "epoch": 141.2,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029638970843964244,
+      "loss": 0.4675,
+      "step": 52950
+    },
+    {
+      "epoch": 141.22666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029638833717378214,
+      "loss": 0.466,
+      "step": 52960
+    },
+    {
+      "epoch": 141.25333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002963869656507266,
+      "loss": 0.4655,
+      "step": 52970
+    },
+    {
+      "epoch": 141.28,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002963855938704782,
+      "loss": 0.4792,
+      "step": 52980
+    },
+    {
+      "epoch": 141.30666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002963842218330393,
+      "loss": 0.4611,
+      "step": 52990
+    },
+    {
+      "epoch": 141.33333333333334,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029638284953841247,
+      "loss": 0.4756,
+      "step": 53000
+    },
+    {
+      "epoch": 141.36,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002963814769866,
+      "loss": 0.4774,
+      "step": 53010
+    },
+    {
+      "epoch": 141.38666666666666,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002963801041776044,
+      "loss": 0.4695,
+      "step": 53020
+    },
+    {
+      "epoch": 141.41333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002963787311114279,
+      "loss": 0.4743,
+      "step": 53030
+    },
+    {
+      "epoch": 141.44,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029637735778807305,
+      "loss": 0.4769,
+      "step": 53040
+    },
+    {
+      "epoch": 141.46666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029637598420754227,
+      "loss": 0.487,
+      "step": 53050
+    },
+    {
+      "epoch": 141.49333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002963746103698379,
+      "loss": 0.4776,
+      "step": 53060
+    },
+    {
+      "epoch": 141.52,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029637323627496243,
+      "loss": 0.4722,
+      "step": 53070
+    },
+    {
+      "epoch": 141.54666666666665,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002963718619229182,
+      "loss": 0.469,
+      "step": 53080
+    },
+    {
+      "epoch": 141.57333333333332,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002963704873137077,
+      "loss": 0.4677,
+      "step": 53090
+    },
+    {
+      "epoch": 141.6,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002963691124473333,
+      "loss": 0.4615,
+      "step": 53100
+    },
+    {
+      "epoch": 141.62666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002963677373237974,
+      "loss": 0.4487,
+      "step": 53110
+    },
+    {
+      "epoch": 141.65333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002963663619431025,
+      "loss": 0.4685,
+      "step": 53120
+    },
+    {
+      "epoch": 141.68,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.00029636498630525093,
+      "loss": 0.4596,
+      "step": 53130
+    },
+    {
+      "epoch": 141.70666666666668,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.0002963636104102451,
+      "loss": 0.4605,
+      "step": 53140
+    },
+    {
+      "epoch": 141.73333333333332,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002963622342580875,
+      "loss": 0.4791,
+      "step": 53150
+    },
+    {
+      "epoch": 141.76,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029636085784878047,
+      "loss": 0.4807,
+      "step": 53160
+    },
+    {
+      "epoch": 141.78666666666666,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.00029635948118232646,
+      "loss": 0.4792,
+      "step": 53170
+    },
+    {
+      "epoch": 141.81333333333333,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029635810425872794,
+      "loss": 0.4642,
+      "step": 53180
+    },
+    {
+      "epoch": 141.84,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029635672707798723,
+      "loss": 0.4703,
+      "step": 53190
+    },
+    {
+      "epoch": 141.86666666666667,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002963553496401069,
+      "loss": 0.47,
+      "step": 53200
+    },
+    {
+      "epoch": 141.89333333333335,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029635397194508915,
+      "loss": 0.4605,
+      "step": 53210
+    },
+    {
+      "epoch": 141.92,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029635259399293665,
+      "loss": 0.4711,
+      "step": 53220
+    },
+    {
+      "epoch": 141.94666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002963512157836516,
+      "loss": 0.4629,
+      "step": 53230
+    },
+    {
+      "epoch": 141.97333333333333,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0002963498373172365,
+      "loss": 0.4751,
+      "step": 53240
+    },
+    {
+      "epoch": 142.0,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002963484585936939,
+      "loss": 0.4645,
+      "step": 53250
+    },
+    {
+      "epoch": 142.0,
+      "eval_loss": 0.4798355996608734,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4414,
+      "eval_samples_per_second": 1.532,
+      "eval_steps_per_second": 0.096,
+      "step": 53250
+    },
+    {
+      "epoch": 142.02666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029634707961302605,
+      "loss": 0.4836,
+      "step": 53260
+    },
+    {
+      "epoch": 142.05333333333334,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002963457003752354,
+      "loss": 0.4895,
+      "step": 53270
+    },
+    {
+      "epoch": 142.08,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029634432088032446,
+      "loss": 0.476,
+      "step": 53280
+    },
+    {
+      "epoch": 142.10666666666665,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0002963429411282956,
+      "loss": 0.472,
+      "step": 53290
+    },
+    {
+      "epoch": 142.13333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002963415611191512,
+      "loss": 0.4728,
+      "step": 53300
+    },
+    {
+      "epoch": 142.16,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002963401808528938,
+      "loss": 0.4772,
+      "step": 53310
+    },
+    {
+      "epoch": 142.18666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002963388003295257,
+      "loss": 0.4693,
+      "step": 53320
+    },
+    {
+      "epoch": 142.21333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029633741954904934,
+      "loss": 0.4641,
+      "step": 53330
+    },
+    {
+      "epoch": 142.24,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029633603851146725,
+      "loss": 0.468,
+      "step": 53340
+    },
+    {
+      "epoch": 142.26666666666668,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002963346572167818,
+      "loss": 0.4727,
+      "step": 53350
+    },
+    {
+      "epoch": 142.29333333333332,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029633327566499536,
+      "loss": 0.4661,
+      "step": 53360
+    },
+    {
+      "epoch": 142.32,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029633189385611043,
+      "loss": 0.4704,
+      "step": 53370
+    },
+    {
+      "epoch": 142.34666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029633051179012946,
+      "loss": 0.4778,
+      "step": 53380
+    },
+    {
+      "epoch": 142.37333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029632912946705477,
+      "loss": 0.4738,
+      "step": 53390
+    },
+    {
+      "epoch": 142.4,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002963277468868889,
+      "loss": 0.4701,
+      "step": 53400
+    },
+    {
+      "epoch": 142.42666666666668,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002963263640496342,
+      "loss": 0.4769,
+      "step": 53410
+    },
+    {
+      "epoch": 142.45333333333335,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002963249809552931,
+      "loss": 0.4788,
+      "step": 53420
+    },
+    {
+      "epoch": 142.48,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002963235976038681,
+      "loss": 0.485,
+      "step": 53430
+    },
+    {
+      "epoch": 142.50666666666666,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002963222139953616,
+      "loss": 0.4746,
+      "step": 53440
+    },
+    {
+      "epoch": 142.53333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000296320830129776,
+      "loss": 0.4682,
+      "step": 53450
+    },
+    {
+      "epoch": 142.56,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029631944600711375,
+      "loss": 0.4695,
+      "step": 53460
+    },
+    {
+      "epoch": 142.58666666666667,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002963180616273773,
+      "loss": 0.4654,
+      "step": 53470
+    },
+    {
+      "epoch": 142.61333333333334,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029631667699056905,
+      "loss": 0.4513,
+      "step": 53480
+    },
+    {
+      "epoch": 142.64,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029631529209669146,
+      "loss": 0.457,
+      "step": 53490
+    },
+    {
+      "epoch": 142.66666666666666,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029631390694574694,
+      "loss": 0.472,
+      "step": 53500
+    },
+    {
+      "epoch": 142.69333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000296312521537738,
+      "loss": 0.4546,
+      "step": 53510
+    },
+    {
+      "epoch": 142.72,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002963111358726669,
+      "loss": 0.4723,
+      "step": 53520
+    },
+    {
+      "epoch": 142.74666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029630974995053627,
+      "loss": 0.4757,
+      "step": 53530
+    },
+    {
+      "epoch": 142.77333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002963083637713484,
+      "loss": 0.484,
+      "step": 53540
+    },
+    {
+      "epoch": 142.8,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002963069773351059,
+      "loss": 0.4705,
+      "step": 53550
+    },
+    {
+      "epoch": 142.82666666666665,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002963055906418109,
+      "loss": 0.4654,
+      "step": 53560
+    },
+    {
+      "epoch": 142.85333333333332,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002963042036914662,
+      "loss": 0.4743,
+      "step": 53570
+    },
+    {
+      "epoch": 142.88,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.000296302816484074,
+      "loss": 0.4629,
+      "step": 53580
+    },
+    {
+      "epoch": 142.90666666666667,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029630142901963683,
+      "loss": 0.4649,
+      "step": 53590
+    },
+    {
+      "epoch": 142.93333333333334,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029630004129815705,
+      "loss": 0.4671,
+      "step": 53600
+    },
+    {
+      "epoch": 142.96,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002962986533196372,
+      "loss": 0.465,
+      "step": 53610
+    },
+    {
+      "epoch": 142.98666666666668,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029629726508407963,
+      "loss": 0.4792,
+      "step": 53620
+    },
+    {
+      "epoch": 143.0,
+      "eval_loss": 0.4780339300632477,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9163,
+      "eval_samples_per_second": 1.614,
+      "eval_steps_per_second": 0.101,
+      "step": 53625
+    },
+    {
+      "epoch": 143.01333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002962958765914868,
+      "loss": 0.4674,
+      "step": 53630
+    },
+    {
+      "epoch": 143.04,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002962944878418612,
+      "loss": 0.4919,
+      "step": 53640
+    },
+    {
+      "epoch": 143.06666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002962930988352052,
+      "loss": 0.4789,
+      "step": 53650
+    },
+    {
+      "epoch": 143.09333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002962917095715213,
+      "loss": 0.4737,
+      "step": 53660
+    },
+    {
+      "epoch": 143.12,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002962903200508119,
+      "loss": 0.4704,
+      "step": 53670
+    },
+    {
+      "epoch": 143.14666666666668,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029628893027307946,
+      "loss": 0.4792,
+      "step": 53680
+    },
+    {
+      "epoch": 143.17333333333335,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029628754023832646,
+      "loss": 0.4696,
+      "step": 53690
+    },
+    {
+      "epoch": 143.2,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029628614994655527,
+      "loss": 0.4679,
+      "step": 53700
+    },
+    {
+      "epoch": 143.22666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029628475939776837,
+      "loss": 0.4653,
+      "step": 53710
+    },
+    {
+      "epoch": 143.25333333333333,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.00029628336859196814,
+      "loss": 0.4658,
+      "step": 53720
+    },
+    {
+      "epoch": 143.28,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029628197752915714,
+      "loss": 0.4797,
+      "step": 53730
+    },
+    {
+      "epoch": 143.30666666666667,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.00029628058620933775,
+      "loss": 0.4607,
+      "step": 53740
+    },
+    {
+      "epoch": 143.33333333333334,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0002962791946325124,
+      "loss": 0.4758,
+      "step": 53750
+    },
+    {
+      "epoch": 143.36,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002962778027986836,
+      "loss": 0.4777,
+      "step": 53760
+    },
+    {
+      "epoch": 143.38666666666666,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002962764107078537,
+      "loss": 0.4699,
+      "step": 53770
+    },
+    {
+      "epoch": 143.41333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029627501836002523,
+      "loss": 0.4746,
+      "step": 53780
+    },
+    {
+      "epoch": 143.44,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002962736257552006,
+      "loss": 0.4771,
+      "step": 53790
+    },
+    {
+      "epoch": 143.46666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002962722328933822,
+      "loss": 0.4868,
+      "step": 53800
+    },
+    {
+      "epoch": 143.49333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002962708397745726,
+      "loss": 0.4779,
+      "step": 53810
+    },
+    {
+      "epoch": 143.52,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002962694463987741,
+      "loss": 0.4726,
+      "step": 53820
+    },
+    {
+      "epoch": 143.54666666666665,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002962680527659893,
+      "loss": 0.4689,
+      "step": 53830
+    },
+    {
+      "epoch": 143.57333333333332,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002962666588762206,
+      "loss": 0.4682,
+      "step": 53840
+    },
+    {
+      "epoch": 143.6,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029626526472947037,
+      "loss": 0.4626,
+      "step": 53850
+    },
+    {
+      "epoch": 143.62666666666667,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00029626387032574117,
+      "loss": 0.4487,
+      "step": 53860
+    },
+    {
+      "epoch": 143.65333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029626247566503534,
+      "loss": 0.4684,
+      "step": 53870
+    },
+    {
+      "epoch": 143.68,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029626108074735545,
+      "loss": 0.46,
+      "step": 53880
+    },
+    {
+      "epoch": 143.70666666666668,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029625968557270386,
+      "loss": 0.4596,
+      "step": 53890
+    },
+    {
+      "epoch": 143.73333333333332,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.000296258290141083,
+      "loss": 0.4787,
+      "step": 53900
+    },
+    {
+      "epoch": 143.76,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002962568944524954,
+      "loss": 0.4809,
+      "step": 53910
+    },
+    {
+      "epoch": 143.78666666666666,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029625549850694356,
+      "loss": 0.4796,
+      "step": 53920
+    },
+    {
+      "epoch": 143.81333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029625410230442977,
+      "loss": 0.464,
+      "step": 53930
+    },
+    {
+      "epoch": 143.84,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029625270584495664,
+      "loss": 0.4704,
+      "step": 53940
+    },
+    {
+      "epoch": 143.86666666666667,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002962513091285265,
+      "loss": 0.4687,
+      "step": 53950
+    },
+    {
+      "epoch": 143.89333333333335,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029624991215514184,
+      "loss": 0.4605,
+      "step": 53960
+    },
+    {
+      "epoch": 143.92,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029624851492480515,
+      "loss": 0.4708,
+      "step": 53970
+    },
+    {
+      "epoch": 143.94666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002962471174375189,
+      "loss": 0.4638,
+      "step": 53980
+    },
+    {
+      "epoch": 143.97333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029624571969328543,
+      "loss": 0.4742,
+      "step": 53990
+    },
+    {
+      "epoch": 144.0,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002962443216921074,
+      "loss": 0.4636,
+      "step": 54000
+    },
+    {
+      "epoch": 144.0,
+      "eval_loss": 0.4806298017501831,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7466,
+      "eval_samples_per_second": 1.489,
+      "eval_steps_per_second": 0.093,
+      "step": 54000
+    },
+    {
+      "epoch": 144.02666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002962429234339871,
+      "loss": 0.4834,
+      "step": 54010
+    },
+    {
+      "epoch": 144.05333333333334,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029624152491892694,
+      "loss": 0.4897,
+      "step": 54020
+    },
+    {
+      "epoch": 144.08,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029624012614692957,
+      "loss": 0.4766,
+      "step": 54030
+    },
+    {
+      "epoch": 144.10666666666665,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002962387271179973,
+      "loss": 0.4718,
+      "step": 54040
+    },
+    {
+      "epoch": 144.13333333333333,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002962373278321326,
+      "loss": 0.4734,
+      "step": 54050
+    },
+    {
+      "epoch": 144.16,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029623592828933803,
+      "loss": 0.4773,
+      "step": 54060
+    },
+    {
+      "epoch": 144.18666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029623452848961593,
+      "loss": 0.4699,
+      "step": 54070
+    },
+    {
+      "epoch": 144.21333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029623312843296887,
+      "loss": 0.4645,
+      "step": 54080
+    },
+    {
+      "epoch": 144.24,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002962317281193992,
+      "loss": 0.4681,
+      "step": 54090
+    },
+    {
+      "epoch": 144.26666666666668,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029623032754890947,
+      "loss": 0.4728,
+      "step": 54100
+    },
+    {
+      "epoch": 144.29333333333332,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.00029622892672150205,
+      "loss": 0.4667,
+      "step": 54110
+    },
+    {
+      "epoch": 144.32,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029622752563717943,
+      "loss": 0.4703,
+      "step": 54120
+    },
+    {
+      "epoch": 144.34666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029622612429594416,
+      "loss": 0.4781,
+      "step": 54130
+    },
+    {
+      "epoch": 144.37333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002962247226977986,
+      "loss": 0.4743,
+      "step": 54140
+    },
+    {
+      "epoch": 144.4,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029622332084274525,
+      "loss": 0.4711,
+      "step": 54150
+    },
+    {
+      "epoch": 144.42666666666668,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002962219187307866,
+      "loss": 0.4769,
+      "step": 54160
+    },
+    {
+      "epoch": 144.45333333333335,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002962205163619251,
+      "loss": 0.4793,
+      "step": 54170
+    },
+    {
+      "epoch": 144.48,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029621911373616314,
+      "loss": 0.4857,
+      "step": 54180
+    },
+    {
+      "epoch": 144.50666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002962177108535032,
+      "loss": 0.475,
+      "step": 54190
+    },
+    {
+      "epoch": 144.53333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002962163077139479,
+      "loss": 0.4688,
+      "step": 54200
+    },
+    {
+      "epoch": 144.56,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002962149043174995,
+      "loss": 0.4697,
+      "step": 54210
+    },
+    {
+      "epoch": 144.58666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002962135006641606,
+      "loss": 0.4656,
+      "step": 54220
+    },
+    {
+      "epoch": 144.61333333333334,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029621209675393366,
+      "loss": 0.4519,
+      "step": 54230
+    },
+    {
+      "epoch": 144.64,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029621069258682105,
+      "loss": 0.4568,
+      "step": 54240
+    },
+    {
+      "epoch": 144.66666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029620928816282533,
+      "loss": 0.4718,
+      "step": 54250
+    },
+    {
+      "epoch": 144.69333333333333,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002962078834819489,
+      "loss": 0.4548,
+      "step": 54260
+    },
+    {
+      "epoch": 144.72,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002962064785441943,
+      "loss": 0.472,
+      "step": 54270
+    },
+    {
+      "epoch": 144.74666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029620507334956396,
+      "loss": 0.4762,
+      "step": 54280
+    },
+    {
+      "epoch": 144.77333333333334,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029620366789806035,
+      "loss": 0.4842,
+      "step": 54290
+    },
+    {
+      "epoch": 144.8,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002962022621896859,
+      "loss": 0.4709,
+      "step": 54300
+    },
+    {
+      "epoch": 144.82666666666665,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002962008562244431,
+      "loss": 0.4649,
+      "step": 54310
+    },
+    {
+      "epoch": 144.85333333333332,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002961994500023345,
+      "loss": 0.4743,
+      "step": 54320
+    },
+    {
+      "epoch": 144.88,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029619804352336255,
+      "loss": 0.4627,
+      "step": 54330
+    },
+    {
+      "epoch": 144.90666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002961966367875296,
+      "loss": 0.4651,
+      "step": 54340
+    },
+    {
+      "epoch": 144.93333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029619522979483823,
+      "loss": 0.4662,
+      "step": 54350
+    },
+    {
+      "epoch": 144.96,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002961938225452909,
+      "loss": 0.4642,
+      "step": 54360
+    },
+    {
+      "epoch": 144.98666666666668,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029619241503889006,
+      "loss": 0.4787,
+      "step": 54370
+    },
+    {
+      "epoch": 145.0,
+      "eval_loss": 0.4806334376335144,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.935,
+      "eval_samples_per_second": 1.463,
+      "eval_steps_per_second": 0.091,
+      "step": 54375
+    },
+    {
+      "epoch": 145.01333333333332,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002961910072756382,
+      "loss": 0.4674,
+      "step": 54380
+    },
+    {
+      "epoch": 145.04,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002961895992555378,
+      "loss": 0.4921,
+      "step": 54390
+    },
+    {
+      "epoch": 145.06666666666666,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.00029618819097859125,
+      "loss": 0.4797,
+      "step": 54400
+    },
+    {
+      "epoch": 145.09333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002961867824448011,
+      "loss": 0.4738,
+      "step": 54410
+    },
+    {
+      "epoch": 145.12,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.00029618537365416986,
+      "loss": 0.4707,
+      "step": 54420
+    },
+    {
+      "epoch": 145.14666666666668,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029618396460669996,
+      "loss": 0.4792,
+      "step": 54430
+    },
+    {
+      "epoch": 145.17333333333335,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029618255530239383,
+      "loss": 0.4699,
+      "step": 54440
+    },
+    {
+      "epoch": 145.2,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029618114574125404,
+      "loss": 0.4681,
+      "step": 54450
+    },
+    {
+      "epoch": 145.22666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.000296179735923283,
+      "loss": 0.4654,
+      "step": 54460
+    },
+    {
+      "epoch": 145.25333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002961783258484832,
+      "loss": 0.4662,
+      "step": 54470
+    },
+    {
+      "epoch": 145.28,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.00029617691551685714,
+      "loss": 0.4791,
+      "step": 54480
+    },
+    {
+      "epoch": 145.30666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002961755049284073,
+      "loss": 0.4603,
+      "step": 54490
+    },
+    {
+      "epoch": 145.33333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002961740940831361,
+      "loss": 0.4757,
+      "step": 54500
+    },
+    {
+      "epoch": 145.36,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029617268298104605,
+      "loss": 0.4782,
+      "step": 54510
+    },
+    {
+      "epoch": 145.38666666666666,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002961712716221397,
+      "loss": 0.4692,
+      "step": 54520
+    },
+    {
+      "epoch": 145.41333333333333,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002961698600064194,
+      "loss": 0.4741,
+      "step": 54530
+    },
+    {
+      "epoch": 145.44,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002961684481338878,
+      "loss": 0.4767,
+      "step": 54540
+    },
+    {
+      "epoch": 145.46666666666667,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002961670360045472,
+      "loss": 0.4865,
+      "step": 54550
+    },
+    {
+      "epoch": 145.49333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002961656236184001,
+      "loss": 0.4771,
+      "step": 54560
+    },
+    {
+      "epoch": 145.52,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029616421097544915,
+      "loss": 0.4718,
+      "step": 54570
+    },
+    {
+      "epoch": 145.54666666666665,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029616279807569666,
+      "loss": 0.4694,
+      "step": 54580
+    },
+    {
+      "epoch": 145.57333333333332,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.00029616138491914514,
+      "loss": 0.4682,
+      "step": 54590
+    },
+    {
+      "epoch": 145.6,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002961599715057972,
+      "loss": 0.4621,
+      "step": 54600
+    },
+    {
+      "epoch": 145.62666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002961585578356552,
+      "loss": 0.4486,
+      "step": 54610
+    },
+    {
+      "epoch": 145.65333333333334,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029615714390872165,
+      "loss": 0.4683,
+      "step": 54620
+    },
+    {
+      "epoch": 145.68,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029615572972499905,
+      "loss": 0.4594,
+      "step": 54630
+    },
+    {
+      "epoch": 145.70666666666668,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002961543152844899,
+      "loss": 0.4596,
+      "step": 54640
+    },
+    {
+      "epoch": 145.73333333333332,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029615290058719655,
+      "loss": 0.4798,
+      "step": 54650
+    },
+    {
+      "epoch": 145.76,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002961514856331217,
+      "loss": 0.48,
+      "step": 54660
+    },
+    {
+      "epoch": 145.78666666666666,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002961500704222677,
+      "loss": 0.4795,
+      "step": 54670
+    },
+    {
+      "epoch": 145.81333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029614865495463706,
+      "loss": 0.4644,
+      "step": 54680
+    },
+    {
+      "epoch": 145.84,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029614723923023227,
+      "loss": 0.47,
+      "step": 54690
+    },
+    {
+      "epoch": 145.86666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029614582324905583,
+      "loss": 0.4696,
+      "step": 54700
+    },
+    {
+      "epoch": 145.89333333333335,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029614440701111023,
+      "loss": 0.46,
+      "step": 54710
+    },
+    {
+      "epoch": 145.92,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002961429905163979,
+      "loss": 0.4713,
+      "step": 54720
+    },
+    {
+      "epoch": 145.94666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029614157376492146,
+      "loss": 0.4628,
+      "step": 54730
+    },
+    {
+      "epoch": 145.97333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029614015675668323,
+      "loss": 0.4747,
+      "step": 54740
+    },
+    {
+      "epoch": 146.0,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002961387394916858,
+      "loss": 0.4641,
+      "step": 54750
+    },
+    {
+      "epoch": 146.0,
+      "eval_loss": 0.47998303174972534,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.6278,
+      "eval_samples_per_second": 1.662,
+      "eval_steps_per_second": 0.104,
+      "step": 54750
+    },
+    {
+      "epoch": 146.02666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002961373219699317,
+      "loss": 0.4833,
+      "step": 54760
+    },
+    {
+      "epoch": 146.05333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002961359041914233,
+      "loss": 0.4892,
+      "step": 54770
+    },
+    {
+      "epoch": 146.08,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002961344861561632,
+      "loss": 0.4762,
+      "step": 54780
+    },
+    {
+      "epoch": 146.10666666666665,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002961330678641539,
+      "loss": 0.4708,
+      "step": 54790
+    },
+    {
+      "epoch": 146.13333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002961316493153977,
+      "loss": 0.4731,
+      "step": 54800
+    },
+    {
+      "epoch": 146.16,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029613023050989733,
+      "loss": 0.4775,
+      "step": 54810
+    },
+    {
+      "epoch": 146.18666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029612881144765516,
+      "loss": 0.4692,
+      "step": 54820
+    },
+    {
+      "epoch": 146.21333333333334,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0002961273921286737,
+      "loss": 0.464,
+      "step": 54830
+    },
+    {
+      "epoch": 146.24,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002961259725529555,
+      "loss": 0.4674,
+      "step": 54840
+    },
+    {
+      "epoch": 146.26666666666668,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002961245527205029,
+      "loss": 0.4721,
+      "step": 54850
+    },
+    {
+      "epoch": 146.29333333333332,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002961231326313186,
+      "loss": 0.4666,
+      "step": 54860
+    },
+    {
+      "epoch": 146.32,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002961217122854049,
+      "loss": 0.4697,
+      "step": 54870
+    },
+    {
+      "epoch": 146.34666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029612029168276445,
+      "loss": 0.4776,
+      "step": 54880
+    },
+    {
+      "epoch": 146.37333333333333,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002961188708233997,
+      "loss": 0.4741,
+      "step": 54890
+    },
+    {
+      "epoch": 146.4,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029611744970731315,
+      "loss": 0.47,
+      "step": 54900
+    },
+    {
+      "epoch": 146.42666666666668,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029611602833450723,
+      "loss": 0.4773,
+      "step": 54910
+    },
+    {
+      "epoch": 146.45333333333335,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029611460670498453,
+      "loss": 0.479,
+      "step": 54920
+    },
+    {
+      "epoch": 146.48,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029611318481874747,
+      "loss": 0.4848,
+      "step": 54930
+    },
+    {
+      "epoch": 146.50666666666666,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002961117626757986,
+      "loss": 0.4746,
+      "step": 54940
+    },
+    {
+      "epoch": 146.53333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002961103402761404,
+      "loss": 0.4685,
+      "step": 54950
+    },
+    {
+      "epoch": 146.56,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029610891761977536,
+      "loss": 0.4695,
+      "step": 54960
+    },
+    {
+      "epoch": 146.58666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.000296107494706706,
+      "loss": 0.4652,
+      "step": 54970
+    },
+    {
+      "epoch": 146.61333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029610607153693477,
+      "loss": 0.4521,
+      "step": 54980
+    },
+    {
+      "epoch": 146.64,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.00029610464811046427,
+      "loss": 0.4565,
+      "step": 54990
+    },
+    {
+      "epoch": 146.66666666666666,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002961032244272969,
+      "loss": 0.4725,
+      "step": 55000
+    },
+    {
+      "epoch": 146.69333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029610180048743524,
+      "loss": 0.4555,
+      "step": 55010
+    },
+    {
+      "epoch": 146.72,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029610037629088174,
+      "loss": 0.4717,
+      "step": 55020
+    },
+    {
+      "epoch": 146.74666666666667,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002960989518376389,
+      "loss": 0.4765,
+      "step": 55030
+    },
+    {
+      "epoch": 146.77333333333334,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0002960975271277092,
+      "loss": 0.4843,
+      "step": 55040
+    },
+    {
+      "epoch": 146.8,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029609610216109526,
+      "loss": 0.471,
+      "step": 55050
+    },
+    {
+      "epoch": 146.82666666666665,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002960946769377995,
+      "loss": 0.4655,
+      "step": 55060
+    },
+    {
+      "epoch": 146.85333333333332,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002960932514578244,
+      "loss": 0.4744,
+      "step": 55070
+    },
+    {
+      "epoch": 146.88,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002960918257211725,
+      "loss": 0.4625,
+      "step": 55080
+    },
+    {
+      "epoch": 146.90666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002960903997278463,
+      "loss": 0.465,
+      "step": 55090
+    },
+    {
+      "epoch": 146.93333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002960889734778483,
+      "loss": 0.4671,
+      "step": 55100
+    },
+    {
+      "epoch": 146.96,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.000296087546971181,
+      "loss": 0.4649,
+      "step": 55110
+    },
+    {
+      "epoch": 146.98666666666668,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0002960861202078469,
+      "loss": 0.4779,
+      "step": 55120
+    },
+    {
+      "epoch": 147.0,
+      "eval_loss": 0.4791259765625,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 12.1587,
+      "eval_samples_per_second": 1.316,
+      "eval_steps_per_second": 0.082,
+      "step": 55125
+    },
+    {
+      "epoch": 147.01333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002960846931878486,
+      "loss": 0.4683,
+      "step": 55130
+    },
+    {
+      "epoch": 147.04,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002960832659111885,
+      "loss": 0.4919,
+      "step": 55140
+    },
+    {
+      "epoch": 147.06666666666666,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002960818383778691,
+      "loss": 0.4797,
+      "step": 55150
+    },
+    {
+      "epoch": 147.09333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029608041058789295,
+      "loss": 0.4741,
+      "step": 55160
+    },
+    {
+      "epoch": 147.12,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002960789825412626,
+      "loss": 0.4707,
+      "step": 55170
+    },
+    {
+      "epoch": 147.14666666666668,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002960775542379805,
+      "loss": 0.4788,
+      "step": 55180
+    },
+    {
+      "epoch": 147.17333333333335,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002960761256780492,
+      "loss": 0.4701,
+      "step": 55190
+    },
+    {
+      "epoch": 147.2,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002960746968614711,
+      "loss": 0.4677,
+      "step": 55200
+    },
+    {
+      "epoch": 147.22666666666666,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002960732677882488,
+      "loss": 0.466,
+      "step": 55210
+    },
+    {
+      "epoch": 147.25333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002960718384583849,
+      "loss": 0.4659,
+      "step": 55220
+    },
+    {
+      "epoch": 147.28,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002960704088718817,
+      "loss": 0.4787,
+      "step": 55230
+    },
+    {
+      "epoch": 147.30666666666667,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029606897902874195,
+      "loss": 0.4604,
+      "step": 55240
+    },
+    {
+      "epoch": 147.33333333333334,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029606754892896794,
+      "loss": 0.4756,
+      "step": 55250
+    },
+    {
+      "epoch": 147.36,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002960661185725623,
+      "loss": 0.4776,
+      "step": 55260
+    },
+    {
+      "epoch": 147.38666666666666,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002960646879595276,
+      "loss": 0.4692,
+      "step": 55270
+    },
+    {
+      "epoch": 147.41333333333333,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0002960632570898662,
+      "loss": 0.474,
+      "step": 55280
+    },
+    {
+      "epoch": 147.44,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002960618259635807,
+      "loss": 0.4773,
+      "step": 55290
+    },
+    {
+      "epoch": 147.46666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002960603945806736,
+      "loss": 0.4863,
+      "step": 55300
+    },
+    {
+      "epoch": 147.49333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002960589629411475,
+      "loss": 0.4775,
+      "step": 55310
+    },
+    {
+      "epoch": 147.52,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029605753104500475,
+      "loss": 0.4723,
+      "step": 55320
+    },
+    {
+      "epoch": 147.54666666666665,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029605609889224796,
+      "loss": 0.4683,
+      "step": 55330
+    },
+    {
+      "epoch": 147.57333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002960546664828797,
+      "loss": 0.4674,
+      "step": 55340
+    },
+    {
+      "epoch": 147.6,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0002960532338169024,
+      "loss": 0.461,
+      "step": 55350
+    },
+    {
+      "epoch": 147.62666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029605180089431856,
+      "loss": 0.4488,
+      "step": 55360
+    },
+    {
+      "epoch": 147.65333333333334,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029605036771513076,
+      "loss": 0.4688,
+      "step": 55370
+    },
+    {
+      "epoch": 147.68,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029604893427934154,
+      "loss": 0.4596,
+      "step": 55380
+    },
+    {
+      "epoch": 147.70666666666668,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029604750058695337,
+      "loss": 0.4607,
+      "step": 55390
+    },
+    {
+      "epoch": 147.73333333333332,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029604606663796875,
+      "loss": 0.4796,
+      "step": 55400
+    },
+    {
+      "epoch": 147.76,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002960446324323902,
+      "loss": 0.4801,
+      "step": 55410
+    },
+    {
+      "epoch": 147.78666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002960431979702203,
+      "loss": 0.4793,
+      "step": 55420
+    },
+    {
+      "epoch": 147.81333333333333,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002960417632514615,
+      "loss": 0.4646,
+      "step": 55430
+    },
+    {
+      "epoch": 147.84,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002960403282761164,
+      "loss": 0.4702,
+      "step": 55440
+    },
+    {
+      "epoch": 147.86666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029603889304418747,
+      "loss": 0.4688,
+      "step": 55450
+    },
+    {
+      "epoch": 147.89333333333335,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029603745755567724,
+      "loss": 0.4601,
+      "step": 55460
+    },
+    {
+      "epoch": 147.92,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029603602181058823,
+      "loss": 0.4701,
+      "step": 55470
+    },
+    {
+      "epoch": 147.94666666666666,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029603458580892294,
+      "loss": 0.4625,
+      "step": 55480
+    },
+    {
+      "epoch": 147.97333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002960331495506839,
+      "loss": 0.4747,
+      "step": 55490
+    },
+    {
+      "epoch": 148.0,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002960317130358737,
+      "loss": 0.4637,
+      "step": 55500
+    },
+    {
+      "epoch": 148.0,
+      "eval_loss": 0.4773246943950653,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3415,
+      "eval_samples_per_second": 1.547,
+      "eval_steps_per_second": 0.097,
+      "step": 55500
+    },
+    {
+      "epoch": 148.02666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002960302762644948,
+      "loss": 0.4839,
+      "step": 55510
+    },
+    {
+      "epoch": 148.05333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002960288392365497,
+      "loss": 0.4889,
+      "step": 55520
+    },
+    {
+      "epoch": 148.08,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.000296027401952041,
+      "loss": 0.4762,
+      "step": 55530
+    },
+    {
+      "epoch": 148.10666666666665,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002960259644109712,
+      "loss": 0.4714,
+      "step": 55540
+    },
+    {
+      "epoch": 148.13333333333333,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0002960245266133428,
+      "loss": 0.4734,
+      "step": 55550
+    },
+    {
+      "epoch": 148.16,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029602308855915835,
+      "loss": 0.4773,
+      "step": 55560
+    },
+    {
+      "epoch": 148.18666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002960216502484203,
+      "loss": 0.4693,
+      "step": 55570
+    },
+    {
+      "epoch": 148.21333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002960202116811313,
+      "loss": 0.4633,
+      "step": 55580
+    },
+    {
+      "epoch": 148.24,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029601877285729384,
+      "loss": 0.4677,
+      "step": 55590
+    },
+    {
+      "epoch": 148.26666666666668,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029601733377691044,
+      "loss": 0.4725,
+      "step": 55600
+    },
+    {
+      "epoch": 148.29333333333332,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029601589443998356,
+      "loss": 0.4664,
+      "step": 55610
+    },
+    {
+      "epoch": 148.32,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002960144548465158,
+      "loss": 0.47,
+      "step": 55620
+    },
+    {
+      "epoch": 148.34666666666666,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002960130149965097,
+      "loss": 0.4777,
+      "step": 55630
+    },
+    {
+      "epoch": 148.37333333333333,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.00029601157488996777,
+      "loss": 0.4738,
+      "step": 55640
+    },
+    {
+      "epoch": 148.4,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002960101345268925,
+      "loss": 0.4714,
+      "step": 55650
+    },
+    {
+      "epoch": 148.42666666666668,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029600869390728645,
+      "loss": 0.476,
+      "step": 55660
+    },
+    {
+      "epoch": 148.45333333333335,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029600725303115217,
+      "loss": 0.4789,
+      "step": 55670
+    },
+    {
+      "epoch": 148.48,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029600581189849224,
+      "loss": 0.4845,
+      "step": 55680
+    },
+    {
+      "epoch": 148.50666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.000296004370509309,
+      "loss": 0.4755,
+      "step": 55690
+    },
+    {
+      "epoch": 148.53333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002960029288636052,
+      "loss": 0.4682,
+      "step": 55700
+    },
+    {
+      "epoch": 148.56,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002960014869613833,
+      "loss": 0.4695,
+      "step": 55710
+    },
+    {
+      "epoch": 148.58666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002960000448026458,
+      "loss": 0.4649,
+      "step": 55720
+    },
+    {
+      "epoch": 148.61333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029599860238739524,
+      "loss": 0.4518,
+      "step": 55730
+    },
+    {
+      "epoch": 148.64,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029599715971563416,
+      "loss": 0.4569,
+      "step": 55740
+    },
+    {
+      "epoch": 148.66666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002959957167873651,
+      "loss": 0.4717,
+      "step": 55750
+    },
+    {
+      "epoch": 148.69333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002959942736025906,
+      "loss": 0.4557,
+      "step": 55760
+    },
+    {
+      "epoch": 148.72,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002959928301613132,
+      "loss": 0.4717,
+      "step": 55770
+    },
+    {
+      "epoch": 148.74666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029599138646353544,
+      "loss": 0.4761,
+      "step": 55780
+    },
+    {
+      "epoch": 148.77333333333334,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029598994250925985,
+      "loss": 0.484,
+      "step": 55790
+    },
+    {
+      "epoch": 148.8,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029598849829848896,
+      "loss": 0.4708,
+      "step": 55800
+    },
+    {
+      "epoch": 148.82666666666665,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002959870538312253,
+      "loss": 0.4648,
+      "step": 55810
+    },
+    {
+      "epoch": 148.85333333333332,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.0002959856091074714,
+      "loss": 0.4742,
+      "step": 55820
+    },
+    {
+      "epoch": 148.88,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029598416412722977,
+      "loss": 0.4623,
+      "step": 55830
+    },
+    {
+      "epoch": 148.90666666666667,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002959827188905031,
+      "loss": 0.4647,
+      "step": 55840
+    },
+    {
+      "epoch": 148.93333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029598127339729375,
+      "loss": 0.4666,
+      "step": 55850
+    },
+    {
+      "epoch": 148.96,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002959798276476044,
+      "loss": 0.4639,
+      "step": 55860
+    },
+    {
+      "epoch": 148.98666666666668,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029597838164143744,
+      "loss": 0.4785,
+      "step": 55870
+    },
+    {
+      "epoch": 149.0,
+      "eval_loss": 0.47737348079681396,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2614,
+      "eval_samples_per_second": 1.559,
+      "eval_steps_per_second": 0.097,
+      "step": 55875
+    },
+    {
+      "epoch": 149.01333333333332,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029597693537879556,
+      "loss": 0.4675,
+      "step": 55880
+    },
+    {
+      "epoch": 149.04,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029597548885968117,
+      "loss": 0.492,
+      "step": 55890
+    },
+    {
+      "epoch": 149.06666666666666,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002959740420840969,
+      "loss": 0.4798,
+      "step": 55900
+    },
+    {
+      "epoch": 149.09333333333333,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002959725950520453,
+      "loss": 0.4738,
+      "step": 55910
+    },
+    {
+      "epoch": 149.12,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002959711477635289,
+      "loss": 0.4702,
+      "step": 55920
+    },
+    {
+      "epoch": 149.14666666666668,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002959697002185502,
+      "loss": 0.479,
+      "step": 55930
+    },
+    {
+      "epoch": 149.17333333333335,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002959682524171117,
+      "loss": 0.4693,
+      "step": 55940
+    },
+    {
+      "epoch": 149.2,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002959668043592161,
+      "loss": 0.4684,
+      "step": 55950
+    },
+    {
+      "epoch": 149.22666666666666,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002959653560448658,
+      "loss": 0.4652,
+      "step": 55960
+    },
+    {
+      "epoch": 149.25333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002959639074740634,
+      "loss": 0.4654,
+      "step": 55970
+    },
+    {
+      "epoch": 149.28,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002959624586468115,
+      "loss": 0.4793,
+      "step": 55980
+    },
+    {
+      "epoch": 149.30666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029596100956311253,
+      "loss": 0.4602,
+      "step": 55990
+    },
+    {
+      "epoch": 149.33333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002959595602229691,
+      "loss": 0.4766,
+      "step": 56000
+    },
+    {
+      "epoch": 149.36,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002959581106263838,
+      "loss": 0.4777,
+      "step": 56010
+    },
+    {
+      "epoch": 149.38666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002959566607733591,
+      "loss": 0.4695,
+      "step": 56020
+    },
+    {
+      "epoch": 149.41333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002959552106638975,
+      "loss": 0.4742,
+      "step": 56030
+    },
+    {
+      "epoch": 149.44,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029595376029800175,
+      "loss": 0.4764,
+      "step": 56040
+    },
+    {
+      "epoch": 149.46666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002959523096756742,
+      "loss": 0.4864,
+      "step": 56050
+    },
+    {
+      "epoch": 149.49333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002959508587969175,
+      "loss": 0.4777,
+      "step": 56060
+    },
+    {
+      "epoch": 149.52,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002959494076617341,
+      "loss": 0.4719,
+      "step": 56070
+    },
+    {
+      "epoch": 149.54666666666665,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002959479562701267,
+      "loss": 0.4685,
+      "step": 56080
+    },
+    {
+      "epoch": 149.57333333333332,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029594650462209774,
+      "loss": 0.468,
+      "step": 56090
+    },
+    {
+      "epoch": 149.6,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002959450527176498,
+      "loss": 0.461,
+      "step": 56100
+    },
+    {
+      "epoch": 149.62666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029594360055678546,
+      "loss": 0.4487,
+      "step": 56110
+    },
+    {
+      "epoch": 149.65333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002959421481395072,
+      "loss": 0.4677,
+      "step": 56120
+    },
+    {
+      "epoch": 149.68,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002959406954658176,
+      "loss": 0.4598,
+      "step": 56130
+    },
+    {
+      "epoch": 149.70666666666668,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029593924253571926,
+      "loss": 0.4597,
+      "step": 56140
+    },
+    {
+      "epoch": 149.73333333333332,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002959377893492147,
+      "loss": 0.4794,
+      "step": 56150
+    },
+    {
+      "epoch": 149.76,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029593633590630647,
+      "loss": 0.4807,
+      "step": 56160
+    },
+    {
+      "epoch": 149.78666666666666,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002959348822069971,
+      "loss": 0.4794,
+      "step": 56170
+    },
+    {
+      "epoch": 149.81333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002959334282512892,
+      "loss": 0.4643,
+      "step": 56180
+    },
+    {
+      "epoch": 149.84,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0002959319740391853,
+      "loss": 0.47,
+      "step": 56190
+    },
+    {
+      "epoch": 149.86666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029593051957068787,
+      "loss": 0.4694,
+      "step": 56200
+    },
+    {
+      "epoch": 149.89333333333335,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029592906484579963,
+      "loss": 0.4607,
+      "step": 56210
+    },
+    {
+      "epoch": 149.92,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.000295927609864523,
+      "loss": 0.4705,
+      "step": 56220
+    },
+    {
+      "epoch": 149.94666666666666,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029592615462686064,
+      "loss": 0.4632,
+      "step": 56230
+    },
+    {
+      "epoch": 149.97333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029592469913281504,
+      "loss": 0.4746,
+      "step": 56240
+    },
+    {
+      "epoch": 150.0,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029592324338238875,
+      "loss": 0.4636,
+      "step": 56250
+    },
+    {
+      "epoch": 150.0,
+      "eval_loss": 0.4786826968193054,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.265,
+      "eval_samples_per_second": 1.42,
+      "eval_steps_per_second": 0.089,
+      "step": 56250
+    },
+    {
+      "epoch": 150.02666666666667,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029592178737558437,
+      "loss": 0.4837,
+      "step": 56260
+    },
+    {
+      "epoch": 150.05333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002959203311124044,
+      "loss": 0.4892,
+      "step": 56270
+    },
+    {
+      "epoch": 150.08,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002959188745928515,
+      "loss": 0.4766,
+      "step": 56280
+    },
+    {
+      "epoch": 150.10666666666665,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029591741781692806,
+      "loss": 0.4712,
+      "step": 56290
+    },
+    {
+      "epoch": 150.13333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029591596078463686,
+      "loss": 0.4729,
+      "step": 56300
+    },
+    {
+      "epoch": 150.16,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029591450349598025,
+      "loss": 0.4763,
+      "step": 56310
+    },
+    {
+      "epoch": 150.18666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002959130459509609,
+      "loss": 0.469,
+      "step": 56320
+    },
+    {
+      "epoch": 150.21333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002959115881495814,
+      "loss": 0.4638,
+      "step": 56330
+    },
+    {
+      "epoch": 150.24,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029591013009184423,
+      "loss": 0.467,
+      "step": 56340
+    },
+    {
+      "epoch": 150.26666666666668,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029590867177775197,
+      "loss": 0.4718,
+      "step": 56350
+    },
+    {
+      "epoch": 150.29333333333332,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002959072132073072,
+      "loss": 0.4665,
+      "step": 56360
+    },
+    {
+      "epoch": 150.32,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002959057543805125,
+      "loss": 0.4695,
+      "step": 56370
+    },
+    {
+      "epoch": 150.34666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002959042952973705,
+      "loss": 0.4785,
+      "step": 56380
+    },
+    {
+      "epoch": 150.37333333333333,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00029590283595788355,
+      "loss": 0.4736,
+      "step": 56390
+    },
+    {
+      "epoch": 150.4,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0002959013763620544,
+      "loss": 0.4703,
+      "step": 56400
+    },
+    {
+      "epoch": 150.42666666666668,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029589991650988553,
+      "loss": 0.4769,
+      "step": 56410
+    },
+    {
+      "epoch": 150.45333333333335,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002958984564013796,
+      "loss": 0.4789,
+      "step": 56420
+    },
+    {
+      "epoch": 150.48,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029589699603653903,
+      "loss": 0.4849,
+      "step": 56430
+    },
+    {
+      "epoch": 150.50666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002958955354153665,
+      "loss": 0.4752,
+      "step": 56440
+    },
+    {
+      "epoch": 150.53333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029589407453786454,
+      "loss": 0.4683,
+      "step": 56450
+    },
+    {
+      "epoch": 150.56,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029589261340403565,
+      "loss": 0.4696,
+      "step": 56460
+    },
+    {
+      "epoch": 150.58666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002958911520138826,
+      "loss": 0.4656,
+      "step": 56470
+    },
+    {
+      "epoch": 150.61333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002958896903674077,
+      "loss": 0.4516,
+      "step": 56480
+    },
+    {
+      "epoch": 150.64,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029588822846461367,
+      "loss": 0.4567,
+      "step": 56490
+    },
+    {
+      "epoch": 150.66666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000295886766305503,
+      "loss": 0.4719,
+      "step": 56500
+    },
+    {
+      "epoch": 150.69333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029588530389007843,
+      "loss": 0.4552,
+      "step": 56510
+    },
+    {
+      "epoch": 150.72,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002958838412183423,
+      "loss": 0.472,
+      "step": 56520
+    },
+    {
+      "epoch": 150.74666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029588237829029733,
+      "loss": 0.4759,
+      "step": 56530
+    },
+    {
+      "epoch": 150.77333333333334,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029588091510594604,
+      "loss": 0.4832,
+      "step": 56540
+    },
+    {
+      "epoch": 150.8,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029587945166529103,
+      "loss": 0.4703,
+      "step": 56550
+    },
+    {
+      "epoch": 150.82666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002958779879683348,
+      "loss": 0.4654,
+      "step": 56560
+    },
+    {
+      "epoch": 150.85333333333332,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029587652401508,
+      "loss": 0.4736,
+      "step": 56570
+    },
+    {
+      "epoch": 150.88,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002958750598055292,
+      "loss": 0.4623,
+      "step": 56580
+    },
+    {
+      "epoch": 150.90666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002958735953396849,
+      "loss": 0.4646,
+      "step": 56590
+    },
+    {
+      "epoch": 150.93333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002958721306175497,
+      "loss": 0.4667,
+      "step": 56600
+    },
+    {
+      "epoch": 150.96,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002958706656391262,
+      "loss": 0.4644,
+      "step": 56610
+    },
+    {
+      "epoch": 150.98666666666668,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000295869200404417,
+      "loss": 0.4788,
+      "step": 56620
+    },
+    {
+      "epoch": 151.0,
+      "eval_loss": 0.4782044291496277,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4378,
+      "eval_samples_per_second": 1.533,
+      "eval_steps_per_second": 0.096,
+      "step": 56625
+    },
+    {
+      "epoch": 151.01333333333332,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002958677349134246,
+      "loss": 0.4672,
+      "step": 56630
+    },
+    {
+      "epoch": 151.04,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029586626916615165,
+      "loss": 0.4916,
+      "step": 56640
+    },
+    {
+      "epoch": 151.06666666666666,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002958648031626006,
+      "loss": 0.4791,
+      "step": 56650
+    },
+    {
+      "epoch": 151.09333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002958633369027742,
+      "loss": 0.4737,
+      "step": 56660
+    },
+    {
+      "epoch": 151.12,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002958618703866749,
+      "loss": 0.4703,
+      "step": 56670
+    },
+    {
+      "epoch": 151.14666666666668,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002958604036143054,
+      "loss": 0.4782,
+      "step": 56680
+    },
+    {
+      "epoch": 151.17333333333335,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002958589365856681,
+      "loss": 0.4698,
+      "step": 56690
+    },
+    {
+      "epoch": 151.2,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002958574693007657,
+      "loss": 0.4678,
+      "step": 56700
+    },
+    {
+      "epoch": 151.22666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029585600175960076,
+      "loss": 0.466,
+      "step": 56710
+    },
+    {
+      "epoch": 151.25333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002958545339621758,
+      "loss": 0.4654,
+      "step": 56720
+    },
+    {
+      "epoch": 151.28,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002958530659084935,
+      "loss": 0.4796,
+      "step": 56730
+    },
+    {
+      "epoch": 151.30666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002958515975985564,
+      "loss": 0.4607,
+      "step": 56740
+    },
+    {
+      "epoch": 151.33333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029585012903236705,
+      "loss": 0.4753,
+      "step": 56750
+    },
+    {
+      "epoch": 151.36,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.000295848660209928,
+      "loss": 0.4776,
+      "step": 56760
+    },
+    {
+      "epoch": 151.38666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002958471911312419,
+      "loss": 0.469,
+      "step": 56770
+    },
+    {
+      "epoch": 151.41333333333333,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.00029584572179631134,
+      "loss": 0.4739,
+      "step": 56780
+    },
+    {
+      "epoch": 151.44,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002958442522051388,
+      "loss": 0.4766,
+      "step": 56790
+    },
+    {
+      "epoch": 151.46666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.000295842782357727,
+      "loss": 0.4859,
+      "step": 56800
+    },
+    {
+      "epoch": 151.49333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002958413122540784,
+      "loss": 0.4778,
+      "step": 56810
+    },
+    {
+      "epoch": 151.52,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029583984189419563,
+      "loss": 0.4716,
+      "step": 56820
+    },
+    {
+      "epoch": 151.54666666666665,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029583837127808134,
+      "loss": 0.4689,
+      "step": 56830
+    },
+    {
+      "epoch": 151.57333333333332,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029583690040573804,
+      "loss": 0.4676,
+      "step": 56840
+    },
+    {
+      "epoch": 151.6,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029583542927716827,
+      "loss": 0.4618,
+      "step": 56850
+    },
+    {
+      "epoch": 151.62666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029583395789237475,
+      "loss": 0.4483,
+      "step": 56860
+    },
+    {
+      "epoch": 151.65333333333334,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029583248625136,
+      "loss": 0.4684,
+      "step": 56870
+    },
+    {
+      "epoch": 151.68,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002958310143541265,
+      "loss": 0.4596,
+      "step": 56880
+    },
+    {
+      "epoch": 151.70666666666668,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029582954220067694,
+      "loss": 0.4602,
+      "step": 56890
+    },
+    {
+      "epoch": 151.73333333333332,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029582806979101394,
+      "loss": 0.4789,
+      "step": 56900
+    },
+    {
+      "epoch": 151.76,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029582659712514003,
+      "loss": 0.4805,
+      "step": 56910
+    },
+    {
+      "epoch": 151.78666666666666,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0002958251242030578,
+      "loss": 0.4786,
+      "step": 56920
+    },
+    {
+      "epoch": 151.81333333333333,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029582365102476986,
+      "loss": 0.4642,
+      "step": 56930
+    },
+    {
+      "epoch": 151.84,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0002958221775902788,
+      "loss": 0.4702,
+      "step": 56940
+    },
+    {
+      "epoch": 151.86666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029582070389958713,
+      "loss": 0.4694,
+      "step": 56950
+    },
+    {
+      "epoch": 151.89333333333335,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029581922995269757,
+      "loss": 0.4602,
+      "step": 56960
+    },
+    {
+      "epoch": 151.92,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029581775574961266,
+      "loss": 0.4706,
+      "step": 56970
+    },
+    {
+      "epoch": 151.94666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002958162812903349,
+      "loss": 0.4635,
+      "step": 56980
+    },
+    {
+      "epoch": 151.97333333333333,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.000295814806574867,
+      "loss": 0.4742,
+      "step": 56990
+    },
+    {
+      "epoch": 152.0,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002958133316032115,
+      "loss": 0.4633,
+      "step": 57000
+    },
+    {
+      "epoch": 152.0,
+      "eval_loss": 0.4801050126552582,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.088,
+      "eval_samples_per_second": 1.443,
+      "eval_steps_per_second": 0.09,
+      "step": 57000
+    },
+    {
+      "epoch": 152.02666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.000295811856375371,
+      "loss": 0.4835,
+      "step": 57010
+    },
+    {
+      "epoch": 152.05333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.000295810380891348,
+      "loss": 0.4886,
+      "step": 57020
+    },
+    {
+      "epoch": 152.08,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029580890515114533,
+      "loss": 0.4753,
+      "step": 57030
+    },
+    {
+      "epoch": 152.10666666666665,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029580742915476536,
+      "loss": 0.471,
+      "step": 57040
+    },
+    {
+      "epoch": 152.13333333333333,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029580595290221076,
+      "loss": 0.4723,
+      "step": 57050
+    },
+    {
+      "epoch": 152.16,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002958044763934841,
+      "loss": 0.4767,
+      "step": 57060
+    },
+    {
+      "epoch": 152.18666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029580299962858803,
+      "loss": 0.4693,
+      "step": 57070
+    },
+    {
+      "epoch": 152.21333333333334,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029580152260752507,
+      "loss": 0.4642,
+      "step": 57080
+    },
+    {
+      "epoch": 152.24,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029580004533029787,
+      "loss": 0.4685,
+      "step": 57090
+    },
+    {
+      "epoch": 152.26666666666668,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.000295798567796909,
+      "loss": 0.4721,
+      "step": 57100
+    },
+    {
+      "epoch": 152.29333333333332,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002957970900073611,
+      "loss": 0.4665,
+      "step": 57110
+    },
+    {
+      "epoch": 152.32,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002957956119616567,
+      "loss": 0.4689,
+      "step": 57120
+    },
+    {
+      "epoch": 152.34666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029579413365979845,
+      "loss": 0.4782,
+      "step": 57130
+    },
+    {
+      "epoch": 152.37333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002957926551017889,
+      "loss": 0.4742,
+      "step": 57140
+    },
+    {
+      "epoch": 152.4,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002957911762876307,
+      "loss": 0.4696,
+      "step": 57150
+    },
+    {
+      "epoch": 152.42666666666668,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029578969721732643,
+      "loss": 0.4762,
+      "step": 57160
+    },
+    {
+      "epoch": 152.45333333333335,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029578821789087863,
+      "loss": 0.4787,
+      "step": 57170
+    },
+    {
+      "epoch": 152.48,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029578673830828997,
+      "loss": 0.4848,
+      "step": 57180
+    },
+    {
+      "epoch": 152.50666666666666,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002957852584695631,
+      "loss": 0.4744,
+      "step": 57190
+    },
+    {
+      "epoch": 152.53333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002957837783747005,
+      "loss": 0.4686,
+      "step": 57200
+    },
+    {
+      "epoch": 152.56,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029578229802370475,
+      "loss": 0.4699,
+      "step": 57210
+    },
+    {
+      "epoch": 152.58666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002957808174165786,
+      "loss": 0.4651,
+      "step": 57220
+    },
+    {
+      "epoch": 152.61333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029577933655332457,
+      "loss": 0.4511,
+      "step": 57230
+    },
+    {
+      "epoch": 152.64,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029577785543394525,
+      "loss": 0.4565,
+      "step": 57240
+    },
+    {
+      "epoch": 152.66666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002957763740584432,
+      "loss": 0.4722,
+      "step": 57250
+    },
+    {
+      "epoch": 152.69333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002957748924268212,
+      "loss": 0.4543,
+      "step": 57260
+    },
+    {
+      "epoch": 152.72,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002957734105390816,
+      "loss": 0.4716,
+      "step": 57270
+    },
+    {
+      "epoch": 152.74666666666667,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002957719283952272,
+      "loss": 0.4767,
+      "step": 57280
+    },
+    {
+      "epoch": 152.77333333333334,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002957704459952606,
+      "loss": 0.4836,
+      "step": 57290
+    },
+    {
+      "epoch": 152.8,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002957689633391842,
+      "loss": 0.4712,
+      "step": 57300
+    },
+    {
+      "epoch": 152.82666666666665,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029576748042700086,
+      "loss": 0.4655,
+      "step": 57310
+    },
+    {
+      "epoch": 152.85333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000295765997258713,
+      "loss": 0.4742,
+      "step": 57320
+    },
+    {
+      "epoch": 152.88,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029576451383432335,
+      "loss": 0.4624,
+      "step": 57330
+    },
+    {
+      "epoch": 152.90666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002957630301538345,
+      "loss": 0.4651,
+      "step": 57340
+    },
+    {
+      "epoch": 152.93333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029576154621724897,
+      "loss": 0.4671,
+      "step": 57350
+    },
+    {
+      "epoch": 152.96,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002957600620245694,
+      "loss": 0.464,
+      "step": 57360
+    },
+    {
+      "epoch": 152.98666666666668,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029575857757579846,
+      "loss": 0.4777,
+      "step": 57370
+    },
+    {
+      "epoch": 153.0,
+      "eval_loss": 0.4788782000541687,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2479,
+      "eval_samples_per_second": 1.561,
+      "eval_steps_per_second": 0.098,
+      "step": 57375
+    },
+    {
+      "epoch": 153.01333333333332,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002957570928709387,
+      "loss": 0.4673,
+      "step": 57380
+    },
+    {
+      "epoch": 153.04,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002957556079099928,
+      "loss": 0.4914,
+      "step": 57390
+    },
+    {
+      "epoch": 153.06666666666666,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029575412269296317,
+      "loss": 0.4789,
+      "step": 57400
+    },
+    {
+      "epoch": 153.09333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029575263721985266,
+      "loss": 0.474,
+      "step": 57410
+    },
+    {
+      "epoch": 153.12,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002957511514906638,
+      "loss": 0.4703,
+      "step": 57420
+    },
+    {
+      "epoch": 153.14666666666668,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029574966550539915,
+      "loss": 0.4791,
+      "step": 57430
+    },
+    {
+      "epoch": 153.17333333333335,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029574817926406136,
+      "loss": 0.4697,
+      "step": 57440
+    },
+    {
+      "epoch": 153.2,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029574669276665307,
+      "loss": 0.4679,
+      "step": 57450
+    },
+    {
+      "epoch": 153.22666666666666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029574520601317677,
+      "loss": 0.4649,
+      "step": 57460
+    },
+    {
+      "epoch": 153.25333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002957437190036352,
+      "loss": 0.4655,
+      "step": 57470
+    },
+    {
+      "epoch": 153.28,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029574223173803093,
+      "loss": 0.4788,
+      "step": 57480
+    },
+    {
+      "epoch": 153.30666666666667,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029574074421636666,
+      "loss": 0.4605,
+      "step": 57490
+    },
+    {
+      "epoch": 153.33333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029573925643864484,
+      "loss": 0.4753,
+      "step": 57500
+    },
+    {
+      "epoch": 153.36,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002957377684048682,
+      "loss": 0.478,
+      "step": 57510
+    },
+    {
+      "epoch": 153.38666666666666,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029573628011503924,
+      "loss": 0.4689,
+      "step": 57520
+    },
+    {
+      "epoch": 153.41333333333333,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00029573479156916067,
+      "loss": 0.4739,
+      "step": 57530
+    },
+    {
+      "epoch": 153.44,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002957333027672351,
+      "loss": 0.4773,
+      "step": 57540
+    },
+    {
+      "epoch": 153.46666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029573181370926515,
+      "loss": 0.4865,
+      "step": 57550
+    },
+    {
+      "epoch": 153.49333333333334,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.00029573032439525346,
+      "loss": 0.4772,
+      "step": 57560
+    },
+    {
+      "epoch": 153.52,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029572883482520255,
+      "loss": 0.472,
+      "step": 57570
+    },
+    {
+      "epoch": 153.54666666666665,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.000295727344999115,
+      "loss": 0.4691,
+      "step": 57580
+    },
+    {
+      "epoch": 153.57333333333332,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002957258549169937,
+      "loss": 0.4683,
+      "step": 57590
+    },
+    {
+      "epoch": 153.6,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000295724364578841,
+      "loss": 0.4615,
+      "step": 57600
+    },
+    {
+      "epoch": 153.62666666666667,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029572287398465955,
+      "loss": 0.4486,
+      "step": 57610
+    },
+    {
+      "epoch": 153.65333333333334,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002957213831344521,
+      "loss": 0.4675,
+      "step": 57620
+    },
+    {
+      "epoch": 153.68,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002957198920282212,
+      "loss": 0.4602,
+      "step": 57630
+    },
+    {
+      "epoch": 153.70666666666668,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002957184006659694,
+      "loss": 0.4598,
+      "step": 57640
+    },
+    {
+      "epoch": 153.73333333333332,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002957169090476995,
+      "loss": 0.4793,
+      "step": 57650
+    },
+    {
+      "epoch": 153.76,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002957154171734139,
+      "loss": 0.4798,
+      "step": 57660
+    },
+    {
+      "epoch": 153.78666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002957139250431153,
+      "loss": 0.4794,
+      "step": 57670
+    },
+    {
+      "epoch": 153.81333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029571243265680644,
+      "loss": 0.4645,
+      "step": 57680
+    },
+    {
+      "epoch": 153.84,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029571094001448983,
+      "loss": 0.4694,
+      "step": 57690
+    },
+    {
+      "epoch": 153.86666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029570944711616805,
+      "loss": 0.4688,
+      "step": 57700
+    },
+    {
+      "epoch": 153.89333333333335,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002957079539618439,
+      "loss": 0.4597,
+      "step": 57710
+    },
+    {
+      "epoch": 153.92,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002957064605515198,
+      "loss": 0.4709,
+      "step": 57720
+    },
+    {
+      "epoch": 153.94666666666666,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029570496688519847,
+      "loss": 0.4634,
+      "step": 57730
+    },
+    {
+      "epoch": 153.97333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029570347296288253,
+      "loss": 0.4753,
+      "step": 57740
+    },
+    {
+      "epoch": 154.0,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002957019787845746,
+      "loss": 0.4637,
+      "step": 57750
+    },
+    {
+      "epoch": 154.0,
+      "eval_loss": 0.4789036810398102,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9362,
+      "eval_samples_per_second": 1.61,
+      "eval_steps_per_second": 0.101,
+      "step": 57750
+    },
+    {
+      "epoch": 154.02666666666667,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002957004843502773,
+      "loss": 0.4838,
+      "step": 57760
+    },
+    {
+      "epoch": 154.05333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002956989896599933,
+      "loss": 0.4894,
+      "step": 57770
+    },
+    {
+      "epoch": 154.08,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029569749471372516,
+      "loss": 0.4765,
+      "step": 57780
+    },
+    {
+      "epoch": 154.10666666666665,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029569599951147555,
+      "loss": 0.4716,
+      "step": 57790
+    },
+    {
+      "epoch": 154.13333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029569450405324705,
+      "loss": 0.4725,
+      "step": 57800
+    },
+    {
+      "epoch": 154.16,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002956930083390423,
+      "loss": 0.4766,
+      "step": 57810
+    },
+    {
+      "epoch": 154.18666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000295691512368864,
+      "loss": 0.4689,
+      "step": 57820
+    },
+    {
+      "epoch": 154.21333333333334,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029569001614271475,
+      "loss": 0.4636,
+      "step": 57830
+    },
+    {
+      "epoch": 154.24,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002956885196605971,
+      "loss": 0.4677,
+      "step": 57840
+    },
+    {
+      "epoch": 154.26666666666668,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002956870229225137,
+      "loss": 0.4721,
+      "step": 57850
+    },
+    {
+      "epoch": 154.29333333333332,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002956855259284673,
+      "loss": 0.4664,
+      "step": 57860
+    },
+    {
+      "epoch": 154.32,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029568402867846043,
+      "loss": 0.4698,
+      "step": 57870
+    },
+    {
+      "epoch": 154.34666666666666,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002956825311724957,
+      "loss": 0.4782,
+      "step": 57880
+    },
+    {
+      "epoch": 154.37333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029568103341057574,
+      "loss": 0.4739,
+      "step": 57890
+    },
+    {
+      "epoch": 154.4,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002956795353927033,
+      "loss": 0.4697,
+      "step": 57900
+    },
+    {
+      "epoch": 154.42666666666668,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002956780371188809,
+      "loss": 0.4771,
+      "step": 57910
+    },
+    {
+      "epoch": 154.45333333333335,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002956765385891111,
+      "loss": 0.4791,
+      "step": 57920
+    },
+    {
+      "epoch": 154.48,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029567503980339676,
+      "loss": 0.4847,
+      "step": 57930
+    },
+    {
+      "epoch": 154.50666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029567354076174033,
+      "loss": 0.4745,
+      "step": 57940
+    },
+    {
+      "epoch": 154.53333333333333,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029567204146414453,
+      "loss": 0.4673,
+      "step": 57950
+    },
+    {
+      "epoch": 154.56,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029567054191061197,
+      "loss": 0.4692,
+      "step": 57960
+    },
+    {
+      "epoch": 154.58666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002956690421011452,
+      "loss": 0.4656,
+      "step": 57970
+    },
+    {
+      "epoch": 154.61333333333334,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000295667542035747,
+      "loss": 0.4507,
+      "step": 57980
+    },
+    {
+      "epoch": 154.64,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029566604171441993,
+      "loss": 0.4571,
+      "step": 57990
+    },
+    {
+      "epoch": 154.66666666666666,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029566454113716666,
+      "loss": 0.4722,
+      "step": 58000
+    },
+    {
+      "epoch": 154.69333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002956630403039898,
+      "loss": 0.4553,
+      "step": 58010
+    },
+    {
+      "epoch": 154.72,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00029566153921489194,
+      "loss": 0.4724,
+      "step": 58020
+    },
+    {
+      "epoch": 154.74666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002956600378698758,
+      "loss": 0.4768,
+      "step": 58030
+    },
+    {
+      "epoch": 154.77333333333334,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029565853626894393,
+      "loss": 0.4837,
+      "step": 58040
+    },
+    {
+      "epoch": 154.8,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002956570344120991,
+      "loss": 0.4701,
+      "step": 58050
+    },
+    {
+      "epoch": 154.82666666666665,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029565553229934385,
+      "loss": 0.4656,
+      "step": 58060
+    },
+    {
+      "epoch": 154.85333333333332,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029565402993068083,
+      "loss": 0.4735,
+      "step": 58070
+    },
+    {
+      "epoch": 154.88,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002956525273061127,
+      "loss": 0.4624,
+      "step": 58080
+    },
+    {
+      "epoch": 154.90666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002956510244256421,
+      "loss": 0.4646,
+      "step": 58090
+    },
+    {
+      "epoch": 154.93333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002956495212892716,
+      "loss": 0.4656,
+      "step": 58100
+    },
+    {
+      "epoch": 154.96,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029564801789700395,
+      "loss": 0.4641,
+      "step": 58110
+    },
+    {
+      "epoch": 154.98666666666668,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029564651424884175,
+      "loss": 0.478,
+      "step": 58120
+    },
+    {
+      "epoch": 155.0,
+      "eval_loss": 0.47817400097846985,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3606,
+      "eval_samples_per_second": 1.544,
+      "eval_steps_per_second": 0.097,
+      "step": 58125
+    },
+    {
+      "epoch": 155.01333333333332,
+      "grad_norm": 0.5,
+      "learning_rate": 0.00029564501034478756,
+      "loss": 0.4681,
+      "step": 58130
+    },
+    {
+      "epoch": 155.04,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002956435061848442,
+      "loss": 0.4919,
+      "step": 58140
+    },
+    {
+      "epoch": 155.06666666666666,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002956420017690141,
+      "loss": 0.4791,
+      "step": 58150
+    },
+    {
+      "epoch": 155.09333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002956404970973001,
+      "loss": 0.4743,
+      "step": 58160
+    },
+    {
+      "epoch": 155.12,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002956389921697047,
+      "loss": 0.4699,
+      "step": 58170
+    },
+    {
+      "epoch": 155.14666666666668,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029563748698623065,
+      "loss": 0.4787,
+      "step": 58180
+    },
+    {
+      "epoch": 155.17333333333335,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029563598154688054,
+      "loss": 0.4692,
+      "step": 58190
+    },
+    {
+      "epoch": 155.2,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.000295634475851657,
+      "loss": 0.4675,
+      "step": 58200
+    },
+    {
+      "epoch": 155.22666666666666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002956329699005627,
+      "loss": 0.4659,
+      "step": 58210
+    },
+    {
+      "epoch": 155.25333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029563146369360026,
+      "loss": 0.4653,
+      "step": 58220
+    },
+    {
+      "epoch": 155.28,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029562995723077235,
+      "loss": 0.4793,
+      "step": 58230
+    },
+    {
+      "epoch": 155.30666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029562845051208167,
+      "loss": 0.46,
+      "step": 58240
+    },
+    {
+      "epoch": 155.33333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002956269435375307,
+      "loss": 0.4758,
+      "step": 58250
+    },
+    {
+      "epoch": 155.36,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002956254363071223,
+      "loss": 0.4782,
+      "step": 58260
+    },
+    {
+      "epoch": 155.38666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000295623928820859,
+      "loss": 0.4693,
+      "step": 58270
+    },
+    {
+      "epoch": 155.41333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002956224210787435,
+      "loss": 0.473,
+      "step": 58280
+    },
+    {
+      "epoch": 155.44,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002956209130807783,
+      "loss": 0.4761,
+      "step": 58290
+    },
+    {
+      "epoch": 155.46666666666667,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0002956194048269662,
+      "loss": 0.486,
+      "step": 58300
+    },
+    {
+      "epoch": 155.49333333333334,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002956178963173099,
+      "loss": 0.4774,
+      "step": 58310
+    },
+    {
+      "epoch": 155.52,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002956163875518119,
+      "loss": 0.4713,
+      "step": 58320
+    },
+    {
+      "epoch": 155.54666666666665,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0002956148785304749,
+      "loss": 0.4686,
+      "step": 58330
+    },
+    {
+      "epoch": 155.57333333333332,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002956133692533016,
+      "loss": 0.4681,
+      "step": 58340
+    },
+    {
+      "epoch": 155.6,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029561185972029456,
+      "loss": 0.4616,
+      "step": 58350
+    },
+    {
+      "epoch": 155.62666666666667,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.00029561034993145656,
+      "loss": 0.4488,
+      "step": 58360
+    },
+    {
+      "epoch": 155.65333333333334,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029560883988679015,
+      "loss": 0.468,
+      "step": 58370
+    },
+    {
+      "epoch": 155.68,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.000295607329586298,
+      "loss": 0.4594,
+      "step": 58380
+    },
+    {
+      "epoch": 155.70666666666668,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029560581902998283,
+      "loss": 0.46,
+      "step": 58390
+    },
+    {
+      "epoch": 155.73333333333332,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029560430821784717,
+      "loss": 0.4785,
+      "step": 58400
+    },
+    {
+      "epoch": 155.76,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029560279714989384,
+      "loss": 0.4805,
+      "step": 58410
+    },
+    {
+      "epoch": 155.78666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029560128582612535,
+      "loss": 0.4782,
+      "step": 58420
+    },
+    {
+      "epoch": 155.81333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002955997742465444,
+      "loss": 0.4644,
+      "step": 58430
+    },
+    {
+      "epoch": 155.84,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.0002955982624111537,
+      "loss": 0.4701,
+      "step": 58440
+    },
+    {
+      "epoch": 155.86666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002955967503199558,
+      "loss": 0.4692,
+      "step": 58450
+    },
+    {
+      "epoch": 155.89333333333335,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002955952379729534,
+      "loss": 0.4597,
+      "step": 58460
+    },
+    {
+      "epoch": 155.92,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029559372537014923,
+      "loss": 0.4701,
+      "step": 58470
+    },
+    {
+      "epoch": 155.94666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002955922125115459,
+      "loss": 0.4635,
+      "step": 58480
+    },
+    {
+      "epoch": 155.97333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029559069939714597,
+      "loss": 0.4747,
+      "step": 58490
+    },
+    {
+      "epoch": 156.0,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.00029558918602695227,
+      "loss": 0.4635,
+      "step": 58500
+    },
+    {
+      "epoch": 156.0,
+      "eval_loss": 0.47919952869415283,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.9366,
+      "eval_samples_per_second": 1.463,
+      "eval_steps_per_second": 0.091,
+      "step": 58500
+    },
+    {
+      "epoch": 156.02666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029558767240096736,
+      "loss": 0.4834,
+      "step": 58510
+    },
+    {
+      "epoch": 156.05333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029558615851919395,
+      "loss": 0.4883,
+      "step": 58520
+    },
+    {
+      "epoch": 156.08,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002955846443816346,
+      "loss": 0.4755,
+      "step": 58530
+    },
+    {
+      "epoch": 156.10666666666665,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002955831299882921,
+      "loss": 0.4714,
+      "step": 58540
+    },
+    {
+      "epoch": 156.13333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029558161533916894,
+      "loss": 0.4731,
+      "step": 58550
+    },
+    {
+      "epoch": 156.16,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.000295580100434268,
+      "loss": 0.4766,
+      "step": 58560
+    },
+    {
+      "epoch": 156.18666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029557858527359175,
+      "loss": 0.4692,
+      "step": 58570
+    },
+    {
+      "epoch": 156.21333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029557706985714296,
+      "loss": 0.4633,
+      "step": 58580
+    },
+    {
+      "epoch": 156.24,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002955755541849243,
+      "loss": 0.4672,
+      "step": 58590
+    },
+    {
+      "epoch": 156.26666666666668,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029557403825693836,
+      "loss": 0.4724,
+      "step": 58600
+    },
+    {
+      "epoch": 156.29333333333332,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002955725220731878,
+      "loss": 0.4662,
+      "step": 58610
+    },
+    {
+      "epoch": 156.32,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002955710056336754,
+      "loss": 0.4696,
+      "step": 58620
+    },
+    {
+      "epoch": 156.34666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029556948893840374,
+      "loss": 0.4775,
+      "step": 58630
+    },
+    {
+      "epoch": 156.37333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029556797198737546,
+      "loss": 0.4734,
+      "step": 58640
+    },
+    {
+      "epoch": 156.4,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029556645478059327,
+      "loss": 0.4702,
+      "step": 58650
+    },
+    {
+      "epoch": 156.42666666666668,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002955649373180598,
+      "loss": 0.4771,
+      "step": 58660
+    },
+    {
+      "epoch": 156.45333333333335,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029556341959977776,
+      "loss": 0.4782,
+      "step": 58670
+    },
+    {
+      "epoch": 156.48,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002955619016257498,
+      "loss": 0.485,
+      "step": 58680
+    },
+    {
+      "epoch": 156.50666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002955603833959786,
+      "loss": 0.4745,
+      "step": 58690
+    },
+    {
+      "epoch": 156.53333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002955588649104668,
+      "loss": 0.4684,
+      "step": 58700
+    },
+    {
+      "epoch": 156.56,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029555734616921705,
+      "loss": 0.4694,
+      "step": 58710
+    },
+    {
+      "epoch": 156.58666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029555582717223205,
+      "loss": 0.4654,
+      "step": 58720
+    },
+    {
+      "epoch": 156.61333333333334,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002955543079195145,
+      "loss": 0.4509,
+      "step": 58730
+    },
+    {
+      "epoch": 156.64,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.000295552788411067,
+      "loss": 0.4566,
+      "step": 58740
+    },
+    {
+      "epoch": 156.66666666666666,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002955512686468923,
+      "loss": 0.4713,
+      "step": 58750
+    },
+    {
+      "epoch": 156.69333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000295549748626993,
+      "loss": 0.4543,
+      "step": 58760
+    },
+    {
+      "epoch": 156.72,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029554822835137174,
+      "loss": 0.4719,
+      "step": 58770
+    },
+    {
+      "epoch": 156.74666666666667,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002955467078200313,
+      "loss": 0.4758,
+      "step": 58780
+    },
+    {
+      "epoch": 156.77333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002955451870329743,
+      "loss": 0.4834,
+      "step": 58790
+    },
+    {
+      "epoch": 156.8,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0002955436659902034,
+      "loss": 0.4706,
+      "step": 58800
+    },
+    {
+      "epoch": 156.82666666666665,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029554214469172126,
+      "loss": 0.4652,
+      "step": 58810
+    },
+    {
+      "epoch": 156.85333333333332,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002955406231375306,
+      "loss": 0.4733,
+      "step": 58820
+    },
+    {
+      "epoch": 156.88,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002955391013276341,
+      "loss": 0.4615,
+      "step": 58830
+    },
+    {
+      "epoch": 156.90666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002955375792620344,
+      "loss": 0.4646,
+      "step": 58840
+    },
+    {
+      "epoch": 156.93333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029553605694073413,
+      "loss": 0.4669,
+      "step": 58850
+    },
+    {
+      "epoch": 156.96,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.000295534534363736,
+      "loss": 0.4644,
+      "step": 58860
+    },
+    {
+      "epoch": 156.98666666666668,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002955330115310427,
+      "loss": 0.4789,
+      "step": 58870
+    },
+    {
+      "epoch": 157.0,
+      "eval_loss": 0.4790196120738983,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6513,
+      "eval_samples_per_second": 1.502,
+      "eval_steps_per_second": 0.094,
+      "step": 58875
+    },
+    {
+      "epoch": 157.01333333333332,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000295531488442657,
+      "loss": 0.4684,
+      "step": 58880
+    },
+    {
+      "epoch": 157.04,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002955299650985814,
+      "loss": 0.4916,
+      "step": 58890
+    },
+    {
+      "epoch": 157.06666666666666,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002955284414988186,
+      "loss": 0.4794,
+      "step": 58900
+    },
+    {
+      "epoch": 157.09333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002955269176433714,
+      "loss": 0.4741,
+      "step": 58910
+    },
+    {
+      "epoch": 157.12,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.0002955253935322424,
+      "loss": 0.4699,
+      "step": 58920
+    },
+    {
+      "epoch": 157.14666666666668,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002955238691654343,
+      "loss": 0.4786,
+      "step": 58930
+    },
+    {
+      "epoch": 157.17333333333335,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002955223445429497,
+      "loss": 0.4701,
+      "step": 58940
+    },
+    {
+      "epoch": 157.2,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002955208196647914,
+      "loss": 0.4681,
+      "step": 58950
+    },
+    {
+      "epoch": 157.22666666666666,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000295519294530962,
+      "loss": 0.4653,
+      "step": 58960
+    },
+    {
+      "epoch": 157.25333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029551776914146425,
+      "loss": 0.4657,
+      "step": 58970
+    },
+    {
+      "epoch": 157.28,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002955162434963007,
+      "loss": 0.4789,
+      "step": 58980
+    },
+    {
+      "epoch": 157.30666666666667,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002955147175954742,
+      "loss": 0.46,
+      "step": 58990
+    },
+    {
+      "epoch": 157.33333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002955131914389873,
+      "loss": 0.4756,
+      "step": 59000
+    },
+    {
+      "epoch": 157.36,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029551166502684266,
+      "loss": 0.4777,
+      "step": 59010
+    },
+    {
+      "epoch": 157.38666666666666,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0002955101383590431,
+      "loss": 0.469,
+      "step": 59020
+    },
+    {
+      "epoch": 157.41333333333333,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002955086114355912,
+      "loss": 0.4736,
+      "step": 59030
+    },
+    {
+      "epoch": 157.44,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002955070842564897,
+      "loss": 0.4763,
+      "step": 59040
+    },
+    {
+      "epoch": 157.46666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002955055568217412,
+      "loss": 0.486,
+      "step": 59050
+    },
+    {
+      "epoch": 157.49333333333334,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0002955040291313485,
+      "loss": 0.4772,
+      "step": 59060
+    },
+    {
+      "epoch": 157.52,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002955025011853142,
+      "loss": 0.4713,
+      "step": 59070
+    },
+    {
+      "epoch": 157.54666666666665,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.000295500972983641,
+      "loss": 0.4681,
+      "step": 59080
+    },
+    {
+      "epoch": 157.57333333333332,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002954994445263316,
+      "loss": 0.4682,
+      "step": 59090
+    },
+    {
+      "epoch": 157.6,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029549791581338865,
+      "loss": 0.4613,
+      "step": 59100
+    },
+    {
+      "epoch": 157.62666666666667,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0002954963868448149,
+      "loss": 0.4489,
+      "step": 59110
+    },
+    {
+      "epoch": 157.65333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029549485762061297,
+      "loss": 0.4687,
+      "step": 59120
+    },
+    {
+      "epoch": 157.68,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029549332814078557,
+      "loss": 0.4592,
+      "step": 59130
+    },
+    {
+      "epoch": 157.70666666666668,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029549179840533544,
+      "loss": 0.4602,
+      "step": 59140
+    },
+    {
+      "epoch": 157.73333333333332,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029549026841426513,
+      "loss": 0.4792,
+      "step": 59150
+    },
+    {
+      "epoch": 157.76,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0002954887381675775,
+      "loss": 0.481,
+      "step": 59160
+    },
+    {
+      "epoch": 157.78666666666666,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00029548720766527515,
+      "loss": 0.4805,
+      "step": 59170
+    },
+    {
+      "epoch": 157.81333333333333,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.00029548567690736074,
+      "loss": 0.4645,
+      "step": 59180
+    },
+    {
+      "epoch": 157.84,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00029548414589383704,
+      "loss": 0.47,
+      "step": 59190
+    },
+    {
+      "epoch": 157.86666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029548261462470667,
+      "loss": 0.4692,
+      "step": 59200
+    },
+    {
+      "epoch": 157.89333333333335,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002954810830999723,
+      "loss": 0.4599,
+      "step": 59210
+    },
+    {
+      "epoch": 157.92,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0002954795513196367,
+      "loss": 0.4705,
+      "step": 59220
+    },
+    {
+      "epoch": 157.94666666666666,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029547801928370257,
+      "loss": 0.4625,
+      "step": 59230
+    },
+    {
+      "epoch": 157.97333333333333,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029547648699217253,
+      "loss": 0.475,
+      "step": 59240
+    },
+    {
+      "epoch": 158.0,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002954749544450493,
+      "loss": 0.4637,
+      "step": 59250
+    },
+    {
+      "epoch": 158.0,
+      "eval_loss": 0.47802263498306274,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1878,
+      "eval_samples_per_second": 1.571,
+      "eval_steps_per_second": 0.098,
+      "step": 59250
+    },
+    {
+      "epoch": 158.02666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002954734216423356,
+      "loss": 0.4836,
+      "step": 59260
+    },
+    {
+      "epoch": 158.05333333333334,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029547188858403405,
+      "loss": 0.4885,
+      "step": 59270
+    },
+    {
+      "epoch": 158.08,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002954703552701474,
+      "loss": 0.476,
+      "step": 59280
+    },
+    {
+      "epoch": 158.10666666666665,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002954688217006784,
+      "loss": 0.4711,
+      "step": 59290
+    },
+    {
+      "epoch": 158.13333333333333,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002954672878756296,
+      "loss": 0.4724,
+      "step": 59300
+    },
+    {
+      "epoch": 158.16,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002954657537950038,
+      "loss": 0.4767,
+      "step": 59310
+    },
+    {
+      "epoch": 158.18666666666667,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002954642194588036,
+      "loss": 0.4693,
+      "step": 59320
+    },
+    {
+      "epoch": 158.21333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029546268486703187,
+      "loss": 0.4638,
+      "step": 59330
+    },
+    {
+      "epoch": 158.24,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002954611500196912,
+      "loss": 0.4678,
+      "step": 59340
+    },
+    {
+      "epoch": 158.26666666666668,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029545961491678424,
+      "loss": 0.4718,
+      "step": 59350
+    },
+    {
+      "epoch": 158.29333333333332,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029545807955831373,
+      "loss": 0.4664,
+      "step": 59360
+    },
+    {
+      "epoch": 158.32,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029545654394428234,
+      "loss": 0.4697,
+      "step": 59370
+    },
+    {
+      "epoch": 158.34666666666666,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002954550080746929,
+      "loss": 0.4779,
+      "step": 59380
+    },
+    {
+      "epoch": 158.37333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029545347194954796,
+      "loss": 0.4732,
+      "step": 59390
+    },
+    {
+      "epoch": 158.4,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002954519355688502,
+      "loss": 0.4702,
+      "step": 59400
+    },
+    {
+      "epoch": 158.42666666666668,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002954503989326025,
+      "loss": 0.4761,
+      "step": 59410
+    },
+    {
+      "epoch": 158.45333333333335,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002954488620408074,
+      "loss": 0.4783,
+      "step": 59420
+    },
+    {
+      "epoch": 158.48,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0002954473248934676,
+      "loss": 0.4846,
+      "step": 59430
+    },
+    {
+      "epoch": 158.50666666666666,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002954457874905859,
+      "loss": 0.475,
+      "step": 59440
+    },
+    {
+      "epoch": 158.53333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029544424983216495,
+      "loss": 0.4689,
+      "step": 59450
+    },
+    {
+      "epoch": 158.56,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002954427119182074,
+      "loss": 0.4694,
+      "step": 59460
+    },
+    {
+      "epoch": 158.58666666666667,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.00029544117374871606,
+      "loss": 0.4651,
+      "step": 59470
+    },
+    {
+      "epoch": 158.61333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029543963532369354,
+      "loss": 0.4514,
+      "step": 59480
+    },
+    {
+      "epoch": 158.64,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002954380966431426,
+      "loss": 0.4561,
+      "step": 59490
+    },
+    {
+      "epoch": 158.66666666666666,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002954365577070659,
+      "loss": 0.4716,
+      "step": 59500
+    },
+    {
+      "epoch": 158.69333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002954350185154662,
+      "loss": 0.4547,
+      "step": 59510
+    },
+    {
+      "epoch": 158.72,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029543347906834614,
+      "loss": 0.4718,
+      "step": 59520
+    },
+    {
+      "epoch": 158.74666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029543193936570845,
+      "loss": 0.4763,
+      "step": 59530
+    },
+    {
+      "epoch": 158.77333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029543039940755587,
+      "loss": 0.4833,
+      "step": 59540
+    },
+    {
+      "epoch": 158.8,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029542885919389103,
+      "loss": 0.471,
+      "step": 59550
+    },
+    {
+      "epoch": 158.82666666666665,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002954273187247167,
+      "loss": 0.4658,
+      "step": 59560
+    },
+    {
+      "epoch": 158.85333333333332,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002954257780000356,
+      "loss": 0.4741,
+      "step": 59570
+    },
+    {
+      "epoch": 158.88,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002954242370198504,
+      "loss": 0.4619,
+      "step": 59580
+    },
+    {
+      "epoch": 158.90666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002954226957841638,
+      "loss": 0.4649,
+      "step": 59590
+    },
+    {
+      "epoch": 158.93333333333334,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002954211542929785,
+      "loss": 0.4669,
+      "step": 59600
+    },
+    {
+      "epoch": 158.96,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002954196125462972,
+      "loss": 0.4646,
+      "step": 59610
+    },
+    {
+      "epoch": 158.98666666666668,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.00029541807054412266,
+      "loss": 0.4785,
+      "step": 59620
+    },
+    {
+      "epoch": 159.0,
+      "eval_loss": 0.477658212184906,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3986,
+      "eval_samples_per_second": 1.539,
+      "eval_steps_per_second": 0.096,
+      "step": 59625
+    },
+    {
+      "epoch": 159.01333333333332,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029541652828645755,
+      "loss": 0.4672,
+      "step": 59630
+    },
+    {
+      "epoch": 159.04,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029541498577330465,
+      "loss": 0.4921,
+      "step": 59640
+    },
+    {
+      "epoch": 159.06666666666666,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002954134430046666,
+      "loss": 0.4794,
+      "step": 59650
+    },
+    {
+      "epoch": 159.09333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002954118999805461,
+      "loss": 0.4733,
+      "step": 59660
+    },
+    {
+      "epoch": 159.12,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029541035670094587,
+      "loss": 0.4711,
+      "step": 59670
+    },
+    {
+      "epoch": 159.14666666666668,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002954088131658687,
+      "loss": 0.4789,
+      "step": 59680
+    },
+    {
+      "epoch": 159.17333333333335,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029540726937531716,
+      "loss": 0.4701,
+      "step": 59690
+    },
+    {
+      "epoch": 159.2,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002954057253292941,
+      "loss": 0.4682,
+      "step": 59700
+    },
+    {
+      "epoch": 159.22666666666666,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002954041810278022,
+      "loss": 0.4653,
+      "step": 59710
+    },
+    {
+      "epoch": 159.25333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002954026364708441,
+      "loss": 0.4648,
+      "step": 59720
+    },
+    {
+      "epoch": 159.28,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029540109165842254,
+      "loss": 0.4789,
+      "step": 59730
+    },
+    {
+      "epoch": 159.30666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002953995465905403,
+      "loss": 0.4599,
+      "step": 59740
+    },
+    {
+      "epoch": 159.33333333333334,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029539800126720004,
+      "loss": 0.4754,
+      "step": 59750
+    },
+    {
+      "epoch": 159.36,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002953964556884045,
+      "loss": 0.4769,
+      "step": 59760
+    },
+    {
+      "epoch": 159.38666666666666,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002953949098541564,
+      "loss": 0.4693,
+      "step": 59770
+    },
+    {
+      "epoch": 159.41333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002953933637644584,
+      "loss": 0.4733,
+      "step": 59780
+    },
+    {
+      "epoch": 159.44,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029539181741931327,
+      "loss": 0.4768,
+      "step": 59790
+    },
+    {
+      "epoch": 159.46666666666667,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029539027081872365,
+      "loss": 0.4863,
+      "step": 59800
+    },
+    {
+      "epoch": 159.49333333333334,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002953887239626924,
+      "loss": 0.4777,
+      "step": 59810
+    },
+    {
+      "epoch": 159.52,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002953871768512221,
+      "loss": 0.4722,
+      "step": 59820
+    },
+    {
+      "epoch": 159.54666666666665,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029538562948431554,
+      "loss": 0.4689,
+      "step": 59830
+    },
+    {
+      "epoch": 159.57333333333332,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029538408186197547,
+      "loss": 0.4677,
+      "step": 59840
+    },
+    {
+      "epoch": 159.6,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029538253398420447,
+      "loss": 0.4611,
+      "step": 59850
+    },
+    {
+      "epoch": 159.62666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029538098585100547,
+      "loss": 0.4489,
+      "step": 59860
+    },
+    {
+      "epoch": 159.65333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029537943746238096,
+      "loss": 0.4682,
+      "step": 59870
+    },
+    {
+      "epoch": 159.68,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002953778888183338,
+      "loss": 0.4598,
+      "step": 59880
+    },
+    {
+      "epoch": 159.70666666666668,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029537633991886673,
+      "loss": 0.4603,
+      "step": 59890
+    },
+    {
+      "epoch": 159.73333333333332,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002953747907639824,
+      "loss": 0.4789,
+      "step": 59900
+    },
+    {
+      "epoch": 159.76,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002953732413536835,
+      "loss": 0.4797,
+      "step": 59910
+    },
+    {
+      "epoch": 159.78666666666666,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00029537169168797285,
+      "loss": 0.4787,
+      "step": 59920
+    },
+    {
+      "epoch": 159.81333333333333,
+      "grad_norm": 0.5,
+      "learning_rate": 0.0002953701417668531,
+      "loss": 0.4639,
+      "step": 59930
+    },
+    {
+      "epoch": 159.84,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002953685915903271,
+      "loss": 0.4702,
+      "step": 59940
+    },
+    {
+      "epoch": 159.86666666666667,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029536704115839743,
+      "loss": 0.4686,
+      "step": 59950
+    },
+    {
+      "epoch": 159.89333333333335,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002953654904710668,
+      "loss": 0.4601,
+      "step": 59960
+    },
+    {
+      "epoch": 159.92,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029536393952833803,
+      "loss": 0.4706,
+      "step": 59970
+    },
+    {
+      "epoch": 159.94666666666666,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002953623883302139,
+      "loss": 0.4632,
+      "step": 59980
+    },
+    {
+      "epoch": 159.97333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002953608368766969,
+      "loss": 0.4743,
+      "step": 59990
+    },
+    {
+      "epoch": 160.0,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029535928516779,
+      "loss": 0.464,
+      "step": 60000
+    },
+    {
+      "epoch": 160.0,
+      "eval_loss": 0.4785791039466858,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3856,
+      "eval_samples_per_second": 1.541,
+      "eval_steps_per_second": 0.096,
+      "step": 60000
+    },
+    {
+      "epoch": 160.02666666666667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002953577332034958,
+      "loss": 0.4835,
+      "step": 60010
+    },
+    {
+      "epoch": 160.05333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.000295356180983817,
+      "loss": 0.4887,
+      "step": 60020
+    },
+    {
+      "epoch": 160.08,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029535462850875644,
+      "loss": 0.4753,
+      "step": 60030
+    },
+    {
+      "epoch": 160.10666666666665,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029535307577831677,
+      "loss": 0.4709,
+      "step": 60040
+    },
+    {
+      "epoch": 160.13333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029535152279250076,
+      "loss": 0.473,
+      "step": 60050
+    },
+    {
+      "epoch": 160.16,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002953499695513111,
+      "loss": 0.4761,
+      "step": 60060
+    },
+    {
+      "epoch": 160.18666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029534841605475057,
+      "loss": 0.4696,
+      "step": 60070
+    },
+    {
+      "epoch": 160.21333333333334,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002953468623028218,
+      "loss": 0.4635,
+      "step": 60080
+    },
+    {
+      "epoch": 160.24,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029534530829552764,
+      "loss": 0.4677,
+      "step": 60090
+    },
+    {
+      "epoch": 160.26666666666668,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002953437540328707,
+      "loss": 0.4722,
+      "step": 60100
+    },
+    {
+      "epoch": 160.29333333333332,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002953421995148539,
+      "loss": 0.4656,
+      "step": 60110
+    },
+    {
+      "epoch": 160.32,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002953406447414797,
+      "loss": 0.4696,
+      "step": 60120
+    },
+    {
+      "epoch": 160.34666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029533908971275105,
+      "loss": 0.4778,
+      "step": 60130
+    },
+    {
+      "epoch": 160.37333333333333,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002953375344286706,
+      "loss": 0.4738,
+      "step": 60140
+    },
+    {
+      "epoch": 160.4,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029533597888924107,
+      "loss": 0.4701,
+      "step": 60150
+    },
+    {
+      "epoch": 160.42666666666668,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029533442309446524,
+      "loss": 0.4765,
+      "step": 60160
+    },
+    {
+      "epoch": 160.45333333333335,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002953328670443458,
+      "loss": 0.4785,
+      "step": 60170
+    },
+    {
+      "epoch": 160.48,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029533131073888555,
+      "loss": 0.4851,
+      "step": 60180
+    },
+    {
+      "epoch": 160.50666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002953297541780871,
+      "loss": 0.4744,
+      "step": 60190
+    },
+    {
+      "epoch": 160.53333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029532819736195336,
+      "loss": 0.4679,
+      "step": 60200
+    },
+    {
+      "epoch": 160.56,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002953266402904869,
+      "loss": 0.4696,
+      "step": 60210
+    },
+    {
+      "epoch": 160.58666666666667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002953250829636906,
+      "loss": 0.4654,
+      "step": 60220
+    },
+    {
+      "epoch": 160.61333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029532352538156703,
+      "loss": 0.4511,
+      "step": 60230
+    },
+    {
+      "epoch": 160.64,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000295321967544119,
+      "loss": 0.4559,
+      "step": 60240
+    },
+    {
+      "epoch": 160.66666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002953204094513493,
+      "loss": 0.4722,
+      "step": 60250
+    },
+    {
+      "epoch": 160.69333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029531885110326066,
+      "loss": 0.4548,
+      "step": 60260
+    },
+    {
+      "epoch": 160.72,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029531729249985574,
+      "loss": 0.4718,
+      "step": 60270
+    },
+    {
+      "epoch": 160.74666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029531573364113736,
+      "loss": 0.4766,
+      "step": 60280
+    },
+    {
+      "epoch": 160.77333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029531417452710823,
+      "loss": 0.4833,
+      "step": 60290
+    },
+    {
+      "epoch": 160.8,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029531261515777107,
+      "loss": 0.4702,
+      "step": 60300
+    },
+    {
+      "epoch": 160.82666666666665,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029531105553312864,
+      "loss": 0.4654,
+      "step": 60310
+    },
+    {
+      "epoch": 160.85333333333332,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029530949565318365,
+      "loss": 0.4733,
+      "step": 60320
+    },
+    {
+      "epoch": 160.88,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029530793551793886,
+      "loss": 0.4623,
+      "step": 60330
+    },
+    {
+      "epoch": 160.90666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002953063751273971,
+      "loss": 0.4649,
+      "step": 60340
+    },
+    {
+      "epoch": 160.93333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002953048144815609,
+      "loss": 0.4665,
+      "step": 60350
+    },
+    {
+      "epoch": 160.96,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029530325358043325,
+      "loss": 0.4645,
+      "step": 60360
+    },
+    {
+      "epoch": 160.98666666666668,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002953016924240167,
+      "loss": 0.478,
+      "step": 60370
+    },
+    {
+      "epoch": 161.0,
+      "eval_loss": 0.4802950620651245,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9741,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 0.1,
+      "step": 60375
+    },
+    {
+      "epoch": 161.01333333333332,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029530013101231403,
+      "loss": 0.467,
+      "step": 60380
+    },
+    {
+      "epoch": 161.04,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002952985693453281,
+      "loss": 0.4919,
+      "step": 60390
+    },
+    {
+      "epoch": 161.06666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029529700742306154,
+      "loss": 0.4782,
+      "step": 60400
+    },
+    {
+      "epoch": 161.09333333333333,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002952954452455171,
+      "loss": 0.4737,
+      "step": 60410
+    },
+    {
+      "epoch": 161.12,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0002952938828126976,
+      "loss": 0.4706,
+      "step": 60420
+    },
+    {
+      "epoch": 161.14666666666668,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029529232012460576,
+      "loss": 0.4786,
+      "step": 60430
+    },
+    {
+      "epoch": 161.17333333333335,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002952907571812442,
+      "loss": 0.4694,
+      "step": 60440
+    },
+    {
+      "epoch": 161.2,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029528919398261584,
+      "loss": 0.4683,
+      "step": 60450
+    },
+    {
+      "epoch": 161.22666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002952876305287233,
+      "loss": 0.4657,
+      "step": 60460
+    },
+    {
+      "epoch": 161.25333333333333,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029528606681956943,
+      "loss": 0.4653,
+      "step": 60470
+    },
+    {
+      "epoch": 161.28,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002952845028551569,
+      "loss": 0.4799,
+      "step": 60480
+    },
+    {
+      "epoch": 161.30666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002952829386354885,
+      "loss": 0.4603,
+      "step": 60490
+    },
+    {
+      "epoch": 161.33333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029528137416056694,
+      "loss": 0.4755,
+      "step": 60500
+    },
+    {
+      "epoch": 161.36,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029527980943039507,
+      "loss": 0.4778,
+      "step": 60510
+    },
+    {
+      "epoch": 161.38666666666666,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002952782444449755,
+      "loss": 0.4697,
+      "step": 60520
+    },
+    {
+      "epoch": 161.41333333333333,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029527667920431104,
+      "loss": 0.4735,
+      "step": 60530
+    },
+    {
+      "epoch": 161.44,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002952751137084044,
+      "loss": 0.4766,
+      "step": 60540
+    },
+    {
+      "epoch": 161.46666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029527354795725843,
+      "loss": 0.4856,
+      "step": 60550
+    },
+    {
+      "epoch": 161.49333333333334,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029527198195087586,
+      "loss": 0.4773,
+      "step": 60560
+    },
+    {
+      "epoch": 161.52,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0002952704156892593,
+      "loss": 0.4719,
+      "step": 60570
+    },
+    {
+      "epoch": 161.54666666666665,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002952688491724117,
+      "loss": 0.4688,
+      "step": 60580
+    },
+    {
+      "epoch": 161.57333333333332,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029526728240033564,
+      "loss": 0.4677,
+      "step": 60590
+    },
+    {
+      "epoch": 161.6,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.000295265715373034,
+      "loss": 0.4616,
+      "step": 60600
+    },
+    {
+      "epoch": 161.62666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002952641480905095,
+      "loss": 0.4485,
+      "step": 60610
+    },
+    {
+      "epoch": 161.65333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029526258055276484,
+      "loss": 0.4675,
+      "step": 60620
+    },
+    {
+      "epoch": 161.68,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002952610127598029,
+      "loss": 0.4588,
+      "step": 60630
+    },
+    {
+      "epoch": 161.70666666666668,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002952594447116262,
+      "loss": 0.4598,
+      "step": 60640
+    },
+    {
+      "epoch": 161.73333333333332,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002952578764082377,
+      "loss": 0.4788,
+      "step": 60650
+    },
+    {
+      "epoch": 161.76,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029525630784964013,
+      "loss": 0.4797,
+      "step": 60660
+    },
+    {
+      "epoch": 161.78666666666666,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029525473903583617,
+      "loss": 0.4789,
+      "step": 60670
+    },
+    {
+      "epoch": 161.81333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002952531699668287,
+      "loss": 0.4635,
+      "step": 60680
+    },
+    {
+      "epoch": 161.84,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002952516006426203,
+      "loss": 0.4697,
+      "step": 60690
+    },
+    {
+      "epoch": 161.86666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029525003106321384,
+      "loss": 0.4697,
+      "step": 60700
+    },
+    {
+      "epoch": 161.89333333333335,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002952484612286121,
+      "loss": 0.4609,
+      "step": 60710
+    },
+    {
+      "epoch": 161.92,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029524689113881783,
+      "loss": 0.4706,
+      "step": 60720
+    },
+    {
+      "epoch": 161.94666666666666,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0002952453207938337,
+      "loss": 0.4627,
+      "step": 60730
+    },
+    {
+      "epoch": 161.97333333333333,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002952437501936625,
+      "loss": 0.4741,
+      "step": 60740
+    },
+    {
+      "epoch": 162.0,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029524217933830704,
+      "loss": 0.4638,
+      "step": 60750
+    },
+    {
+      "epoch": 162.0,
+      "eval_loss": 0.4797598421573639,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5891,
+      "eval_samples_per_second": 1.511,
+      "eval_steps_per_second": 0.094,
+      "step": 60750
+    },
+    {
+      "epoch": 162.02666666666667,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029524060822777007,
+      "loss": 0.4828,
+      "step": 60760
+    },
+    {
+      "epoch": 162.05333333333334,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002952390368620543,
+      "loss": 0.489,
+      "step": 60770
+    },
+    {
+      "epoch": 162.08,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029523746524116255,
+      "loss": 0.4759,
+      "step": 60780
+    },
+    {
+      "epoch": 162.10666666666665,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002952358933650976,
+      "loss": 0.4707,
+      "step": 60790
+    },
+    {
+      "epoch": 162.13333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002952343212338621,
+      "loss": 0.4733,
+      "step": 60800
+    },
+    {
+      "epoch": 162.16,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029523274884745896,
+      "loss": 0.4769,
+      "step": 60810
+    },
+    {
+      "epoch": 162.18666666666667,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029523117620589077,
+      "loss": 0.4692,
+      "step": 60820
+    },
+    {
+      "epoch": 162.21333333333334,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00029522960330916043,
+      "loss": 0.4638,
+      "step": 60830
+    },
+    {
+      "epoch": 162.24,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002952280301572707,
+      "loss": 0.4677,
+      "step": 60840
+    },
+    {
+      "epoch": 162.26666666666668,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002952264567502242,
+      "loss": 0.4718,
+      "step": 60850
+    },
+    {
+      "epoch": 162.29333333333332,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029522488308802395,
+      "loss": 0.4667,
+      "step": 60860
+    },
+    {
+      "epoch": 162.32,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00029522330917067246,
+      "loss": 0.4697,
+      "step": 60870
+    },
+    {
+      "epoch": 162.34666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029522173499817265,
+      "loss": 0.4771,
+      "step": 60880
+    },
+    {
+      "epoch": 162.37333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029522016057052715,
+      "loss": 0.4734,
+      "step": 60890
+    },
+    {
+      "epoch": 162.4,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029521858588773886,
+      "loss": 0.4701,
+      "step": 60900
+    },
+    {
+      "epoch": 162.42666666666668,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002952170109498105,
+      "loss": 0.4761,
+      "step": 60910
+    },
+    {
+      "epoch": 162.45333333333335,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002952154357567449,
+      "loss": 0.4782,
+      "step": 60920
+    },
+    {
+      "epoch": 162.48,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002952138603085447,
+      "loss": 0.4846,
+      "step": 60930
+    },
+    {
+      "epoch": 162.50666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002952122846052127,
+      "loss": 0.4737,
+      "step": 60940
+    },
+    {
+      "epoch": 162.53333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002952107086467518,
+      "loss": 0.4687,
+      "step": 60950
+    },
+    {
+      "epoch": 162.56,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002952091324331646,
+      "loss": 0.4694,
+      "step": 60960
+    },
+    {
+      "epoch": 162.58666666666667,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002952075559644539,
+      "loss": 0.4649,
+      "step": 60970
+    },
+    {
+      "epoch": 162.61333333333334,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002952059792406225,
+      "loss": 0.4513,
+      "step": 60980
+    },
+    {
+      "epoch": 162.64,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0002952044022616732,
+      "loss": 0.4561,
+      "step": 60990
+    },
+    {
+      "epoch": 162.66666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029520282502760884,
+      "loss": 0.4721,
+      "step": 61000
+    },
+    {
+      "epoch": 162.69333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000295201247538432,
+      "loss": 0.4548,
+      "step": 61010
+    },
+    {
+      "epoch": 162.72,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029519966979414555,
+      "loss": 0.4715,
+      "step": 61020
+    },
+    {
+      "epoch": 162.74666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029519809179475234,
+      "loss": 0.4761,
+      "step": 61030
+    },
+    {
+      "epoch": 162.77333333333334,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.000295196513540255,
+      "loss": 0.4838,
+      "step": 61040
+    },
+    {
+      "epoch": 162.8,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029519493503065634,
+      "loss": 0.4706,
+      "step": 61050
+    },
+    {
+      "epoch": 162.82666666666665,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029519335626595924,
+      "loss": 0.465,
+      "step": 61060
+    },
+    {
+      "epoch": 162.85333333333332,
+      "grad_norm": 0.5,
+      "learning_rate": 0.00029519177724616634,
+      "loss": 0.4739,
+      "step": 61070
+    },
+    {
+      "epoch": 162.88,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029519019797128045,
+      "loss": 0.4616,
+      "step": 61080
+    },
+    {
+      "epoch": 162.90666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029518861844130445,
+      "loss": 0.465,
+      "step": 61090
+    },
+    {
+      "epoch": 162.93333333333334,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029518703865624095,
+      "loss": 0.466,
+      "step": 61100
+    },
+    {
+      "epoch": 162.96,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029518545861609286,
+      "loss": 0.4646,
+      "step": 61110
+    },
+    {
+      "epoch": 162.98666666666668,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029518387832086284,
+      "loss": 0.4775,
+      "step": 61120
+    },
+    {
+      "epoch": 163.0,
+      "eval_loss": 0.4788980185985565,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2111,
+      "eval_samples_per_second": 1.567,
+      "eval_steps_per_second": 0.098,
+      "step": 61125
+    },
+    {
+      "epoch": 163.01333333333332,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029518229777055377,
+      "loss": 0.4672,
+      "step": 61130
+    },
+    {
+      "epoch": 163.04,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029518071696516836,
+      "loss": 0.4917,
+      "step": 61140
+    },
+    {
+      "epoch": 163.06666666666666,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029517913590470943,
+      "loss": 0.4791,
+      "step": 61150
+    },
+    {
+      "epoch": 163.09333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029517755458917974,
+      "loss": 0.4738,
+      "step": 61160
+    },
+    {
+      "epoch": 163.12,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029517597301858205,
+      "loss": 0.4702,
+      "step": 61170
+    },
+    {
+      "epoch": 163.14666666666668,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002951743911929192,
+      "loss": 0.4788,
+      "step": 61180
+    },
+    {
+      "epoch": 163.17333333333335,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029517280911219387,
+      "loss": 0.4691,
+      "step": 61190
+    },
+    {
+      "epoch": 163.2,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002951712267764089,
+      "loss": 0.4677,
+      "step": 61200
+    },
+    {
+      "epoch": 163.22666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029516964418556705,
+      "loss": 0.4659,
+      "step": 61210
+    },
+    {
+      "epoch": 163.25333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002951680613396712,
+      "loss": 0.4648,
+      "step": 61220
+    },
+    {
+      "epoch": 163.28,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.000295166478238724,
+      "loss": 0.4787,
+      "step": 61230
+    },
+    {
+      "epoch": 163.30666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029516489488272824,
+      "loss": 0.46,
+      "step": 61240
+    },
+    {
+      "epoch": 163.33333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029516331127168674,
+      "loss": 0.4754,
+      "step": 61250
+    },
+    {
+      "epoch": 163.36,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029516172740560233,
+      "loss": 0.4773,
+      "step": 61260
+    },
+    {
+      "epoch": 163.38666666666666,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.00029516014328447775,
+      "loss": 0.4693,
+      "step": 61270
+    },
+    {
+      "epoch": 163.41333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029515855890831574,
+      "loss": 0.4734,
+      "step": 61280
+    },
+    {
+      "epoch": 163.44,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002951569742771191,
+      "loss": 0.4764,
+      "step": 61290
+    },
+    {
+      "epoch": 163.46666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002951553893908907,
+      "loss": 0.4862,
+      "step": 61300
+    },
+    {
+      "epoch": 163.49333333333334,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029515380424963324,
+      "loss": 0.4776,
+      "step": 61310
+    },
+    {
+      "epoch": 163.52,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002951522188533495,
+      "loss": 0.4714,
+      "step": 61320
+    },
+    {
+      "epoch": 163.54666666666665,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002951506332020423,
+      "loss": 0.4678,
+      "step": 61330
+    },
+    {
+      "epoch": 163.57333333333332,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029514904729571436,
+      "loss": 0.467,
+      "step": 61340
+    },
+    {
+      "epoch": 163.6,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002951474611343686,
+      "loss": 0.4608,
+      "step": 61350
+    },
+    {
+      "epoch": 163.62666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002951458747180077,
+      "loss": 0.4486,
+      "step": 61360
+    },
+    {
+      "epoch": 163.65333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002951442880466345,
+      "loss": 0.4681,
+      "step": 61370
+    },
+    {
+      "epoch": 163.68,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002951427011202518,
+      "loss": 0.459,
+      "step": 61380
+    },
+    {
+      "epoch": 163.70666666666668,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029514111393886226,
+      "loss": 0.4598,
+      "step": 61390
+    },
+    {
+      "epoch": 163.73333333333332,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002951395265024688,
+      "loss": 0.479,
+      "step": 61400
+    },
+    {
+      "epoch": 163.76,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0002951379388110742,
+      "loss": 0.4804,
+      "step": 61410
+    },
+    {
+      "epoch": 163.78666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002951363508646812,
+      "loss": 0.4786,
+      "step": 61420
+    },
+    {
+      "epoch": 163.81333333333333,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002951347626632926,
+      "loss": 0.4638,
+      "step": 61430
+    },
+    {
+      "epoch": 163.84,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002951331742069112,
+      "loss": 0.4693,
+      "step": 61440
+    },
+    {
+      "epoch": 163.86666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029513158549553976,
+      "loss": 0.4693,
+      "step": 61450
+    },
+    {
+      "epoch": 163.89333333333335,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029512999652918115,
+      "loss": 0.4603,
+      "step": 61460
+    },
+    {
+      "epoch": 163.92,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029512840730783814,
+      "loss": 0.4704,
+      "step": 61470
+    },
+    {
+      "epoch": 163.94666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029512681783151344,
+      "loss": 0.4632,
+      "step": 61480
+    },
+    {
+      "epoch": 163.97333333333333,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0002951252281002099,
+      "loss": 0.4749,
+      "step": 61490
+    },
+    {
+      "epoch": 164.0,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029512363811393035,
+      "loss": 0.4635,
+      "step": 61500
+    },
+    {
+      "epoch": 164.0,
+      "eval_loss": 0.4781702160835266,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4605,
+      "eval_samples_per_second": 1.53,
+      "eval_steps_per_second": 0.096,
+      "step": 61500
+    },
+    {
+      "epoch": 164.02666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029512204787267754,
+      "loss": 0.4834,
+      "step": 61510
+    },
+    {
+      "epoch": 164.05333333333334,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002951204573764542,
+      "loss": 0.4897,
+      "step": 61520
+    },
+    {
+      "epoch": 164.08,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002951188666252632,
+      "loss": 0.476,
+      "step": 61530
+    },
+    {
+      "epoch": 164.10666666666665,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002951172756191074,
+      "loss": 0.4711,
+      "step": 61540
+    },
+    {
+      "epoch": 164.13333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029511568435798953,
+      "loss": 0.4721,
+      "step": 61550
+    },
+    {
+      "epoch": 164.16,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029511409284191235,
+      "loss": 0.4766,
+      "step": 61560
+    },
+    {
+      "epoch": 164.18666666666667,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002951125010708787,
+      "loss": 0.4696,
+      "step": 61570
+    },
+    {
+      "epoch": 164.21333333333334,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029511090904489135,
+      "loss": 0.4634,
+      "step": 61580
+    },
+    {
+      "epoch": 164.24,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029510931676395306,
+      "loss": 0.4684,
+      "step": 61590
+    },
+    {
+      "epoch": 164.26666666666668,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029510772422806675,
+      "loss": 0.4721,
+      "step": 61600
+    },
+    {
+      "epoch": 164.29333333333332,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002951061314372351,
+      "loss": 0.4657,
+      "step": 61610
+    },
+    {
+      "epoch": 164.32,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029510453839146094,
+      "loss": 0.4693,
+      "step": 61620
+    },
+    {
+      "epoch": 164.34666666666666,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029510294509074714,
+      "loss": 0.4777,
+      "step": 61630
+    },
+    {
+      "epoch": 164.37333333333333,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002951013515350964,
+      "loss": 0.4738,
+      "step": 61640
+    },
+    {
+      "epoch": 164.4,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029509975772451155,
+      "loss": 0.4694,
+      "step": 61650
+    },
+    {
+      "epoch": 164.42666666666668,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029509816365899547,
+      "loss": 0.4757,
+      "step": 61660
+    },
+    {
+      "epoch": 164.45333333333335,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002950965693385508,
+      "loss": 0.4787,
+      "step": 61670
+    },
+    {
+      "epoch": 164.48,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002950949747631805,
+      "loss": 0.4843,
+      "step": 61680
+    },
+    {
+      "epoch": 164.50666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002950933799328873,
+      "loss": 0.4745,
+      "step": 61690
+    },
+    {
+      "epoch": 164.53333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.000295091784847674,
+      "loss": 0.4684,
+      "step": 61700
+    },
+    {
+      "epoch": 164.56,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00029509018950754336,
+      "loss": 0.4693,
+      "step": 61710
+    },
+    {
+      "epoch": 164.58666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002950885939124983,
+      "loss": 0.466,
+      "step": 61720
+    },
+    {
+      "epoch": 164.61333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002950869980625415,
+      "loss": 0.4507,
+      "step": 61730
+    },
+    {
+      "epoch": 164.64,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029508540195767586,
+      "loss": 0.4562,
+      "step": 61740
+    },
+    {
+      "epoch": 164.66666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002950838055979041,
+      "loss": 0.4721,
+      "step": 61750
+    },
+    {
+      "epoch": 164.69333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002950822089832291,
+      "loss": 0.4543,
+      "step": 61760
+    },
+    {
+      "epoch": 164.72,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029508061211365363,
+      "loss": 0.4716,
+      "step": 61770
+    },
+    {
+      "epoch": 164.74666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029507901498918054,
+      "loss": 0.4759,
+      "step": 61780
+    },
+    {
+      "epoch": 164.77333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029507741760981254,
+      "loss": 0.4826,
+      "step": 61790
+    },
+    {
+      "epoch": 164.8,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0002950758199755525,
+      "loss": 0.4707,
+      "step": 61800
+    },
+    {
+      "epoch": 164.82666666666665,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029507422208640324,
+      "loss": 0.4646,
+      "step": 61810
+    },
+    {
+      "epoch": 164.85333333333332,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002950726239423675,
+      "loss": 0.4741,
+      "step": 61820
+    },
+    {
+      "epoch": 164.88,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029507102554344816,
+      "loss": 0.462,
+      "step": 61830
+    },
+    {
+      "epoch": 164.90666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029506942688964805,
+      "loss": 0.4644,
+      "step": 61840
+    },
+    {
+      "epoch": 164.93333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002950678279809699,
+      "loss": 0.466,
+      "step": 61850
+    },
+    {
+      "epoch": 164.96,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002950662288174165,
+      "loss": 0.4642,
+      "step": 61860
+    },
+    {
+      "epoch": 164.98666666666668,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029506462939899076,
+      "loss": 0.478,
+      "step": 61870
+    },
+    {
+      "epoch": 165.0,
+      "eval_loss": 0.4781385660171509,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.869,
+      "eval_samples_per_second": 1.621,
+      "eval_steps_per_second": 0.101,
+      "step": 61875
+    },
+    {
+      "epoch": 165.01333333333332,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029506302972569546,
+      "loss": 0.4677,
+      "step": 61880
+    },
+    {
+      "epoch": 165.04,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029506142979753334,
+      "loss": 0.4917,
+      "step": 61890
+    },
+    {
+      "epoch": 165.06666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029505982961450727,
+      "loss": 0.4791,
+      "step": 61900
+    },
+    {
+      "epoch": 165.09333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029505822917662007,
+      "loss": 0.4737,
+      "step": 61910
+    },
+    {
+      "epoch": 165.12,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002950566284838745,
+      "loss": 0.4702,
+      "step": 61920
+    },
+    {
+      "epoch": 165.14666666666668,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029505502753627346,
+      "loss": 0.4785,
+      "step": 61930
+    },
+    {
+      "epoch": 165.17333333333335,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029505342633381963,
+      "loss": 0.4697,
+      "step": 61940
+    },
+    {
+      "epoch": 165.2,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.000295051824876516,
+      "loss": 0.4672,
+      "step": 61950
+    },
+    {
+      "epoch": 165.22666666666666,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029505022316436523,
+      "loss": 0.4657,
+      "step": 61960
+    },
+    {
+      "epoch": 165.25333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002950486211973702,
+      "loss": 0.4657,
+      "step": 61970
+    },
+    {
+      "epoch": 165.28,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002950470189755337,
+      "loss": 0.4794,
+      "step": 61980
+    },
+    {
+      "epoch": 165.30666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029504541649885864,
+      "loss": 0.4595,
+      "step": 61990
+    },
+    {
+      "epoch": 165.33333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029504381376734766,
+      "loss": 0.4756,
+      "step": 62000
+    },
+    {
+      "epoch": 165.36,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.00029504221078100373,
+      "loss": 0.4774,
+      "step": 62010
+    },
+    {
+      "epoch": 165.38666666666666,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002950406075398296,
+      "loss": 0.4691,
+      "step": 62020
+    },
+    {
+      "epoch": 165.41333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.000295039004043828,
+      "loss": 0.4737,
+      "step": 62030
+    },
+    {
+      "epoch": 165.44,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002950374002930019,
+      "loss": 0.4766,
+      "step": 62040
+    },
+    {
+      "epoch": 165.46666666666667,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029503579628735417,
+      "loss": 0.4857,
+      "step": 62050
+    },
+    {
+      "epoch": 165.49333333333334,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029503419202688735,
+      "loss": 0.4776,
+      "step": 62060
+    },
+    {
+      "epoch": 165.52,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029503258751160456,
+      "loss": 0.4721,
+      "step": 62070
+    },
+    {
+      "epoch": 165.54666666666665,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002950309827415084,
+      "loss": 0.4689,
+      "step": 62080
+    },
+    {
+      "epoch": 165.57333333333332,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002950293777166018,
+      "loss": 0.4681,
+      "step": 62090
+    },
+    {
+      "epoch": 165.6,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029502777243688757,
+      "loss": 0.4615,
+      "step": 62100
+    },
+    {
+      "epoch": 165.62666666666667,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002950261669023685,
+      "loss": 0.4485,
+      "step": 62110
+    },
+    {
+      "epoch": 165.65333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002950245611130474,
+      "loss": 0.468,
+      "step": 62120
+    },
+    {
+      "epoch": 165.68,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029502295506892714,
+      "loss": 0.4596,
+      "step": 62130
+    },
+    {
+      "epoch": 165.70666666666668,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029502134877001056,
+      "loss": 0.4594,
+      "step": 62140
+    },
+    {
+      "epoch": 165.73333333333332,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002950197422163004,
+      "loss": 0.4794,
+      "step": 62150
+    },
+    {
+      "epoch": 165.76,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00029501813540779953,
+      "loss": 0.48,
+      "step": 62160
+    },
+    {
+      "epoch": 165.78666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029501652834451077,
+      "loss": 0.4787,
+      "step": 62170
+    },
+    {
+      "epoch": 165.81333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029501492102643694,
+      "loss": 0.4634,
+      "step": 62180
+    },
+    {
+      "epoch": 165.84,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029501331345358086,
+      "loss": 0.47,
+      "step": 62190
+    },
+    {
+      "epoch": 165.86666666666667,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029501170562594534,
+      "loss": 0.469,
+      "step": 62200
+    },
+    {
+      "epoch": 165.89333333333335,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029501009754353326,
+      "loss": 0.4599,
+      "step": 62210
+    },
+    {
+      "epoch": 165.92,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002950084892063474,
+      "loss": 0.471,
+      "step": 62220
+    },
+    {
+      "epoch": 165.94666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002950068806143906,
+      "loss": 0.4632,
+      "step": 62230
+    },
+    {
+      "epoch": 165.97333333333333,
+      "grad_norm": 0.5,
+      "learning_rate": 0.00029500527176766565,
+      "loss": 0.474,
+      "step": 62240
+    },
+    {
+      "epoch": 166.0,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0002950036626661754,
+      "loss": 0.4636,
+      "step": 62250
+    },
+    {
+      "epoch": 166.0,
+      "eval_loss": 0.4782837927341461,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.1462,
+      "eval_samples_per_second": 1.577,
+      "eval_steps_per_second": 0.099,
+      "step": 62250
+    },
+    {
+      "epoch": 166.02666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002950020533099227,
+      "loss": 0.4832,
+      "step": 62260
+    },
+    {
+      "epoch": 166.05333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029500044369891036,
+      "loss": 0.4889,
+      "step": 62270
+    },
+    {
+      "epoch": 166.08,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029499883383314124,
+      "loss": 0.4757,
+      "step": 62280
+    },
+    {
+      "epoch": 166.10666666666665,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002949972237126181,
+      "loss": 0.4707,
+      "step": 62290
+    },
+    {
+      "epoch": 166.13333333333333,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.00029499561333734385,
+      "loss": 0.4729,
+      "step": 62300
+    },
+    {
+      "epoch": 166.16,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029499400270732127,
+      "loss": 0.477,
+      "step": 62310
+    },
+    {
+      "epoch": 166.18666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029499239182255313,
+      "loss": 0.4692,
+      "step": 62320
+    },
+    {
+      "epoch": 166.21333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029499078068304243,
+      "loss": 0.4636,
+      "step": 62330
+    },
+    {
+      "epoch": 166.24,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029498916928879186,
+      "loss": 0.4683,
+      "step": 62340
+    },
+    {
+      "epoch": 166.26666666666668,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.00029498755763980426,
+      "loss": 0.4723,
+      "step": 62350
+    },
+    {
+      "epoch": 166.29333333333332,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002949859457360825,
+      "loss": 0.466,
+      "step": 62360
+    },
+    {
+      "epoch": 166.32,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029498433357762937,
+      "loss": 0.4685,
+      "step": 62370
+    },
+    {
+      "epoch": 166.34666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029498272116444777,
+      "loss": 0.4769,
+      "step": 62380
+    },
+    {
+      "epoch": 166.37333333333333,
+      "grad_norm": 0.5,
+      "learning_rate": 0.00029498110849654046,
+      "loss": 0.4736,
+      "step": 62390
+    },
+    {
+      "epoch": 166.4,
+      "grad_norm": 0.5,
+      "learning_rate": 0.00029497949557391037,
+      "loss": 0.4704,
+      "step": 62400
+    },
+    {
+      "epoch": 166.42666666666668,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029497788239656026,
+      "loss": 0.4759,
+      "step": 62410
+    },
+    {
+      "epoch": 166.45333333333335,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.000294976268964493,
+      "loss": 0.4779,
+      "step": 62420
+    },
+    {
+      "epoch": 166.48,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029497465527771136,
+      "loss": 0.4849,
+      "step": 62430
+    },
+    {
+      "epoch": 166.50666666666666,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002949730413362182,
+      "loss": 0.474,
+      "step": 62440
+    },
+    {
+      "epoch": 166.53333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029497142714001644,
+      "loss": 0.4684,
+      "step": 62450
+    },
+    {
+      "epoch": 166.56,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002949698126891088,
+      "loss": 0.4691,
+      "step": 62460
+    },
+    {
+      "epoch": 166.58666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002949681979834982,
+      "loss": 0.4654,
+      "step": 62470
+    },
+    {
+      "epoch": 166.61333333333334,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002949665830231874,
+      "loss": 0.4515,
+      "step": 62480
+    },
+    {
+      "epoch": 166.64,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029496496780817934,
+      "loss": 0.4559,
+      "step": 62490
+    },
+    {
+      "epoch": 166.66666666666666,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029496335233847673,
+      "loss": 0.4713,
+      "step": 62500
+    },
+    {
+      "epoch": 166.69333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029496173661408255,
+      "loss": 0.4549,
+      "step": 62510
+    },
+    {
+      "epoch": 166.72,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029496012063499953,
+      "loss": 0.4721,
+      "step": 62520
+    },
+    {
+      "epoch": 166.74666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002949585044012305,
+      "loss": 0.4762,
+      "step": 62530
+    },
+    {
+      "epoch": 166.77333333333334,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029495688791277843,
+      "loss": 0.484,
+      "step": 62540
+    },
+    {
+      "epoch": 166.8,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029495527116964605,
+      "loss": 0.4708,
+      "step": 62550
+    },
+    {
+      "epoch": 166.82666666666665,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002949536541718362,
+      "loss": 0.4649,
+      "step": 62560
+    },
+    {
+      "epoch": 166.85333333333332,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002949520369193517,
+      "loss": 0.4737,
+      "step": 62570
+    },
+    {
+      "epoch": 166.88,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002949504194121955,
+      "loss": 0.4617,
+      "step": 62580
+    },
+    {
+      "epoch": 166.90666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002949488016503704,
+      "loss": 0.4645,
+      "step": 62590
+    },
+    {
+      "epoch": 166.93333333333334,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0002949471836338792,
+      "loss": 0.4665,
+      "step": 62600
+    },
+    {
+      "epoch": 166.96,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029494556536272475,
+      "loss": 0.4642,
+      "step": 62610
+    },
+    {
+      "epoch": 166.98666666666668,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002949439468369099,
+      "loss": 0.478,
+      "step": 62620
+    },
+    {
+      "epoch": 167.0,
+      "eval_loss": 0.4798828363418579,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3858,
+      "eval_samples_per_second": 1.541,
+      "eval_steps_per_second": 0.096,
+      "step": 62625
+    },
+    {
+      "epoch": 167.01333333333332,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029494232805643756,
+      "loss": 0.4666,
+      "step": 62630
+    },
+    {
+      "epoch": 167.04,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029494070902131046,
+      "loss": 0.4911,
+      "step": 62640
+    },
+    {
+      "epoch": 167.06666666666666,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029493908973153153,
+      "loss": 0.4786,
+      "step": 62650
+    },
+    {
+      "epoch": 167.09333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029493747018710355,
+      "loss": 0.4734,
+      "step": 62660
+    },
+    {
+      "epoch": 167.12,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029493585038802944,
+      "loss": 0.47,
+      "step": 62670
+    },
+    {
+      "epoch": 167.14666666666668,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029493423033431196,
+      "loss": 0.4781,
+      "step": 62680
+    },
+    {
+      "epoch": 167.17333333333335,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029493261002595404,
+      "loss": 0.4697,
+      "step": 62690
+    },
+    {
+      "epoch": 167.2,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002949309894629585,
+      "loss": 0.4679,
+      "step": 62700
+    },
+    {
+      "epoch": 167.22666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002949293686453281,
+      "loss": 0.4654,
+      "step": 62710
+    },
+    {
+      "epoch": 167.25333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029492774757306585,
+      "loss": 0.4655,
+      "step": 62720
+    },
+    {
+      "epoch": 167.28,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029492612624617447,
+      "loss": 0.4787,
+      "step": 62730
+    },
+    {
+      "epoch": 167.30666666666667,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0002949245046646569,
+      "loss": 0.46,
+      "step": 62740
+    },
+    {
+      "epoch": 167.33333333333334,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002949228828285159,
+      "loss": 0.4756,
+      "step": 62750
+    },
+    {
+      "epoch": 167.36,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002949212607377543,
+      "loss": 0.4771,
+      "step": 62760
+    },
+    {
+      "epoch": 167.38666666666666,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029491963839237514,
+      "loss": 0.4687,
+      "step": 62770
+    },
+    {
+      "epoch": 167.41333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029491801579238105,
+      "loss": 0.4738,
+      "step": 62780
+    },
+    {
+      "epoch": 167.44,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029491639293777503,
+      "loss": 0.4763,
+      "step": 62790
+    },
+    {
+      "epoch": 167.46666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002949147698285598,
+      "loss": 0.4856,
+      "step": 62800
+    },
+    {
+      "epoch": 167.49333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029491314646473834,
+      "loss": 0.4775,
+      "step": 62810
+    },
+    {
+      "epoch": 167.52,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002949115228463134,
+      "loss": 0.4713,
+      "step": 62820
+    },
+    {
+      "epoch": 167.54666666666665,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.00029490989897328795,
+      "loss": 0.4691,
+      "step": 62830
+    },
+    {
+      "epoch": 167.57333333333332,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002949082748456647,
+      "loss": 0.4672,
+      "step": 62840
+    },
+    {
+      "epoch": 167.6,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002949066504634466,
+      "loss": 0.4614,
+      "step": 62850
+    },
+    {
+      "epoch": 167.62666666666667,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00029490502582663644,
+      "loss": 0.4483,
+      "step": 62860
+    },
+    {
+      "epoch": 167.65333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002949034009352372,
+      "loss": 0.4682,
+      "step": 62870
+    },
+    {
+      "epoch": 167.68,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002949017757892516,
+      "loss": 0.4595,
+      "step": 62880
+    },
+    {
+      "epoch": 167.70666666666668,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029490015038868253,
+      "loss": 0.4602,
+      "step": 62890
+    },
+    {
+      "epoch": 167.73333333333332,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029489852473353286,
+      "loss": 0.4788,
+      "step": 62900
+    },
+    {
+      "epoch": 167.76,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002948968988238055,
+      "loss": 0.4801,
+      "step": 62910
+    },
+    {
+      "epoch": 167.78666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002948952726595032,
+      "loss": 0.4781,
+      "step": 62920
+    },
+    {
+      "epoch": 167.81333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002948936462406289,
+      "loss": 0.4641,
+      "step": 62930
+    },
+    {
+      "epoch": 167.84,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002948920195671854,
+      "loss": 0.4694,
+      "step": 62940
+    },
+    {
+      "epoch": 167.86666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002948903926391756,
+      "loss": 0.4686,
+      "step": 62950
+    },
+    {
+      "epoch": 167.89333333333335,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002948887654566023,
+      "loss": 0.4605,
+      "step": 62960
+    },
+    {
+      "epoch": 167.92,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029488713801946844,
+      "loss": 0.4704,
+      "step": 62970
+    },
+    {
+      "epoch": 167.94666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029488551032777683,
+      "loss": 0.463,
+      "step": 62980
+    },
+    {
+      "epoch": 167.97333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002948838823815303,
+      "loss": 0.4746,
+      "step": 62990
+    },
+    {
+      "epoch": 168.0,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029488225418073186,
+      "loss": 0.4629,
+      "step": 63000
+    },
+    {
+      "epoch": 168.0,
+      "eval_loss": 0.47927647829055786,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5313,
+      "eval_samples_per_second": 1.519,
+      "eval_steps_per_second": 0.095,
+      "step": 63000
+    },
+    {
+      "epoch": 168.02666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002948806257253842,
+      "loss": 0.4834,
+      "step": 63010
+    },
+    {
+      "epoch": 168.05333333333334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029487899701549023,
+      "loss": 0.4888,
+      "step": 63020
+    },
+    {
+      "epoch": 168.08,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002948773680510528,
+      "loss": 0.4762,
+      "step": 63030
+    },
+    {
+      "epoch": 168.10666666666665,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002948757388320749,
+      "loss": 0.4708,
+      "step": 63040
+    },
+    {
+      "epoch": 168.13333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002948741093585592,
+      "loss": 0.4727,
+      "step": 63050
+    },
+    {
+      "epoch": 168.16,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.00029487247963050865,
+      "loss": 0.4772,
+      "step": 63060
+    },
+    {
+      "epoch": 168.18666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029487084964792615,
+      "loss": 0.469,
+      "step": 63070
+    },
+    {
+      "epoch": 168.21333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002948692194108145,
+      "loss": 0.4633,
+      "step": 63080
+    },
+    {
+      "epoch": 168.24,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029486758891917664,
+      "loss": 0.4673,
+      "step": 63090
+    },
+    {
+      "epoch": 168.26666666666668,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002948659581730154,
+      "loss": 0.472,
+      "step": 63100
+    },
+    {
+      "epoch": 168.29333333333332,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029486432717233354,
+      "loss": 0.4653,
+      "step": 63110
+    },
+    {
+      "epoch": 168.32,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002948626959171341,
+      "loss": 0.469,
+      "step": 63120
+    },
+    {
+      "epoch": 168.34666666666666,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002948610644074198,
+      "loss": 0.4783,
+      "step": 63130
+    },
+    {
+      "epoch": 168.37333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002948594326431936,
+      "loss": 0.4735,
+      "step": 63140
+    },
+    {
+      "epoch": 168.4,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029485780062445837,
+      "loss": 0.4702,
+      "step": 63150
+    },
+    {
+      "epoch": 168.42666666666668,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002948561683512169,
+      "loss": 0.4773,
+      "step": 63160
+    },
+    {
+      "epoch": 168.45333333333335,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002948545358234721,
+      "loss": 0.479,
+      "step": 63170
+    },
+    {
+      "epoch": 168.48,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002948529030412269,
+      "loss": 0.4851,
+      "step": 63180
+    },
+    {
+      "epoch": 168.50666666666666,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029485127000448405,
+      "loss": 0.4748,
+      "step": 63190
+    },
+    {
+      "epoch": 168.53333333333333,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029484963671324655,
+      "loss": 0.4674,
+      "step": 63200
+    },
+    {
+      "epoch": 168.56,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002948480031675171,
+      "loss": 0.4694,
+      "step": 63210
+    },
+    {
+      "epoch": 168.58666666666667,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0002948463693672987,
+      "loss": 0.4653,
+      "step": 63220
+    },
+    {
+      "epoch": 168.61333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029484473531259423,
+      "loss": 0.4509,
+      "step": 63230
+    },
+    {
+      "epoch": 168.64,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002948431010034065,
+      "loss": 0.4567,
+      "step": 63240
+    },
+    {
+      "epoch": 168.66666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002948414664397384,
+      "loss": 0.4714,
+      "step": 63250
+    },
+    {
+      "epoch": 168.69333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002948398316215928,
+      "loss": 0.4545,
+      "step": 63260
+    },
+    {
+      "epoch": 168.72,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0002948381965489726,
+      "loss": 0.4713,
+      "step": 63270
+    },
+    {
+      "epoch": 168.74666666666667,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.00029483656122188063,
+      "loss": 0.4756,
+      "step": 63280
+    },
+    {
+      "epoch": 168.77333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029483492564031977,
+      "loss": 0.4832,
+      "step": 63290
+    },
+    {
+      "epoch": 168.8,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002948332898042929,
+      "loss": 0.4708,
+      "step": 63300
+    },
+    {
+      "epoch": 168.82666666666665,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002948316537138029,
+      "loss": 0.4642,
+      "step": 63310
+    },
+    {
+      "epoch": 168.85333333333332,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002948300173688526,
+      "loss": 0.4742,
+      "step": 63320
+    },
+    {
+      "epoch": 168.88,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.000294828380769445,
+      "loss": 0.4618,
+      "step": 63330
+    },
+    {
+      "epoch": 168.90666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002948267439155829,
+      "loss": 0.4645,
+      "step": 63340
+    },
+    {
+      "epoch": 168.93333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029482510680726916,
+      "loss": 0.4666,
+      "step": 63350
+    },
+    {
+      "epoch": 168.96,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002948234694445066,
+      "loss": 0.4638,
+      "step": 63360
+    },
+    {
+      "epoch": 168.98666666666668,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002948218318272982,
+      "loss": 0.4789,
+      "step": 63370
+    },
+    {
+      "epoch": 169.0,
+      "eval_loss": 0.47983646392822266,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5389,
+      "eval_samples_per_second": 1.518,
+      "eval_steps_per_second": 0.095,
+      "step": 63375
+    },
+    {
+      "epoch": 169.01333333333332,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029482019395564684,
+      "loss": 0.4677,
+      "step": 63380
+    },
+    {
+      "epoch": 169.04,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002948185558295553,
+      "loss": 0.4913,
+      "step": 63390
+    },
+    {
+      "epoch": 169.06666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002948169174490265,
+      "loss": 0.4795,
+      "step": 63400
+    },
+    {
+      "epoch": 169.09333333333333,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002948152788140634,
+      "loss": 0.4736,
+      "step": 63410
+    },
+    {
+      "epoch": 169.12,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002948136399246688,
+      "loss": 0.4705,
+      "step": 63420
+    },
+    {
+      "epoch": 169.14666666666668,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002948120007808456,
+      "loss": 0.4788,
+      "step": 63430
+    },
+    {
+      "epoch": 169.17333333333335,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029481036138259665,
+      "loss": 0.4695,
+      "step": 63440
+    },
+    {
+      "epoch": 169.2,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002948087217299248,
+      "loss": 0.4674,
+      "step": 63450
+    },
+    {
+      "epoch": 169.22666666666666,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002948070818228331,
+      "loss": 0.4655,
+      "step": 63460
+    },
+    {
+      "epoch": 169.25333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029480544166132425,
+      "loss": 0.4646,
+      "step": 63470
+    },
+    {
+      "epoch": 169.28,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029480380124540114,
+      "loss": 0.4784,
+      "step": 63480
+    },
+    {
+      "epoch": 169.30666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002948021605750668,
+      "loss": 0.4594,
+      "step": 63490
+    },
+    {
+      "epoch": 169.33333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.000294800519650324,
+      "loss": 0.4747,
+      "step": 63500
+    },
+    {
+      "epoch": 169.36,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002947988784711756,
+      "loss": 0.477,
+      "step": 63510
+    },
+    {
+      "epoch": 169.38666666666666,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029479723703762457,
+      "loss": 0.4691,
+      "step": 63520
+    },
+    {
+      "epoch": 169.41333333333333,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029479559534967373,
+      "loss": 0.4733,
+      "step": 63530
+    },
+    {
+      "epoch": 169.44,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029479395340732597,
+      "loss": 0.4765,
+      "step": 63540
+    },
+    {
+      "epoch": 169.46666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029479231121058426,
+      "loss": 0.4855,
+      "step": 63550
+    },
+    {
+      "epoch": 169.49333333333334,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029479066875945137,
+      "loss": 0.4771,
+      "step": 63560
+    },
+    {
+      "epoch": 169.52,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0002947890260539302,
+      "loss": 0.4717,
+      "step": 63570
+    },
+    {
+      "epoch": 169.54666666666665,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029478738309402376,
+      "loss": 0.4691,
+      "step": 63580
+    },
+    {
+      "epoch": 169.57333333333332,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002947857398797348,
+      "loss": 0.4674,
+      "step": 63590
+    },
+    {
+      "epoch": 169.6,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002947840964110662,
+      "loss": 0.461,
+      "step": 63600
+    },
+    {
+      "epoch": 169.62666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029478245268802096,
+      "loss": 0.4487,
+      "step": 63610
+    },
+    {
+      "epoch": 169.65333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029478080871060187,
+      "loss": 0.4684,
+      "step": 63620
+    },
+    {
+      "epoch": 169.68,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029477916447881187,
+      "loss": 0.4594,
+      "step": 63630
+    },
+    {
+      "epoch": 169.70666666666668,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029477751999265384,
+      "loss": 0.4597,
+      "step": 63640
+    },
+    {
+      "epoch": 169.73333333333332,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029477587525213064,
+      "loss": 0.4782,
+      "step": 63650
+    },
+    {
+      "epoch": 169.76,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002947742302572452,
+      "loss": 0.48,
+      "step": 63660
+    },
+    {
+      "epoch": 169.78666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029477258500800043,
+      "loss": 0.4787,
+      "step": 63670
+    },
+    {
+      "epoch": 169.81333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002947709395043991,
+      "loss": 0.4641,
+      "step": 63680
+    },
+    {
+      "epoch": 169.84,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002947692937464442,
+      "loss": 0.4691,
+      "step": 63690
+    },
+    {
+      "epoch": 169.86666666666667,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002947676477341387,
+      "loss": 0.4693,
+      "step": 63700
+    },
+    {
+      "epoch": 169.89333333333335,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029476600146748533,
+      "loss": 0.4603,
+      "step": 63710
+    },
+    {
+      "epoch": 169.92,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029476435494648706,
+      "loss": 0.47,
+      "step": 63720
+    },
+    {
+      "epoch": 169.94666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029476270817114673,
+      "loss": 0.4625,
+      "step": 63730
+    },
+    {
+      "epoch": 169.97333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029476106114146737,
+      "loss": 0.4743,
+      "step": 63740
+    },
+    {
+      "epoch": 170.0,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029475941385745174,
+      "loss": 0.4638,
+      "step": 63750
+    },
+    {
+      "epoch": 170.0,
+      "eval_loss": 0.4791702628135681,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.5946,
+      "eval_samples_per_second": 1.668,
+      "eval_steps_per_second": 0.104,
+      "step": 63750
+    },
+    {
+      "epoch": 170.02666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002947577663191027,
+      "loss": 0.4828,
+      "step": 63760
+    },
+    {
+      "epoch": 170.05333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002947561185264233,
+      "loss": 0.4889,
+      "step": 63770
+    },
+    {
+      "epoch": 170.08,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002947544704794163,
+      "loss": 0.4758,
+      "step": 63780
+    },
+    {
+      "epoch": 170.10666666666665,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002947528221780847,
+      "loss": 0.4712,
+      "step": 63790
+    },
+    {
+      "epoch": 170.13333333333333,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029475117362243137,
+      "loss": 0.4722,
+      "step": 63800
+    },
+    {
+      "epoch": 170.16,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002947495248124591,
+      "loss": 0.4768,
+      "step": 63810
+    },
+    {
+      "epoch": 170.18666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002947478757481709,
+      "loss": 0.4688,
+      "step": 63820
+    },
+    {
+      "epoch": 170.21333333333334,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002947462264295697,
+      "loss": 0.4644,
+      "step": 63830
+    },
+    {
+      "epoch": 170.24,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029474457685665825,
+      "loss": 0.4678,
+      "step": 63840
+    },
+    {
+      "epoch": 170.26666666666668,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029474292702943956,
+      "loss": 0.4715,
+      "step": 63850
+    },
+    {
+      "epoch": 170.29333333333332,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002947412769479165,
+      "loss": 0.4659,
+      "step": 63860
+    },
+    {
+      "epoch": 170.32,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000294739626612092,
+      "loss": 0.4688,
+      "step": 63870
+    },
+    {
+      "epoch": 170.34666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002947379760219689,
+      "loss": 0.4769,
+      "step": 63880
+    },
+    {
+      "epoch": 170.37333333333333,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002947363251775501,
+      "loss": 0.4733,
+      "step": 63890
+    },
+    {
+      "epoch": 170.4,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002947346740788386,
+      "loss": 0.4703,
+      "step": 63900
+    },
+    {
+      "epoch": 170.42666666666668,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0002947330227258372,
+      "loss": 0.4758,
+      "step": 63910
+    },
+    {
+      "epoch": 170.45333333333335,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029473137111854877,
+      "loss": 0.4783,
+      "step": 63920
+    },
+    {
+      "epoch": 170.48,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002947297192569764,
+      "loss": 0.4847,
+      "step": 63930
+    },
+    {
+      "epoch": 170.50666666666666,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029472806714112275,
+      "loss": 0.4746,
+      "step": 63940
+    },
+    {
+      "epoch": 170.53333333333333,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029472641477099087,
+      "loss": 0.4684,
+      "step": 63950
+    },
+    {
+      "epoch": 170.56,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002947247621465837,
+      "loss": 0.4692,
+      "step": 63960
+    },
+    {
+      "epoch": 170.58666666666667,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.000294723109267904,
+      "loss": 0.4647,
+      "step": 63970
+    },
+    {
+      "epoch": 170.61333333333334,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029472145613495474,
+      "loss": 0.451,
+      "step": 63980
+    },
+    {
+      "epoch": 170.64,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002947198027477389,
+      "loss": 0.4565,
+      "step": 63990
+    },
+    {
+      "epoch": 170.66666666666666,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029471814910625926,
+      "loss": 0.4711,
+      "step": 64000
+    },
+    {
+      "epoch": 170.69333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029471649521051885,
+      "loss": 0.4541,
+      "step": 64010
+    },
+    {
+      "epoch": 170.72,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029471484106052045,
+      "loss": 0.4713,
+      "step": 64020
+    },
+    {
+      "epoch": 170.74666666666667,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029471318665626704,
+      "loss": 0.4762,
+      "step": 64030
+    },
+    {
+      "epoch": 170.77333333333334,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0002947115319977615,
+      "loss": 0.4833,
+      "step": 64040
+    },
+    {
+      "epoch": 170.8,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029470987708500677,
+      "loss": 0.4707,
+      "step": 64050
+    },
+    {
+      "epoch": 170.82666666666665,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002947082219180057,
+      "loss": 0.4652,
+      "step": 64060
+    },
+    {
+      "epoch": 170.85333333333332,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002947065664967613,
+      "loss": 0.4739,
+      "step": 64070
+    },
+    {
+      "epoch": 170.88,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002947049108212764,
+      "loss": 0.4614,
+      "step": 64080
+    },
+    {
+      "epoch": 170.90666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002947032548915539,
+      "loss": 0.4648,
+      "step": 64090
+    },
+    {
+      "epoch": 170.93333333333334,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002947015987075967,
+      "loss": 0.4658,
+      "step": 64100
+    },
+    {
+      "epoch": 170.96,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029469994226940784,
+      "loss": 0.4648,
+      "step": 64110
+    },
+    {
+      "epoch": 170.98666666666668,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002946982855769901,
+      "loss": 0.4789,
+      "step": 64120
+    },
+    {
+      "epoch": 171.0,
+      "eval_loss": 0.4793436825275421,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.8251,
+      "eval_samples_per_second": 1.478,
+      "eval_steps_per_second": 0.092,
+      "step": 64125
+    },
+    {
+      "epoch": 171.01333333333332,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0002946966286303464,
+      "loss": 0.4671,
+      "step": 64130
+    },
+    {
+      "epoch": 171.04,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029469497142947963,
+      "loss": 0.4912,
+      "step": 64140
+    },
+    {
+      "epoch": 171.06666666666666,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002946933139743928,
+      "loss": 0.4792,
+      "step": 64150
+    },
+    {
+      "epoch": 171.09333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029469165626508874,
+      "loss": 0.4735,
+      "step": 64160
+    },
+    {
+      "epoch": 171.12,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002946899983015704,
+      "loss": 0.4694,
+      "step": 64170
+    },
+    {
+      "epoch": 171.14666666666668,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002946883400838407,
+      "loss": 0.4789,
+      "step": 64180
+    },
+    {
+      "epoch": 171.17333333333335,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002946866816119025,
+      "loss": 0.4688,
+      "step": 64190
+    },
+    {
+      "epoch": 171.2,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002946850228857588,
+      "loss": 0.4681,
+      "step": 64200
+    },
+    {
+      "epoch": 171.22666666666666,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.00029468336390541243,
+      "loss": 0.4657,
+      "step": 64210
+    },
+    {
+      "epoch": 171.25333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029468170467086634,
+      "loss": 0.465,
+      "step": 64220
+    },
+    {
+      "epoch": 171.28,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00029468004518212347,
+      "loss": 0.4793,
+      "step": 64230
+    },
+    {
+      "epoch": 171.30666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029467838543918674,
+      "loss": 0.4597,
+      "step": 64240
+    },
+    {
+      "epoch": 171.33333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029467672544205897,
+      "loss": 0.4754,
+      "step": 64250
+    },
+    {
+      "epoch": 171.36,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029467506519074315,
+      "loss": 0.477,
+      "step": 64260
+    },
+    {
+      "epoch": 171.38666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002946734046852422,
+      "loss": 0.4694,
+      "step": 64270
+    },
+    {
+      "epoch": 171.41333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.000294671743925559,
+      "loss": 0.4739,
+      "step": 64280
+    },
+    {
+      "epoch": 171.44,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002946700829116965,
+      "loss": 0.4766,
+      "step": 64290
+    },
+    {
+      "epoch": 171.46666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002946684216436577,
+      "loss": 0.4862,
+      "step": 64300
+    },
+    {
+      "epoch": 171.49333333333334,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002946667601214454,
+      "loss": 0.4768,
+      "step": 64310
+    },
+    {
+      "epoch": 171.52,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002946650983450625,
+      "loss": 0.4713,
+      "step": 64320
+    },
+    {
+      "epoch": 171.54666666666665,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.000294663436314512,
+      "loss": 0.469,
+      "step": 64330
+    },
+    {
+      "epoch": 171.57333333333332,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029466177402979674,
+      "loss": 0.4678,
+      "step": 64340
+    },
+    {
+      "epoch": 171.6,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029466011149091977,
+      "loss": 0.4615,
+      "step": 64350
+    },
+    {
+      "epoch": 171.62666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002946584486978839,
+      "loss": 0.4491,
+      "step": 64360
+    },
+    {
+      "epoch": 171.65333333333334,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002946567856506921,
+      "loss": 0.4682,
+      "step": 64370
+    },
+    {
+      "epoch": 171.68,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029465512234934724,
+      "loss": 0.4588,
+      "step": 64380
+    },
+    {
+      "epoch": 171.70666666666668,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002946534587938523,
+      "loss": 0.4597,
+      "step": 64390
+    },
+    {
+      "epoch": 171.73333333333332,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002946517949842102,
+      "loss": 0.4786,
+      "step": 64400
+    },
+    {
+      "epoch": 171.76,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0002946501309204238,
+      "loss": 0.4801,
+      "step": 64410
+    },
+    {
+      "epoch": 171.78666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002946484666024961,
+      "loss": 0.4789,
+      "step": 64420
+    },
+    {
+      "epoch": 171.81333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029464680203043,
+      "loss": 0.4644,
+      "step": 64430
+    },
+    {
+      "epoch": 171.84,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002946451372042284,
+      "loss": 0.4694,
+      "step": 64440
+    },
+    {
+      "epoch": 171.86666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002946434721238942,
+      "loss": 0.4691,
+      "step": 64450
+    },
+    {
+      "epoch": 171.89333333333335,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029464180678943046,
+      "loss": 0.46,
+      "step": 64460
+    },
+    {
+      "epoch": 171.92,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029464014120083996,
+      "loss": 0.4705,
+      "step": 64470
+    },
+    {
+      "epoch": 171.94666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002946384753581257,
+      "loss": 0.4628,
+      "step": 64480
+    },
+    {
+      "epoch": 171.97333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002946368092612905,
+      "loss": 0.4736,
+      "step": 64490
+    },
+    {
+      "epoch": 172.0,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002946351429103375,
+      "loss": 0.4631,
+      "step": 64500
+    },
+    {
+      "epoch": 172.0,
+      "eval_loss": 0.48019760847091675,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5782,
+      "eval_samples_per_second": 1.513,
+      "eval_steps_per_second": 0.095,
+      "step": 64500
+    },
+    {
+      "epoch": 172.02666666666667,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002946334763052694,
+      "loss": 0.4831,
+      "step": 64510
+    },
+    {
+      "epoch": 172.05333333333334,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002946318094460893,
+      "loss": 0.4889,
+      "step": 64520
+    },
+    {
+      "epoch": 172.08,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002946301423328,
+      "loss": 0.4763,
+      "step": 64530
+    },
+    {
+      "epoch": 172.10666666666665,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029462847496540453,
+      "loss": 0.4701,
+      "step": 64540
+    },
+    {
+      "epoch": 172.13333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029462680734390574,
+      "loss": 0.4723,
+      "step": 64550
+    },
+    {
+      "epoch": 172.16,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029462513946830666,
+      "loss": 0.4772,
+      "step": 64560
+    },
+    {
+      "epoch": 172.18666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002946234713386101,
+      "loss": 0.4686,
+      "step": 64570
+    },
+    {
+      "epoch": 172.21333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002946218029548191,
+      "loss": 0.4639,
+      "step": 64580
+    },
+    {
+      "epoch": 172.24,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002946201343169364,
+      "loss": 0.4674,
+      "step": 64590
+    },
+    {
+      "epoch": 172.26666666666668,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029461846542496524,
+      "loss": 0.4712,
+      "step": 64600
+    },
+    {
+      "epoch": 172.29333333333332,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002946167962789083,
+      "loss": 0.4659,
+      "step": 64610
+    },
+    {
+      "epoch": 172.32,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002946151268787686,
+      "loss": 0.469,
+      "step": 64620
+    },
+    {
+      "epoch": 172.34666666666666,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029461345722454907,
+      "loss": 0.4778,
+      "step": 64630
+    },
+    {
+      "epoch": 172.37333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029461178731625263,
+      "loss": 0.4729,
+      "step": 64640
+    },
+    {
+      "epoch": 172.4,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.00029461011715388224,
+      "loss": 0.4698,
+      "step": 64650
+    },
+    {
+      "epoch": 172.42666666666668,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029460844673744085,
+      "loss": 0.476,
+      "step": 64660
+    },
+    {
+      "epoch": 172.45333333333335,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0002946067760669313,
+      "loss": 0.4785,
+      "step": 64670
+    },
+    {
+      "epoch": 172.48,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002946051051423566,
+      "loss": 0.4844,
+      "step": 64680
+    },
+    {
+      "epoch": 172.50666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002946034339637198,
+      "loss": 0.4743,
+      "step": 64690
+    },
+    {
+      "epoch": 172.53333333333333,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002946017625310236,
+      "loss": 0.4677,
+      "step": 64700
+    },
+    {
+      "epoch": 172.56,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000294600090844271,
+      "loss": 0.4694,
+      "step": 64710
+    },
+    {
+      "epoch": 172.58666666666667,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029459841890346506,
+      "loss": 0.4645,
+      "step": 64720
+    },
+    {
+      "epoch": 172.61333333333334,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002945967467086086,
+      "loss": 0.4507,
+      "step": 64730
+    },
+    {
+      "epoch": 172.64,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029459507425970465,
+      "loss": 0.4565,
+      "step": 64740
+    },
+    {
+      "epoch": 172.66666666666666,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029459340155675606,
+      "loss": 0.4711,
+      "step": 64750
+    },
+    {
+      "epoch": 172.69333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029459172859976583,
+      "loss": 0.4546,
+      "step": 64760
+    },
+    {
+      "epoch": 172.72,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029459005538873685,
+      "loss": 0.4719,
+      "step": 64770
+    },
+    {
+      "epoch": 172.74666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029458838192367214,
+      "loss": 0.476,
+      "step": 64780
+    },
+    {
+      "epoch": 172.77333333333334,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002945867082045745,
+      "loss": 0.4833,
+      "step": 64790
+    },
+    {
+      "epoch": 172.8,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002945850342314471,
+      "loss": 0.4709,
+      "step": 64800
+    },
+    {
+      "epoch": 172.82666666666665,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029458336000429266,
+      "loss": 0.4655,
+      "step": 64810
+    },
+    {
+      "epoch": 172.85333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002945816855231141,
+      "loss": 0.4729,
+      "step": 64820
+    },
+    {
+      "epoch": 172.88,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002945800107879146,
+      "loss": 0.4615,
+      "step": 64830
+    },
+    {
+      "epoch": 172.90666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002945783357986969,
+      "loss": 0.4648,
+      "step": 64840
+    },
+    {
+      "epoch": 172.93333333333334,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.000294576660555464,
+      "loss": 0.4665,
+      "step": 64850
+    },
+    {
+      "epoch": 172.96,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029457498505821894,
+      "loss": 0.4634,
+      "step": 64860
+    },
+    {
+      "epoch": 172.98666666666668,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002945733093069645,
+      "loss": 0.4779,
+      "step": 64870
+    },
+    {
+      "epoch": 173.0,
+      "eval_loss": 0.4777694344520569,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9365,
+      "eval_samples_per_second": 1.61,
+      "eval_steps_per_second": 0.101,
+      "step": 64875
+    },
+    {
+      "epoch": 173.01333333333332,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002945716333017037,
+      "loss": 0.4672,
+      "step": 64880
+    },
+    {
+      "epoch": 173.04,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002945699570424395,
+      "loss": 0.4908,
+      "step": 64890
+    },
+    {
+      "epoch": 173.06666666666666,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002945682805291748,
+      "loss": 0.4793,
+      "step": 64900
+    },
+    {
+      "epoch": 173.09333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002945666037619126,
+      "loss": 0.4737,
+      "step": 64910
+    },
+    {
+      "epoch": 173.12,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002945649267406558,
+      "loss": 0.47,
+      "step": 64920
+    },
+    {
+      "epoch": 173.14666666666668,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002945632494654074,
+      "loss": 0.4789,
+      "step": 64930
+    },
+    {
+      "epoch": 173.17333333333335,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002945615719361703,
+      "loss": 0.47,
+      "step": 64940
+    },
+    {
+      "epoch": 173.2,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029455989415294744,
+      "loss": 0.4676,
+      "step": 64950
+    },
+    {
+      "epoch": 173.22666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002945582161157418,
+      "loss": 0.4652,
+      "step": 64960
+    },
+    {
+      "epoch": 173.25333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029455653782455634,
+      "loss": 0.4647,
+      "step": 64970
+    },
+    {
+      "epoch": 173.28,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.000294554859279394,
+      "loss": 0.4787,
+      "step": 64980
+    },
+    {
+      "epoch": 173.30666666666667,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029455318048025766,
+      "loss": 0.4594,
+      "step": 64990
+    },
+    {
+      "epoch": 173.33333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029455150142715034,
+      "loss": 0.4751,
+      "step": 65000
+    },
+    {
+      "epoch": 173.36,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.000294549822120075,
+      "loss": 0.4773,
+      "step": 65010
+    },
+    {
+      "epoch": 173.38666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002945481425590345,
+      "loss": 0.4686,
+      "step": 65020
+    },
+    {
+      "epoch": 173.41333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002945464627440319,
+      "loss": 0.4738,
+      "step": 65030
+    },
+    {
+      "epoch": 173.44,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002945447826750701,
+      "loss": 0.4757,
+      "step": 65040
+    },
+    {
+      "epoch": 173.46666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029454310235215207,
+      "loss": 0.486,
+      "step": 65050
+    },
+    {
+      "epoch": 173.49333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029454142177528077,
+      "loss": 0.4775,
+      "step": 65060
+    },
+    {
+      "epoch": 173.52,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002945397409444591,
+      "loss": 0.472,
+      "step": 65070
+    },
+    {
+      "epoch": 173.54666666666665,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029453805985969,
+      "loss": 0.4682,
+      "step": 65080
+    },
+    {
+      "epoch": 173.57333333333332,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002945363785209765,
+      "loss": 0.4676,
+      "step": 65090
+    },
+    {
+      "epoch": 173.6,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002945346969283216,
+      "loss": 0.4608,
+      "step": 65100
+    },
+    {
+      "epoch": 173.62666666666667,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0002945330150817281,
+      "loss": 0.4482,
+      "step": 65110
+    },
+    {
+      "epoch": 173.65333333333334,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029453133298119904,
+      "loss": 0.4681,
+      "step": 65120
+    },
+    {
+      "epoch": 173.68,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002945296506267374,
+      "loss": 0.4594,
+      "step": 65130
+    },
+    {
+      "epoch": 173.70666666666668,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002945279680183461,
+      "loss": 0.46,
+      "step": 65140
+    },
+    {
+      "epoch": 173.73333333333332,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002945262851560281,
+      "loss": 0.4783,
+      "step": 65150
+    },
+    {
+      "epoch": 173.76,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029452460203978635,
+      "loss": 0.4801,
+      "step": 65160
+    },
+    {
+      "epoch": 173.78666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029452291866962385,
+      "loss": 0.4783,
+      "step": 65170
+    },
+    {
+      "epoch": 173.81333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002945212350455434,
+      "loss": 0.464,
+      "step": 65180
+    },
+    {
+      "epoch": 173.84,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002945195511675482,
+      "loss": 0.4701,
+      "step": 65190
+    },
+    {
+      "epoch": 173.86666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029451786703564104,
+      "loss": 0.4684,
+      "step": 65200
+    },
+    {
+      "epoch": 173.89333333333335,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.00029451618264982494,
+      "loss": 0.4604,
+      "step": 65210
+    },
+    {
+      "epoch": 173.92,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029451449801010283,
+      "loss": 0.4702,
+      "step": 65220
+    },
+    {
+      "epoch": 173.94666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002945128131164777,
+      "loss": 0.4634,
+      "step": 65230
+    },
+    {
+      "epoch": 173.97333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029451112796895256,
+      "loss": 0.4742,
+      "step": 65240
+    },
+    {
+      "epoch": 174.0,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029450944256753024,
+      "loss": 0.4637,
+      "step": 65250
+    },
+    {
+      "epoch": 174.0,
+      "eval_loss": 0.47973257303237915,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9547,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 0.1,
+      "step": 65250
+    },
+    {
+      "epoch": 174.02666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029450775691221374,
+      "loss": 0.4829,
+      "step": 65260
+    },
+    {
+      "epoch": 174.05333333333334,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002945060710030061,
+      "loss": 0.4891,
+      "step": 65270
+    },
+    {
+      "epoch": 174.08,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002945043848399102,
+      "loss": 0.4759,
+      "step": 65280
+    },
+    {
+      "epoch": 174.10666666666665,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029450269842292906,
+      "loss": 0.471,
+      "step": 65290
+    },
+    {
+      "epoch": 174.13333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029450101175206565,
+      "loss": 0.4724,
+      "step": 65300
+    },
+    {
+      "epoch": 174.16,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002944993248273228,
+      "loss": 0.4767,
+      "step": 65310
+    },
+    {
+      "epoch": 174.18666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029449763764870365,
+      "loss": 0.4687,
+      "step": 65320
+    },
+    {
+      "epoch": 174.21333333333334,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002944959502162111,
+      "loss": 0.4635,
+      "step": 65330
+    },
+    {
+      "epoch": 174.24,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002944942625298481,
+      "loss": 0.4677,
+      "step": 65340
+    },
+    {
+      "epoch": 174.26666666666668,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029449257458961757,
+      "loss": 0.4717,
+      "step": 65350
+    },
+    {
+      "epoch": 174.29333333333332,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002944908863955226,
+      "loss": 0.4663,
+      "step": 65360
+    },
+    {
+      "epoch": 174.32,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.000294489197947566,
+      "loss": 0.4689,
+      "step": 65370
+    },
+    {
+      "epoch": 174.34666666666666,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029448750924575083,
+      "loss": 0.4778,
+      "step": 65380
+    },
+    {
+      "epoch": 174.37333333333333,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002944858202900801,
+      "loss": 0.4728,
+      "step": 65390
+    },
+    {
+      "epoch": 174.4,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.00029448413108055664,
+      "loss": 0.4705,
+      "step": 65400
+    },
+    {
+      "epoch": 174.42666666666668,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029448244161718357,
+      "loss": 0.4758,
+      "step": 65410
+    },
+    {
+      "epoch": 174.45333333333335,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029448075189996375,
+      "loss": 0.4783,
+      "step": 65420
+    },
+    {
+      "epoch": 174.48,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0002944790619289002,
+      "loss": 0.4842,
+      "step": 65430
+    },
+    {
+      "epoch": 174.50666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002944773717039959,
+      "loss": 0.4744,
+      "step": 65440
+    },
+    {
+      "epoch": 174.53333333333333,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029447568122525377,
+      "loss": 0.4677,
+      "step": 65450
+    },
+    {
+      "epoch": 174.56,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029447399049267677,
+      "loss": 0.4694,
+      "step": 65460
+    },
+    {
+      "epoch": 174.58666666666667,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.0002944722995062679,
+      "loss": 0.4641,
+      "step": 65470
+    },
+    {
+      "epoch": 174.61333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029447060826603023,
+      "loss": 0.4513,
+      "step": 65480
+    },
+    {
+      "epoch": 174.64,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002944689167719666,
+      "loss": 0.4564,
+      "step": 65490
+    },
+    {
+      "epoch": 174.66666666666666,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029446722502408,
+      "loss": 0.471,
+      "step": 65500
+    },
+    {
+      "epoch": 174.69333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002944655330223734,
+      "loss": 0.4544,
+      "step": 65510
+    },
+    {
+      "epoch": 174.72,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002944638407668498,
+      "loss": 0.472,
+      "step": 65520
+    },
+    {
+      "epoch": 174.74666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002944621482575122,
+      "loss": 0.4755,
+      "step": 65530
+    },
+    {
+      "epoch": 174.77333333333334,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00029446045549436355,
+      "loss": 0.4839,
+      "step": 65540
+    },
+    {
+      "epoch": 174.8,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029445876247740683,
+      "loss": 0.4703,
+      "step": 65550
+    },
+    {
+      "epoch": 174.82666666666665,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029445706920664495,
+      "loss": 0.4651,
+      "step": 65560
+    },
+    {
+      "epoch": 174.85333333333332,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029445537568208093,
+      "loss": 0.4734,
+      "step": 65570
+    },
+    {
+      "epoch": 174.88,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002944536819037178,
+      "loss": 0.4618,
+      "step": 65580
+    },
+    {
+      "epoch": 174.90666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029445198787155847,
+      "loss": 0.4645,
+      "step": 65590
+    },
+    {
+      "epoch": 174.93333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002944502935856059,
+      "loss": 0.4659,
+      "step": 65600
+    },
+    {
+      "epoch": 174.96,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029444859904586315,
+      "loss": 0.4638,
+      "step": 65610
+    },
+    {
+      "epoch": 174.98666666666668,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002944469042523331,
+      "loss": 0.4782,
+      "step": 65620
+    },
+    {
+      "epoch": 175.0,
+      "eval_loss": 0.47829604148864746,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8386,
+      "eval_samples_per_second": 1.626,
+      "eval_steps_per_second": 0.102,
+      "step": 65625
+    },
+    {
+      "epoch": 175.01333333333332,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002944452092050188,
+      "loss": 0.4673,
+      "step": 65630
+    },
+    {
+      "epoch": 175.04,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00029444351390392324,
+      "loss": 0.4919,
+      "step": 65640
+    },
+    {
+      "epoch": 175.06666666666666,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002944418183490493,
+      "loss": 0.4784,
+      "step": 65650
+    },
+    {
+      "epoch": 175.09333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002944401225404,
+      "loss": 0.4739,
+      "step": 65660
+    },
+    {
+      "epoch": 175.12,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002944384264779784,
+      "loss": 0.4695,
+      "step": 65670
+    },
+    {
+      "epoch": 175.14666666666668,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002944367301617874,
+      "loss": 0.478,
+      "step": 65680
+    },
+    {
+      "epoch": 175.17333333333335,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029443503359183,
+      "loss": 0.4692,
+      "step": 65690
+    },
+    {
+      "epoch": 175.2,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002944333367681092,
+      "loss": 0.4675,
+      "step": 65700
+    },
+    {
+      "epoch": 175.22666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029443163969062794,
+      "loss": 0.4651,
+      "step": 65710
+    },
+    {
+      "epoch": 175.25333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002944299423593892,
+      "loss": 0.4653,
+      "step": 65720
+    },
+    {
+      "epoch": 175.28,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000294428244774396,
+      "loss": 0.4787,
+      "step": 65730
+    },
+    {
+      "epoch": 175.30666666666667,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0002944265469356513,
+      "loss": 0.4601,
+      "step": 65740
+    },
+    {
+      "epoch": 175.33333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029442484884315814,
+      "loss": 0.4756,
+      "step": 65750
+    },
+    {
+      "epoch": 175.36,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029442315049691943,
+      "loss": 0.4776,
+      "step": 65760
+    },
+    {
+      "epoch": 175.38666666666666,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029442145189693817,
+      "loss": 0.4689,
+      "step": 65770
+    },
+    {
+      "epoch": 175.41333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002944197530432174,
+      "loss": 0.4737,
+      "step": 65780
+    },
+    {
+      "epoch": 175.44,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029441805393575995,
+      "loss": 0.4766,
+      "step": 65790
+    },
+    {
+      "epoch": 175.46666666666667,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.000294416354574569,
+      "loss": 0.4857,
+      "step": 65800
+    },
+    {
+      "epoch": 175.49333333333334,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002944146549596474,
+      "loss": 0.4773,
+      "step": 65810
+    },
+    {
+      "epoch": 175.52,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029441295509099823,
+      "loss": 0.4715,
+      "step": 65820
+    },
+    {
+      "epoch": 175.54666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002944112549686244,
+      "loss": 0.4682,
+      "step": 65830
+    },
+    {
+      "epoch": 175.57333333333332,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029440955459252894,
+      "loss": 0.4675,
+      "step": 65840
+    },
+    {
+      "epoch": 175.6,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029440785396271483,
+      "loss": 0.4607,
+      "step": 65850
+    },
+    {
+      "epoch": 175.62666666666667,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.000294406153079185,
+      "loss": 0.4474,
+      "step": 65860
+    },
+    {
+      "epoch": 175.65333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002944044519419426,
+      "loss": 0.4674,
+      "step": 65870
+    },
+    {
+      "epoch": 175.68,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002944027505509904,
+      "loss": 0.4591,
+      "step": 65880
+    },
+    {
+      "epoch": 175.70666666666668,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002944010489063316,
+      "loss": 0.46,
+      "step": 65890
+    },
+    {
+      "epoch": 175.73333333333332,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.000294399347007969,
+      "loss": 0.4793,
+      "step": 65900
+    },
+    {
+      "epoch": 175.76,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002943976448559057,
+      "loss": 0.4798,
+      "step": 65910
+    },
+    {
+      "epoch": 175.78666666666666,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002943959424501447,
+      "loss": 0.4785,
+      "step": 65920
+    },
+    {
+      "epoch": 175.81333333333333,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.000294394239790689,
+      "loss": 0.4638,
+      "step": 65930
+    },
+    {
+      "epoch": 175.84,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002943925368775415,
+      "loss": 0.4696,
+      "step": 65940
+    },
+    {
+      "epoch": 175.86666666666667,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029439083371070524,
+      "loss": 0.469,
+      "step": 65950
+    },
+    {
+      "epoch": 175.89333333333335,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029438913029018325,
+      "loss": 0.4601,
+      "step": 65960
+    },
+    {
+      "epoch": 175.92,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002943874266159784,
+      "loss": 0.4696,
+      "step": 65970
+    },
+    {
+      "epoch": 175.94666666666666,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.00029438572268809387,
+      "loss": 0.4624,
+      "step": 65980
+    },
+    {
+      "epoch": 175.97333333333333,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029438401850653253,
+      "loss": 0.4751,
+      "step": 65990
+    },
+    {
+      "epoch": 176.0,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002943823140712974,
+      "loss": 0.4632,
+      "step": 66000
+    },
+    {
+      "epoch": 176.0,
+      "eval_loss": 0.47858747839927673,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9547,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 0.1,
+      "step": 66000
+    },
+    {
+      "epoch": 176.02666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029438060938239145,
+      "loss": 0.4829,
+      "step": 66010
+    },
+    {
+      "epoch": 176.05333333333334,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002943789044398177,
+      "loss": 0.4888,
+      "step": 66020
+    },
+    {
+      "epoch": 176.08,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0002943771992435792,
+      "loss": 0.4758,
+      "step": 66030
+    },
+    {
+      "epoch": 176.10666666666665,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029437549379367884,
+      "loss": 0.4713,
+      "step": 66040
+    },
+    {
+      "epoch": 176.13333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002943737880901197,
+      "loss": 0.4724,
+      "step": 66050
+    },
+    {
+      "epoch": 176.16,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.00029437208213290475,
+      "loss": 0.4772,
+      "step": 66060
+    },
+    {
+      "epoch": 176.18666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029437037592203695,
+      "loss": 0.4692,
+      "step": 66070
+    },
+    {
+      "epoch": 176.21333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029436866945751935,
+      "loss": 0.4635,
+      "step": 66080
+    },
+    {
+      "epoch": 176.24,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029436696273935493,
+      "loss": 0.4668,
+      "step": 66090
+    },
+    {
+      "epoch": 176.26666666666668,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002943652557675466,
+      "loss": 0.4723,
+      "step": 66100
+    },
+    {
+      "epoch": 176.29333333333332,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002943635485420976,
+      "loss": 0.4656,
+      "step": 66110
+    },
+    {
+      "epoch": 176.32,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002943618410630107,
+      "loss": 0.4694,
+      "step": 66120
+    },
+    {
+      "epoch": 176.34666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000294360133330289,
+      "loss": 0.4774,
+      "step": 66130
+    },
+    {
+      "epoch": 176.37333333333333,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002943584253439354,
+      "loss": 0.4733,
+      "step": 66140
+    },
+    {
+      "epoch": 176.4,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029435671710395305,
+      "loss": 0.47,
+      "step": 66150
+    },
+    {
+      "epoch": 176.42666666666668,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.00029435500861034486,
+      "loss": 0.4767,
+      "step": 66160
+    },
+    {
+      "epoch": 176.45333333333335,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002943532998631138,
+      "loss": 0.4785,
+      "step": 66170
+    },
+    {
+      "epoch": 176.48,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029435159086226297,
+      "loss": 0.4846,
+      "step": 66180
+    },
+    {
+      "epoch": 176.50666666666666,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029434988160779535,
+      "loss": 0.4742,
+      "step": 66190
+    },
+    {
+      "epoch": 176.53333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002943481720997139,
+      "loss": 0.4683,
+      "step": 66200
+    },
+    {
+      "epoch": 176.56,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002943464623380216,
+      "loss": 0.4692,
+      "step": 66210
+    },
+    {
+      "epoch": 176.58666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002943447523227215,
+      "loss": 0.4655,
+      "step": 66220
+    },
+    {
+      "epoch": 176.61333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029434304205381667,
+      "loss": 0.4513,
+      "step": 66230
+    },
+    {
+      "epoch": 176.64,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029434133153131,
+      "loss": 0.4563,
+      "step": 66240
+    },
+    {
+      "epoch": 176.66666666666666,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002943396207552046,
+      "loss": 0.4718,
+      "step": 66250
+    },
+    {
+      "epoch": 176.69333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002943379097255033,
+      "loss": 0.4545,
+      "step": 66260
+    },
+    {
+      "epoch": 176.72,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029433619844220925,
+      "loss": 0.4715,
+      "step": 66270
+    },
+    {
+      "epoch": 176.74666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029433448690532546,
+      "loss": 0.4751,
+      "step": 66280
+    },
+    {
+      "epoch": 176.77333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002943327751148549,
+      "loss": 0.4833,
+      "step": 66290
+    },
+    {
+      "epoch": 176.8,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002943310630708006,
+      "loss": 0.4702,
+      "step": 66300
+    },
+    {
+      "epoch": 176.82666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002943293507731655,
+      "loss": 0.4649,
+      "step": 66310
+    },
+    {
+      "epoch": 176.85333333333332,
+      "grad_norm": 0.5,
+      "learning_rate": 0.0002943276382219527,
+      "loss": 0.4732,
+      "step": 66320
+    },
+    {
+      "epoch": 176.88,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029432592541716517,
+      "loss": 0.4616,
+      "step": 66330
+    },
+    {
+      "epoch": 176.90666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002943242123588059,
+      "loss": 0.4646,
+      "step": 66340
+    },
+    {
+      "epoch": 176.93333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002943224990468779,
+      "loss": 0.466,
+      "step": 66350
+    },
+    {
+      "epoch": 176.96,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029432078548138424,
+      "loss": 0.4632,
+      "step": 66360
+    },
+    {
+      "epoch": 176.98666666666668,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029431907166232784,
+      "loss": 0.4785,
+      "step": 66370
+    },
+    {
+      "epoch": 177.0,
+      "eval_loss": 0.47848471999168396,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0402,
+      "eval_samples_per_second": 1.594,
+      "eval_steps_per_second": 0.1,
+      "step": 66375
+    },
+    {
+      "epoch": 177.01333333333332,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002943173575897118,
+      "loss": 0.4671,
+      "step": 66380
+    },
+    {
+      "epoch": 177.04,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002943156432635391,
+      "loss": 0.4906,
+      "step": 66390
+    },
+    {
+      "epoch": 177.06666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029431392868381267,
+      "loss": 0.479,
+      "step": 66400
+    },
+    {
+      "epoch": 177.09333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029431221385053567,
+      "loss": 0.4738,
+      "step": 66410
+    },
+    {
+      "epoch": 177.12,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000294310498763711,
+      "loss": 0.4697,
+      "step": 66420
+    },
+    {
+      "epoch": 177.14666666666668,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029430878342334173,
+      "loss": 0.4781,
+      "step": 66430
+    },
+    {
+      "epoch": 177.17333333333335,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002943070678294308,
+      "loss": 0.4696,
+      "step": 66440
+    },
+    {
+      "epoch": 177.2,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.00029430535198198134,
+      "loss": 0.4673,
+      "step": 66450
+    },
+    {
+      "epoch": 177.22666666666666,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002943036358809963,
+      "loss": 0.4648,
+      "step": 66460
+    },
+    {
+      "epoch": 177.25333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002943019195264787,
+      "loss": 0.4646,
+      "step": 66470
+    },
+    {
+      "epoch": 177.28,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002943002029184315,
+      "loss": 0.4787,
+      "step": 66480
+    },
+    {
+      "epoch": 177.30666666666667,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002942984860568578,
+      "loss": 0.4591,
+      "step": 66490
+    },
+    {
+      "epoch": 177.33333333333334,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029429676894176057,
+      "loss": 0.4755,
+      "step": 66500
+    },
+    {
+      "epoch": 177.36,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002942950515731429,
+      "loss": 0.4772,
+      "step": 66510
+    },
+    {
+      "epoch": 177.38666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002942933339510077,
+      "loss": 0.4692,
+      "step": 66520
+    },
+    {
+      "epoch": 177.41333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029429161607535807,
+      "loss": 0.4735,
+      "step": 66530
+    },
+    {
+      "epoch": 177.44,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000294289897946197,
+      "loss": 0.4762,
+      "step": 66540
+    },
+    {
+      "epoch": 177.46666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029428817956352743,
+      "loss": 0.4858,
+      "step": 66550
+    },
+    {
+      "epoch": 177.49333333333334,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029428646092735254,
+      "loss": 0.4765,
+      "step": 66560
+    },
+    {
+      "epoch": 177.52,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002942847420376752,
+      "loss": 0.4707,
+      "step": 66570
+    },
+    {
+      "epoch": 177.54666666666665,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002942830228944985,
+      "loss": 0.4683,
+      "step": 66580
+    },
+    {
+      "epoch": 177.57333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029428130349782553,
+      "loss": 0.4679,
+      "step": 66590
+    },
+    {
+      "epoch": 177.6,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002942795838476592,
+      "loss": 0.4612,
+      "step": 66600
+    },
+    {
+      "epoch": 177.62666666666667,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0002942778639440025,
+      "loss": 0.4485,
+      "step": 66610
+    },
+    {
+      "epoch": 177.65333333333334,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002942761437868586,
+      "loss": 0.4678,
+      "step": 66620
+    },
+    {
+      "epoch": 177.68,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029427442337623036,
+      "loss": 0.4588,
+      "step": 66630
+    },
+    {
+      "epoch": 177.70666666666668,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029427270271212096,
+      "loss": 0.4593,
+      "step": 66640
+    },
+    {
+      "epoch": 177.73333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002942709817945333,
+      "loss": 0.4789,
+      "step": 66650
+    },
+    {
+      "epoch": 177.76,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029426926062347045,
+      "loss": 0.4802,
+      "step": 66660
+    },
+    {
+      "epoch": 177.78666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029426753919893547,
+      "loss": 0.4783,
+      "step": 66670
+    },
+    {
+      "epoch": 177.81333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002942658175209313,
+      "loss": 0.4637,
+      "step": 66680
+    },
+    {
+      "epoch": 177.84,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029426409558946107,
+      "loss": 0.4696,
+      "step": 66690
+    },
+    {
+      "epoch": 177.86666666666667,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002942623734045277,
+      "loss": 0.4686,
+      "step": 66700
+    },
+    {
+      "epoch": 177.89333333333335,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002942606509661343,
+      "loss": 0.4598,
+      "step": 66710
+    },
+    {
+      "epoch": 177.92,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002942589282742838,
+      "loss": 0.4702,
+      "step": 66720
+    },
+    {
+      "epoch": 177.94666666666666,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002942572053289794,
+      "loss": 0.4633,
+      "step": 66730
+    },
+    {
+      "epoch": 177.97333333333333,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002942554821302239,
+      "loss": 0.4747,
+      "step": 66740
+    },
+    {
+      "epoch": 178.0,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002942537586780205,
+      "loss": 0.4638,
+      "step": 66750
+    },
+    {
+      "epoch": 178.0,
+      "eval_loss": 0.47956180572509766,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.918,
+      "eval_samples_per_second": 1.613,
+      "eval_steps_per_second": 0.101,
+      "step": 66750
+    },
+    {
+      "epoch": 178.02666666666667,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029425203497237217,
+      "loss": 0.4836,
+      "step": 66760
+    },
+    {
+      "epoch": 178.05333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002942503110132819,
+      "loss": 0.4888,
+      "step": 66770
+    },
+    {
+      "epoch": 178.08,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002942485868007528,
+      "loss": 0.4758,
+      "step": 66780
+    },
+    {
+      "epoch": 178.10666666666665,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029424686233478785,
+      "loss": 0.471,
+      "step": 66790
+    },
+    {
+      "epoch": 178.13333333333333,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029424513761539006,
+      "loss": 0.4725,
+      "step": 66800
+    },
+    {
+      "epoch": 178.16,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002942434126425625,
+      "loss": 0.4763,
+      "step": 66810
+    },
+    {
+      "epoch": 178.18666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029424168741630817,
+      "loss": 0.469,
+      "step": 66820
+    },
+    {
+      "epoch": 178.21333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029423996193663014,
+      "loss": 0.4637,
+      "step": 66830
+    },
+    {
+      "epoch": 178.24,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002942382362035314,
+      "loss": 0.4674,
+      "step": 66840
+    },
+    {
+      "epoch": 178.26666666666668,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.000294236510217015,
+      "loss": 0.4717,
+      "step": 66850
+    },
+    {
+      "epoch": 178.29333333333332,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000294234783977084,
+      "loss": 0.4658,
+      "step": 66860
+    },
+    {
+      "epoch": 178.32,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029423305748374137,
+      "loss": 0.4691,
+      "step": 66870
+    },
+    {
+      "epoch": 178.34666666666666,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002942313307369902,
+      "loss": 0.4777,
+      "step": 66880
+    },
+    {
+      "epoch": 178.37333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002942296037368335,
+      "loss": 0.4734,
+      "step": 66890
+    },
+    {
+      "epoch": 178.4,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002942278764832743,
+      "loss": 0.4696,
+      "step": 66900
+    },
+    {
+      "epoch": 178.42666666666668,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002942261489763157,
+      "loss": 0.4759,
+      "step": 66910
+    },
+    {
+      "epoch": 178.45333333333335,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002942244212159606,
+      "loss": 0.4785,
+      "step": 66920
+    },
+    {
+      "epoch": 178.48,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002942226932022122,
+      "loss": 0.4842,
+      "step": 66930
+    },
+    {
+      "epoch": 178.50666666666666,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029422096493507334,
+      "loss": 0.4741,
+      "step": 66940
+    },
+    {
+      "epoch": 178.53333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029421923641454725,
+      "loss": 0.4678,
+      "step": 66950
+    },
+    {
+      "epoch": 178.56,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002942175076406369,
+      "loss": 0.4688,
+      "step": 66960
+    },
+    {
+      "epoch": 178.58666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002942157786133452,
+      "loss": 0.4647,
+      "step": 66970
+    },
+    {
+      "epoch": 178.61333333333334,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029421404933267537,
+      "loss": 0.4518,
+      "step": 66980
+    },
+    {
+      "epoch": 178.64,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029421231979863036,
+      "loss": 0.4567,
+      "step": 66990
+    },
+    {
+      "epoch": 178.66666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002942105900112132,
+      "loss": 0.4721,
+      "step": 67000
+    },
+    {
+      "epoch": 178.69333333333333,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.000294208859970427,
+      "loss": 0.4545,
+      "step": 67010
+    },
+    {
+      "epoch": 178.72,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029420712967627476,
+      "loss": 0.4713,
+      "step": 67020
+    },
+    {
+      "epoch": 178.74666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002942053991287595,
+      "loss": 0.476,
+      "step": 67030
+    },
+    {
+      "epoch": 178.77333333333334,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002942036683278843,
+      "loss": 0.4829,
+      "step": 67040
+    },
+    {
+      "epoch": 178.8,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002942019372736521,
+      "loss": 0.4701,
+      "step": 67050
+    },
+    {
+      "epoch": 178.82666666666665,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002942002059660661,
+      "loss": 0.4647,
+      "step": 67060
+    },
+    {
+      "epoch": 178.85333333333332,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029419847440512924,
+      "loss": 0.4732,
+      "step": 67070
+    },
+    {
+      "epoch": 178.88,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029419674259084454,
+      "loss": 0.4614,
+      "step": 67080
+    },
+    {
+      "epoch": 178.90666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002941950105232151,
+      "loss": 0.4643,
+      "step": 67090
+    },
+    {
+      "epoch": 178.93333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029419327820224395,
+      "loss": 0.4664,
+      "step": 67100
+    },
+    {
+      "epoch": 178.96,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002941915456279341,
+      "loss": 0.4638,
+      "step": 67110
+    },
+    {
+      "epoch": 178.98666666666668,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0002941898128002887,
+      "loss": 0.4779,
+      "step": 67120
+    },
+    {
+      "epoch": 179.0,
+      "eval_loss": 0.4776739180088043,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5294,
+      "eval_samples_per_second": 1.52,
+      "eval_steps_per_second": 0.095,
+      "step": 67125
+    },
+    {
+      "epoch": 179.01333333333332,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029418807971931066,
+      "loss": 0.4673,
+      "step": 67130
+    },
+    {
+      "epoch": 179.04,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002941863463850031,
+      "loss": 0.4911,
+      "step": 67140
+    },
+    {
+      "epoch": 179.06666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.000294184612797369,
+      "loss": 0.4787,
+      "step": 67150
+    },
+    {
+      "epoch": 179.09333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029418287895641154,
+      "loss": 0.474,
+      "step": 67160
+    },
+    {
+      "epoch": 179.12,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029418114486213365,
+      "loss": 0.4694,
+      "step": 67170
+    },
+    {
+      "epoch": 179.14666666666668,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029417941051453837,
+      "loss": 0.4784,
+      "step": 67180
+    },
+    {
+      "epoch": 179.17333333333335,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029417767591362884,
+      "loss": 0.4695,
+      "step": 67190
+    },
+    {
+      "epoch": 179.2,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.000294175941059408,
+      "loss": 0.4674,
+      "step": 67200
+    },
+    {
+      "epoch": 179.22666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000294174205951879,
+      "loss": 0.4649,
+      "step": 67210
+    },
+    {
+      "epoch": 179.25333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002941724705910448,
+      "loss": 0.4653,
+      "step": 67220
+    },
+    {
+      "epoch": 179.28,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002941707349769085,
+      "loss": 0.4789,
+      "step": 67230
+    },
+    {
+      "epoch": 179.30666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029416899910947314,
+      "loss": 0.4597,
+      "step": 67240
+    },
+    {
+      "epoch": 179.33333333333334,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002941672629887418,
+      "loss": 0.4757,
+      "step": 67250
+    },
+    {
+      "epoch": 179.36,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002941655266147174,
+      "loss": 0.4771,
+      "step": 67260
+    },
+    {
+      "epoch": 179.38666666666666,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029416378998740314,
+      "loss": 0.4683,
+      "step": 67270
+    },
+    {
+      "epoch": 179.41333333333333,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029416205310680204,
+      "loss": 0.4736,
+      "step": 67280
+    },
+    {
+      "epoch": 179.44,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002941603159729171,
+      "loss": 0.4761,
+      "step": 67290
+    },
+    {
+      "epoch": 179.46666666666667,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029415857858575145,
+      "loss": 0.4856,
+      "step": 67300
+    },
+    {
+      "epoch": 179.49333333333334,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000294156840945308,
+      "loss": 0.477,
+      "step": 67310
+    },
+    {
+      "epoch": 179.52,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029415510305159004,
+      "loss": 0.4715,
+      "step": 67320
+    },
+    {
+      "epoch": 179.54666666666665,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029415336490460037,
+      "loss": 0.4682,
+      "step": 67330
+    },
+    {
+      "epoch": 179.57333333333332,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029415162650434226,
+      "loss": 0.4674,
+      "step": 67340
+    },
+    {
+      "epoch": 179.6,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029414988785081853,
+      "loss": 0.4614,
+      "step": 67350
+    },
+    {
+      "epoch": 179.62666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029414814894403244,
+      "loss": 0.448,
+      "step": 67360
+    },
+    {
+      "epoch": 179.65333333333334,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002941464097839869,
+      "loss": 0.4686,
+      "step": 67370
+    },
+    {
+      "epoch": 179.68,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029414467037068516,
+      "loss": 0.4595,
+      "step": 67380
+    },
+    {
+      "epoch": 179.70666666666668,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002941429307041301,
+      "loss": 0.459,
+      "step": 67390
+    },
+    {
+      "epoch": 179.73333333333332,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029414119078432474,
+      "loss": 0.4788,
+      "step": 67400
+    },
+    {
+      "epoch": 179.76,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002941394506112723,
+      "loss": 0.4799,
+      "step": 67410
+    },
+    {
+      "epoch": 179.78666666666666,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029413771018497575,
+      "loss": 0.4783,
+      "step": 67420
+    },
+    {
+      "epoch": 179.81333333333333,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029413596950543817,
+      "loss": 0.4635,
+      "step": 67430
+    },
+    {
+      "epoch": 179.84,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0002941342285726626,
+      "loss": 0.4694,
+      "step": 67440
+    },
+    {
+      "epoch": 179.86666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002941324873866521,
+      "loss": 0.4689,
+      "step": 67450
+    },
+    {
+      "epoch": 179.89333333333335,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002941307459474097,
+      "loss": 0.4593,
+      "step": 67460
+    },
+    {
+      "epoch": 179.92,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029412900425493854,
+      "loss": 0.47,
+      "step": 67470
+    },
+    {
+      "epoch": 179.94666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029412726230924164,
+      "loss": 0.4629,
+      "step": 67480
+    },
+    {
+      "epoch": 179.97333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029412552011032206,
+      "loss": 0.4744,
+      "step": 67490
+    },
+    {
+      "epoch": 180.0,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029412377765818283,
+      "loss": 0.463,
+      "step": 67500
+    },
+    {
+      "epoch": 180.0,
+      "eval_loss": 0.47930899262428284,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.2008,
+      "eval_samples_per_second": 1.428,
+      "eval_steps_per_second": 0.089,
+      "step": 67500
+    },
+    {
+      "epoch": 180.02666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029412203495282705,
+      "loss": 0.4827,
+      "step": 67510
+    },
+    {
+      "epoch": 180.05333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029412029199425775,
+      "loss": 0.4879,
+      "step": 67520
+    },
+    {
+      "epoch": 180.08,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00029411854878247807,
+      "loss": 0.4751,
+      "step": 67530
+    },
+    {
+      "epoch": 180.10666666666665,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029411680531749094,
+      "loss": 0.471,
+      "step": 67540
+    },
+    {
+      "epoch": 180.13333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029411506159929957,
+      "loss": 0.4723,
+      "step": 67550
+    },
+    {
+      "epoch": 180.16,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029411331762790687,
+      "loss": 0.4764,
+      "step": 67560
+    },
+    {
+      "epoch": 180.18666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029411157340331604,
+      "loss": 0.4692,
+      "step": 67570
+    },
+    {
+      "epoch": 180.21333333333334,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002941098289255301,
+      "loss": 0.464,
+      "step": 67580
+    },
+    {
+      "epoch": 180.24,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00029410808419455206,
+      "loss": 0.4669,
+      "step": 67590
+    },
+    {
+      "epoch": 180.26666666666668,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002941063392103851,
+      "loss": 0.472,
+      "step": 67600
+    },
+    {
+      "epoch": 180.29333333333332,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.00029410459397303216,
+      "loss": 0.4662,
+      "step": 67610
+    },
+    {
+      "epoch": 180.32,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.00029410284848249636,
+      "loss": 0.4687,
+      "step": 67620
+    },
+    {
+      "epoch": 180.34666666666666,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029410110273878076,
+      "loss": 0.4779,
+      "step": 67630
+    },
+    {
+      "epoch": 180.37333333333333,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002940993567418885,
+      "loss": 0.4735,
+      "step": 67640
+    },
+    {
+      "epoch": 180.4,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029409761049182257,
+      "loss": 0.4704,
+      "step": 67650
+    },
+    {
+      "epoch": 180.42666666666668,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.000294095863988586,
+      "loss": 0.476,
+      "step": 67660
+    },
+    {
+      "epoch": 180.45333333333335,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002940941172321819,
+      "loss": 0.4786,
+      "step": 67670
+    },
+    {
+      "epoch": 180.48,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002940923702226134,
+      "loss": 0.4852,
+      "step": 67680
+    },
+    {
+      "epoch": 180.50666666666666,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.00029409062295988355,
+      "loss": 0.4738,
+      "step": 67690
+    },
+    {
+      "epoch": 180.53333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002940888754439953,
+      "loss": 0.4675,
+      "step": 67700
+    },
+    {
+      "epoch": 180.56,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002940871276749519,
+      "loss": 0.4694,
+      "step": 67710
+    },
+    {
+      "epoch": 180.58666666666667,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002940853796527563,
+      "loss": 0.4653,
+      "step": 67720
+    },
+    {
+      "epoch": 180.61333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029408363137741156,
+      "loss": 0.4516,
+      "step": 67730
+    },
+    {
+      "epoch": 180.64,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002940818828489208,
+      "loss": 0.4566,
+      "step": 67740
+    },
+    {
+      "epoch": 180.66666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002940801340672871,
+      "loss": 0.4704,
+      "step": 67750
+    },
+    {
+      "epoch": 180.69333333333333,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.00029407838503251355,
+      "loss": 0.4555,
+      "step": 67760
+    },
+    {
+      "epoch": 180.72,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029407663574460317,
+      "loss": 0.4713,
+      "step": 67770
+    },
+    {
+      "epoch": 180.74666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.000294074886203559,
+      "loss": 0.4758,
+      "step": 67780
+    },
+    {
+      "epoch": 180.77333333333334,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0002940731364093842,
+      "loss": 0.4832,
+      "step": 67790
+    },
+    {
+      "epoch": 180.8,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029407138636208184,
+      "loss": 0.4709,
+      "step": 67800
+    },
+    {
+      "epoch": 180.82666666666665,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029406963606165494,
+      "loss": 0.4644,
+      "step": 67810
+    },
+    {
+      "epoch": 180.85333333333332,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029406788550810663,
+      "loss": 0.4734,
+      "step": 67820
+    },
+    {
+      "epoch": 180.88,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029406613470143994,
+      "loss": 0.4615,
+      "step": 67830
+    },
+    {
+      "epoch": 180.90666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029406438364165796,
+      "loss": 0.4643,
+      "step": 67840
+    },
+    {
+      "epoch": 180.93333333333334,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002940626323287638,
+      "loss": 0.4661,
+      "step": 67850
+    },
+    {
+      "epoch": 180.96,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002940608807627604,
+      "loss": 0.4639,
+      "step": 67860
+    },
+    {
+      "epoch": 180.98666666666668,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029405912894365106,
+      "loss": 0.4776,
+      "step": 67870
+    },
+    {
+      "epoch": 181.0,
+      "eval_loss": 0.47777873277664185,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7052,
+      "eval_samples_per_second": 1.649,
+      "eval_steps_per_second": 0.103,
+      "step": 67875
+    },
+    {
+      "epoch": 181.01333333333332,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002940573768714387,
+      "loss": 0.4674,
+      "step": 67880
+    },
+    {
+      "epoch": 181.04,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029405562454612645,
+      "loss": 0.4913,
+      "step": 67890
+    },
+    {
+      "epoch": 181.06666666666666,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029405387196771734,
+      "loss": 0.4786,
+      "step": 67900
+    },
+    {
+      "epoch": 181.09333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029405211913621456,
+      "loss": 0.4739,
+      "step": 67910
+    },
+    {
+      "epoch": 181.12,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029405036605162105,
+      "loss": 0.4692,
+      "step": 67920
+    },
+    {
+      "epoch": 181.14666666666668,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029404861271394,
+      "loss": 0.4791,
+      "step": 67930
+    },
+    {
+      "epoch": 181.17333333333335,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002940468591231744,
+      "loss": 0.4696,
+      "step": 67940
+    },
+    {
+      "epoch": 181.2,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029404510527932746,
+      "loss": 0.4676,
+      "step": 67950
+    },
+    {
+      "epoch": 181.22666666666666,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0002940433511824021,
+      "loss": 0.4649,
+      "step": 67960
+    },
+    {
+      "epoch": 181.25333333333333,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002940415968324015,
+      "loss": 0.4652,
+      "step": 67970
+    },
+    {
+      "epoch": 181.28,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029403984222932877,
+      "loss": 0.4789,
+      "step": 67980
+    },
+    {
+      "epoch": 181.30666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002940380873731869,
+      "loss": 0.46,
+      "step": 67990
+    },
+    {
+      "epoch": 181.33333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029403633226397904,
+      "loss": 0.4756,
+      "step": 68000
+    },
+    {
+      "epoch": 181.36,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029403457690170825,
+      "loss": 0.4767,
+      "step": 68010
+    },
+    {
+      "epoch": 181.38666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002940328212863776,
+      "loss": 0.4687,
+      "step": 68020
+    },
+    {
+      "epoch": 181.41333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029403106541799024,
+      "loss": 0.4728,
+      "step": 68030
+    },
+    {
+      "epoch": 181.44,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029402930929654916,
+      "loss": 0.4755,
+      "step": 68040
+    },
+    {
+      "epoch": 181.46666666666667,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0002940275529220575,
+      "loss": 0.4852,
+      "step": 68050
+    },
+    {
+      "epoch": 181.49333333333334,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002940257962945184,
+      "loss": 0.4768,
+      "step": 68060
+    },
+    {
+      "epoch": 181.52,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029402403941393483,
+      "loss": 0.4711,
+      "step": 68070
+    },
+    {
+      "epoch": 181.54666666666665,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002940222822803099,
+      "loss": 0.4682,
+      "step": 68080
+    },
+    {
+      "epoch": 181.57333333333332,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00029402052489364676,
+      "loss": 0.4674,
+      "step": 68090
+    },
+    {
+      "epoch": 181.6,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002940187672539485,
+      "loss": 0.4607,
+      "step": 68100
+    },
+    {
+      "epoch": 181.62666666666667,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029401700936121817,
+      "loss": 0.4478,
+      "step": 68110
+    },
+    {
+      "epoch": 181.65333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002940152512154588,
+      "loss": 0.4679,
+      "step": 68120
+    },
+    {
+      "epoch": 181.68,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029401349281667357,
+      "loss": 0.459,
+      "step": 68130
+    },
+    {
+      "epoch": 181.70666666666668,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029401173416486557,
+      "loss": 0.4594,
+      "step": 68140
+    },
+    {
+      "epoch": 181.73333333333332,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029400997526003784,
+      "loss": 0.4781,
+      "step": 68150
+    },
+    {
+      "epoch": 181.76,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0002940082161021935,
+      "loss": 0.4795,
+      "step": 68160
+    },
+    {
+      "epoch": 181.78666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029400645669133566,
+      "loss": 0.4785,
+      "step": 68170
+    },
+    {
+      "epoch": 181.81333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002940046970274673,
+      "loss": 0.4639,
+      "step": 68180
+    },
+    {
+      "epoch": 181.84,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029400293711059173,
+      "loss": 0.469,
+      "step": 68190
+    },
+    {
+      "epoch": 181.86666666666667,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002940011769407118,
+      "loss": 0.4687,
+      "step": 68200
+    },
+    {
+      "epoch": 181.89333333333335,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002939994165178307,
+      "loss": 0.4592,
+      "step": 68210
+    },
+    {
+      "epoch": 181.92,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002939976558419516,
+      "loss": 0.4701,
+      "step": 68220
+    },
+    {
+      "epoch": 181.94666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029399589491307753,
+      "loss": 0.4629,
+      "step": 68230
+    },
+    {
+      "epoch": 181.97333333333333,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002939941337312115,
+      "loss": 0.4745,
+      "step": 68240
+    },
+    {
+      "epoch": 182.0,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029399237229635675,
+      "loss": 0.4629,
+      "step": 68250
+    },
+    {
+      "epoch": 182.0,
+      "eval_loss": 0.4777792990207672,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8783,
+      "eval_samples_per_second": 1.62,
+      "eval_steps_per_second": 0.101,
+      "step": 68250
+    },
+    {
+      "epoch": 182.02666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002939906106085163,
+      "loss": 0.4823,
+      "step": 68260
+    },
+    {
+      "epoch": 182.05333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002939888486676932,
+      "loss": 0.4881,
+      "step": 68270
+    },
+    {
+      "epoch": 182.08,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029398708647389066,
+      "loss": 0.4751,
+      "step": 68280
+    },
+    {
+      "epoch": 182.10666666666665,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002939853240271117,
+      "loss": 0.4704,
+      "step": 68290
+    },
+    {
+      "epoch": 182.13333333333333,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002939835613273594,
+      "loss": 0.4724,
+      "step": 68300
+    },
+    {
+      "epoch": 182.16,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.000293981798374637,
+      "loss": 0.4764,
+      "step": 68310
+    },
+    {
+      "epoch": 182.18666666666667,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002939800351689474,
+      "loss": 0.4693,
+      "step": 68320
+    },
+    {
+      "epoch": 182.21333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002939782717102938,
+      "loss": 0.4633,
+      "step": 68330
+    },
+    {
+      "epoch": 182.24,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029397650799867925,
+      "loss": 0.4672,
+      "step": 68340
+    },
+    {
+      "epoch": 182.26666666666668,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0002939747440341069,
+      "loss": 0.471,
+      "step": 68350
+    },
+    {
+      "epoch": 182.29333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029397297981657985,
+      "loss": 0.4657,
+      "step": 68360
+    },
+    {
+      "epoch": 182.32,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002939712153461012,
+      "loss": 0.4686,
+      "step": 68370
+    },
+    {
+      "epoch": 182.34666666666666,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000293969450622674,
+      "loss": 0.4774,
+      "step": 68380
+    },
+    {
+      "epoch": 182.37333333333333,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029396768564630136,
+      "loss": 0.4727,
+      "step": 68390
+    },
+    {
+      "epoch": 182.4,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002939659204169864,
+      "loss": 0.4698,
+      "step": 68400
+    },
+    {
+      "epoch": 182.42666666666668,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002939641549347322,
+      "loss": 0.476,
+      "step": 68410
+    },
+    {
+      "epoch": 182.45333333333335,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.000293962389199542,
+      "loss": 0.4782,
+      "step": 68420
+    },
+    {
+      "epoch": 182.48,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.00029396062321141873,
+      "loss": 0.4842,
+      "step": 68430
+    },
+    {
+      "epoch": 182.50666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002939588569703655,
+      "loss": 0.4744,
+      "step": 68440
+    },
+    {
+      "epoch": 182.53333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002939570904763855,
+      "loss": 0.4671,
+      "step": 68450
+    },
+    {
+      "epoch": 182.56,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029395532372948185,
+      "loss": 0.4696,
+      "step": 68460
+    },
+    {
+      "epoch": 182.58666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029395355672965756,
+      "loss": 0.4644,
+      "step": 68470
+    },
+    {
+      "epoch": 182.61333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029395178947691573,
+      "loss": 0.4512,
+      "step": 68480
+    },
+    {
+      "epoch": 182.64,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029395002197125957,
+      "loss": 0.456,
+      "step": 68490
+    },
+    {
+      "epoch": 182.66666666666666,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002939482542126921,
+      "loss": 0.4707,
+      "step": 68500
+    },
+    {
+      "epoch": 182.69333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029394648620121645,
+      "loss": 0.4547,
+      "step": 68510
+    },
+    {
+      "epoch": 182.72,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002939447179368357,
+      "loss": 0.4712,
+      "step": 68520
+    },
+    {
+      "epoch": 182.74666666666667,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000293942949419553,
+      "loss": 0.4757,
+      "step": 68530
+    },
+    {
+      "epoch": 182.77333333333334,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.00029394118064937153,
+      "loss": 0.4827,
+      "step": 68540
+    },
+    {
+      "epoch": 182.8,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.0002939394116262942,
+      "loss": 0.4701,
+      "step": 68550
+    },
+    {
+      "epoch": 182.82666666666665,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002939376423503243,
+      "loss": 0.4647,
+      "step": 68560
+    },
+    {
+      "epoch": 182.85333333333332,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029393587282146484,
+      "loss": 0.4732,
+      "step": 68570
+    },
+    {
+      "epoch": 182.88,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029393410303971894,
+      "loss": 0.4617,
+      "step": 68580
+    },
+    {
+      "epoch": 182.90666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002939323330050897,
+      "loss": 0.4646,
+      "step": 68590
+    },
+    {
+      "epoch": 182.93333333333334,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002939305627175803,
+      "loss": 0.4658,
+      "step": 68600
+    },
+    {
+      "epoch": 182.96,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002939287921771938,
+      "loss": 0.4648,
+      "step": 68610
+    },
+    {
+      "epoch": 182.98666666666668,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029392702138393334,
+      "loss": 0.478,
+      "step": 68620
+    },
+    {
+      "epoch": 183.0,
+      "eval_loss": 0.47816112637519836,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7955,
+      "eval_samples_per_second": 1.633,
+      "eval_steps_per_second": 0.102,
+      "step": 68625
+    },
+    {
+      "epoch": 183.01333333333332,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.000293925250337802,
+      "loss": 0.4666,
+      "step": 68630
+    },
+    {
+      "epoch": 183.04,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029392347903880285,
+      "loss": 0.4909,
+      "step": 68640
+    },
+    {
+      "epoch": 183.06666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029392170748693904,
+      "loss": 0.4787,
+      "step": 68650
+    },
+    {
+      "epoch": 183.09333333333333,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029391993568221375,
+      "loss": 0.4737,
+      "step": 68660
+    },
+    {
+      "epoch": 183.12,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029391816362463,
+      "loss": 0.4694,
+      "step": 68670
+    },
+    {
+      "epoch": 183.14666666666668,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002939163913141909,
+      "loss": 0.4784,
+      "step": 68680
+    },
+    {
+      "epoch": 183.17333333333335,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029391461875089974,
+      "loss": 0.4691,
+      "step": 68690
+    },
+    {
+      "epoch": 183.2,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002939128459347594,
+      "loss": 0.4679,
+      "step": 68700
+    },
+    {
+      "epoch": 183.22666666666666,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0002939110728657731,
+      "loss": 0.4647,
+      "step": 68710
+    },
+    {
+      "epoch": 183.25333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002939092995439439,
+      "loss": 0.4651,
+      "step": 68720
+    },
+    {
+      "epoch": 183.28,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000293907525969275,
+      "loss": 0.4786,
+      "step": 68730
+    },
+    {
+      "epoch": 183.30666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029390575214176956,
+      "loss": 0.4599,
+      "step": 68740
+    },
+    {
+      "epoch": 183.33333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002939039780614305,
+      "loss": 0.4749,
+      "step": 68750
+    },
+    {
+      "epoch": 183.36,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002939022037282611,
+      "loss": 0.4769,
+      "step": 68760
+    },
+    {
+      "epoch": 183.38666666666666,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029390042914226446,
+      "loss": 0.4688,
+      "step": 68770
+    },
+    {
+      "epoch": 183.41333333333333,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.00029389865430344363,
+      "loss": 0.4731,
+      "step": 68780
+    },
+    {
+      "epoch": 183.44,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029389687921180176,
+      "loss": 0.476,
+      "step": 68790
+    },
+    {
+      "epoch": 183.46666666666667,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.000293895103867342,
+      "loss": 0.4856,
+      "step": 68800
+    },
+    {
+      "epoch": 183.49333333333334,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002938933282700674,
+      "loss": 0.4773,
+      "step": 68810
+    },
+    {
+      "epoch": 183.52,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002938915524199812,
+      "loss": 0.4721,
+      "step": 68820
+    },
+    {
+      "epoch": 183.54666666666665,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.0002938897763170864,
+      "loss": 0.4679,
+      "step": 68830
+    },
+    {
+      "epoch": 183.57333333333332,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002938879999613861,
+      "loss": 0.4679,
+      "step": 68840
+    },
+    {
+      "epoch": 183.6,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002938862233528836,
+      "loss": 0.4613,
+      "step": 68850
+    },
+    {
+      "epoch": 183.62666666666667,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.00029388444649158183,
+      "loss": 0.4487,
+      "step": 68860
+    },
+    {
+      "epoch": 183.65333333333334,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029388266937748403,
+      "loss": 0.4679,
+      "step": 68870
+    },
+    {
+      "epoch": 183.68,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029388089201059326,
+      "loss": 0.4594,
+      "step": 68880
+    },
+    {
+      "epoch": 183.70666666666668,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029387911439091263,
+      "loss": 0.4592,
+      "step": 68890
+    },
+    {
+      "epoch": 183.73333333333332,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029387733651844533,
+      "loss": 0.4783,
+      "step": 68900
+    },
+    {
+      "epoch": 183.76,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002938755583931945,
+      "loss": 0.4797,
+      "step": 68910
+    },
+    {
+      "epoch": 183.78666666666666,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029387378001516313,
+      "loss": 0.4785,
+      "step": 68920
+    },
+    {
+      "epoch": 183.81333333333333,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.00029387200138435447,
+      "loss": 0.464,
+      "step": 68930
+    },
+    {
+      "epoch": 183.84,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0002938702225007716,
+      "loss": 0.4696,
+      "step": 68940
+    },
+    {
+      "epoch": 183.86666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029386844336441763,
+      "loss": 0.4687,
+      "step": 68950
+    },
+    {
+      "epoch": 183.89333333333335,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.00029386666397529575,
+      "loss": 0.4593,
+      "step": 68960
+    },
+    {
+      "epoch": 183.92,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000293864884333409,
+      "loss": 0.4704,
+      "step": 68970
+    },
+    {
+      "epoch": 183.94666666666666,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002938631044387606,
+      "loss": 0.4625,
+      "step": 68980
+    },
+    {
+      "epoch": 183.97333333333333,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029386132429135356,
+      "loss": 0.4744,
+      "step": 68990
+    },
+    {
+      "epoch": 184.0,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002938595438911911,
+      "loss": 0.4631,
+      "step": 69000
+    },
+    {
+      "epoch": 184.0,
+      "eval_loss": 0.47846052050590515,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.162,
+      "eval_samples_per_second": 1.574,
+      "eval_steps_per_second": 0.098,
+      "step": 69000
+    },
+    {
+      "epoch": 184.02666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029385776323827625,
+      "loss": 0.4831,
+      "step": 69010
+    },
+    {
+      "epoch": 184.05333333333334,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002938559823326123,
+      "loss": 0.4893,
+      "step": 69020
+    },
+    {
+      "epoch": 184.08,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029385420117420227,
+      "loss": 0.4752,
+      "step": 69030
+    },
+    {
+      "epoch": 184.10666666666665,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029385241976304934,
+      "loss": 0.4707,
+      "step": 69040
+    },
+    {
+      "epoch": 184.13333333333333,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029385063809915654,
+      "loss": 0.4719,
+      "step": 69050
+    },
+    {
+      "epoch": 184.16,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002938488561825271,
+      "loss": 0.4767,
+      "step": 69060
+    },
+    {
+      "epoch": 184.18666666666667,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002938470740131641,
+      "loss": 0.4691,
+      "step": 69070
+    },
+    {
+      "epoch": 184.21333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002938452915910707,
+      "loss": 0.4635,
+      "step": 69080
+    },
+    {
+      "epoch": 184.24,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029384350891625003,
+      "loss": 0.4677,
+      "step": 69090
+    },
+    {
+      "epoch": 184.26666666666668,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002938417259887052,
+      "loss": 0.4719,
+      "step": 69100
+    },
+    {
+      "epoch": 184.29333333333332,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029383994280843934,
+      "loss": 0.4656,
+      "step": 69110
+    },
+    {
+      "epoch": 184.32,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029383815937545564,
+      "loss": 0.4689,
+      "step": 69120
+    },
+    {
+      "epoch": 184.34666666666666,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029383637568975715,
+      "loss": 0.4769,
+      "step": 69130
+    },
+    {
+      "epoch": 184.37333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029383459175134707,
+      "loss": 0.4733,
+      "step": 69140
+    },
+    {
+      "epoch": 184.4,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002938328075602285,
+      "loss": 0.4699,
+      "step": 69150
+    },
+    {
+      "epoch": 184.42666666666668,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0002938310231164046,
+      "loss": 0.4758,
+      "step": 69160
+    },
+    {
+      "epoch": 184.45333333333335,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029382923841987847,
+      "loss": 0.4782,
+      "step": 69170
+    },
+    {
+      "epoch": 184.48,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029382745347065326,
+      "loss": 0.4844,
+      "step": 69180
+    },
+    {
+      "epoch": 184.50666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002938256682687322,
+      "loss": 0.4743,
+      "step": 69190
+    },
+    {
+      "epoch": 184.53333333333333,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002938238828141182,
+      "loss": 0.4676,
+      "step": 69200
+    },
+    {
+      "epoch": 184.56,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029382209710681457,
+      "loss": 0.4692,
+      "step": 69210
+    },
+    {
+      "epoch": 184.58666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002938203111468245,
+      "loss": 0.4645,
+      "step": 69220
+    },
+    {
+      "epoch": 184.61333333333334,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029381852493415094,
+      "loss": 0.4509,
+      "step": 69230
+    },
+    {
+      "epoch": 184.64,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002938167384687972,
+      "loss": 0.4557,
+      "step": 69240
+    },
+    {
+      "epoch": 184.66666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002938149517507663,
+      "loss": 0.4717,
+      "step": 69250
+    },
+    {
+      "epoch": 184.69333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002938131647800614,
+      "loss": 0.4543,
+      "step": 69260
+    },
+    {
+      "epoch": 184.72,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029381137755668573,
+      "loss": 0.4713,
+      "step": 69270
+    },
+    {
+      "epoch": 184.74666666666667,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.00029380959008064235,
+      "loss": 0.4758,
+      "step": 69280
+    },
+    {
+      "epoch": 184.77333333333334,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0002938078023519344,
+      "loss": 0.4827,
+      "step": 69290
+    },
+    {
+      "epoch": 184.8,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002938060143705651,
+      "loss": 0.4702,
+      "step": 69300
+    },
+    {
+      "epoch": 184.82666666666665,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029380422613653744,
+      "loss": 0.4647,
+      "step": 69310
+    },
+    {
+      "epoch": 184.85333333333332,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002938024376498547,
+      "loss": 0.4733,
+      "step": 69320
+    },
+    {
+      "epoch": 184.88,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002938006489105199,
+      "loss": 0.4616,
+      "step": 69330
+    },
+    {
+      "epoch": 184.90666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029379885991853634,
+      "loss": 0.4641,
+      "step": 69340
+    },
+    {
+      "epoch": 184.93333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029379707067390704,
+      "loss": 0.4656,
+      "step": 69350
+    },
+    {
+      "epoch": 184.96,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002937952811766352,
+      "loss": 0.4643,
+      "step": 69360
+    },
+    {
+      "epoch": 184.98666666666668,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029379349142672395,
+      "loss": 0.4784,
+      "step": 69370
+    },
+    {
+      "epoch": 185.0,
+      "eval_loss": 0.47779580950737,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7285,
+      "eval_samples_per_second": 1.491,
+      "eval_steps_per_second": 0.093,
+      "step": 69375
+    },
+    {
+      "epoch": 185.01333333333332,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002937917014241764,
+      "loss": 0.4669,
+      "step": 69380
+    },
+    {
+      "epoch": 185.04,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029378991116899575,
+      "loss": 0.491,
+      "step": 69390
+    },
+    {
+      "epoch": 185.06666666666666,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002937881206611851,
+      "loss": 0.4783,
+      "step": 69400
+    },
+    {
+      "epoch": 185.09333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002937863299007477,
+      "loss": 0.4734,
+      "step": 69410
+    },
+    {
+      "epoch": 185.12,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002937845388876865,
+      "loss": 0.4696,
+      "step": 69420
+    },
+    {
+      "epoch": 185.14666666666668,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002937827476220048,
+      "loss": 0.4788,
+      "step": 69430
+    },
+    {
+      "epoch": 185.17333333333335,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029378095610370575,
+      "loss": 0.4688,
+      "step": 69440
+    },
+    {
+      "epoch": 185.2,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002937791643327924,
+      "loss": 0.4681,
+      "step": 69450
+    },
+    {
+      "epoch": 185.22666666666666,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029377737230926795,
+      "loss": 0.4648,
+      "step": 69460
+    },
+    {
+      "epoch": 185.25333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029377558003313553,
+      "loss": 0.4652,
+      "step": 69470
+    },
+    {
+      "epoch": 185.28,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002937737875043984,
+      "loss": 0.4791,
+      "step": 69480
+    },
+    {
+      "epoch": 185.30666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002937719947230596,
+      "loss": 0.4597,
+      "step": 69490
+    },
+    {
+      "epoch": 185.33333333333334,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002937702016891222,
+      "loss": 0.4753,
+      "step": 69500
+    },
+    {
+      "epoch": 185.36,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029376840840258955,
+      "loss": 0.4766,
+      "step": 69510
+    },
+    {
+      "epoch": 185.38666666666666,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029376661486346466,
+      "loss": 0.4689,
+      "step": 69520
+    },
+    {
+      "epoch": 185.41333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002937648210717507,
+      "loss": 0.4728,
+      "step": 69530
+    },
+    {
+      "epoch": 185.44,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002937630270274509,
+      "loss": 0.476,
+      "step": 69540
+    },
+    {
+      "epoch": 185.46666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002937612327305683,
+      "loss": 0.4857,
+      "step": 69550
+    },
+    {
+      "epoch": 185.49333333333334,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029375943818110614,
+      "loss": 0.477,
+      "step": 69560
+    },
+    {
+      "epoch": 185.52,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029375764337906754,
+      "loss": 0.4717,
+      "step": 69570
+    },
+    {
+      "epoch": 185.54666666666665,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029375584832445564,
+      "loss": 0.4686,
+      "step": 69580
+    },
+    {
+      "epoch": 185.57333333333332,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002937540530172736,
+      "loss": 0.4677,
+      "step": 69590
+    },
+    {
+      "epoch": 185.6,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002937522574575246,
+      "loss": 0.4605,
+      "step": 69600
+    },
+    {
+      "epoch": 185.62666666666667,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029375046164521176,
+      "loss": 0.4481,
+      "step": 69610
+    },
+    {
+      "epoch": 185.65333333333334,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002937486655803382,
+      "loss": 0.468,
+      "step": 69620
+    },
+    {
+      "epoch": 185.68,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002937468692629072,
+      "loss": 0.4587,
+      "step": 69630
+    },
+    {
+      "epoch": 185.70666666666668,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002937450726929218,
+      "loss": 0.4594,
+      "step": 69640
+    },
+    {
+      "epoch": 185.73333333333332,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002937432758703853,
+      "loss": 0.4789,
+      "step": 69650
+    },
+    {
+      "epoch": 185.76,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002937414787953007,
+      "loss": 0.4802,
+      "step": 69660
+    },
+    {
+      "epoch": 185.78666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029373968146767114,
+      "loss": 0.478,
+      "step": 69670
+    },
+    {
+      "epoch": 185.81333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029373788388749995,
+      "loss": 0.464,
+      "step": 69680
+    },
+    {
+      "epoch": 185.84,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00029373608605479015,
+      "loss": 0.4693,
+      "step": 69690
+    },
+    {
+      "epoch": 185.86666666666667,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002937342879695449,
+      "loss": 0.4685,
+      "step": 69700
+    },
+    {
+      "epoch": 185.89333333333335,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002937324896317674,
+      "loss": 0.4595,
+      "step": 69710
+    },
+    {
+      "epoch": 185.92,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002937306910414609,
+      "loss": 0.4701,
+      "step": 69720
+    },
+    {
+      "epoch": 185.94666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002937288921986284,
+      "loss": 0.4626,
+      "step": 69730
+    },
+    {
+      "epoch": 185.97333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002937270931032731,
+      "loss": 0.4744,
+      "step": 69740
+    },
+    {
+      "epoch": 186.0,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029372529375539826,
+      "loss": 0.463,
+      "step": 69750
+    },
+    {
+      "epoch": 186.0,
+      "eval_loss": 0.47842052578926086,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8196,
+      "eval_samples_per_second": 1.629,
+      "eval_steps_per_second": 0.102,
+      "step": 69750
+    },
+    {
+      "epoch": 186.02666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002937234941550069,
+      "loss": 0.4828,
+      "step": 69760
+    },
+    {
+      "epoch": 186.05333333333334,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002937216943021023,
+      "loss": 0.4886,
+      "step": 69770
+    },
+    {
+      "epoch": 186.08,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029371989419668757,
+      "loss": 0.476,
+      "step": 69780
+    },
+    {
+      "epoch": 186.10666666666665,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029371809383876585,
+      "loss": 0.4705,
+      "step": 69790
+    },
+    {
+      "epoch": 186.13333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029371629322834034,
+      "loss": 0.4727,
+      "step": 69800
+    },
+    {
+      "epoch": 186.16,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002937144923654142,
+      "loss": 0.4761,
+      "step": 69810
+    },
+    {
+      "epoch": 186.18666666666667,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002937126912499906,
+      "loss": 0.4683,
+      "step": 69820
+    },
+    {
+      "epoch": 186.21333333333334,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029371088988207263,
+      "loss": 0.4634,
+      "step": 69830
+    },
+    {
+      "epoch": 186.24,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029370908826166357,
+      "loss": 0.4675,
+      "step": 69840
+    },
+    {
+      "epoch": 186.26666666666668,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029370728638876653,
+      "loss": 0.4717,
+      "step": 69850
+    },
+    {
+      "epoch": 186.29333333333332,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002937054842633847,
+      "loss": 0.4655,
+      "step": 69860
+    },
+    {
+      "epoch": 186.32,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029370368188552117,
+      "loss": 0.4687,
+      "step": 69870
+    },
+    {
+      "epoch": 186.34666666666666,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002937018792551792,
+      "loss": 0.4771,
+      "step": 69880
+    },
+    {
+      "epoch": 186.37333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029370007637236187,
+      "loss": 0.4733,
+      "step": 69890
+    },
+    {
+      "epoch": 186.4,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002936982732370724,
+      "loss": 0.4699,
+      "step": 69900
+    },
+    {
+      "epoch": 186.42666666666668,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029369646984931405,
+      "loss": 0.4758,
+      "step": 69910
+    },
+    {
+      "epoch": 186.45333333333335,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002936946662090898,
+      "loss": 0.4779,
+      "step": 69920
+    },
+    {
+      "epoch": 186.48,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002936928623164029,
+      "loss": 0.4843,
+      "step": 69930
+    },
+    {
+      "epoch": 186.50666666666666,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.00029369105817125655,
+      "loss": 0.4745,
+      "step": 69940
+    },
+    {
+      "epoch": 186.53333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002936892537736539,
+      "loss": 0.4679,
+      "step": 69950
+    },
+    {
+      "epoch": 186.56,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029368744912359817,
+      "loss": 0.4693,
+      "step": 69960
+    },
+    {
+      "epoch": 186.58666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029368564422109245,
+      "loss": 0.465,
+      "step": 69970
+    },
+    {
+      "epoch": 186.61333333333334,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002936838390661399,
+      "loss": 0.4512,
+      "step": 69980
+    },
+    {
+      "epoch": 186.64,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029368203365874373,
+      "loss": 0.456,
+      "step": 69990
+    },
+    {
+      "epoch": 186.66666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002936802279989072,
+      "loss": 0.4711,
+      "step": 70000
+    },
+    {
+      "epoch": 186.69333333333333,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002936784220866333,
+      "loss": 0.4541,
+      "step": 70010
+    },
+    {
+      "epoch": 186.72,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029367661592192537,
+      "loss": 0.471,
+      "step": 70020
+    },
+    {
+      "epoch": 186.74666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002936748095047865,
+      "loss": 0.4752,
+      "step": 70030
+    },
+    {
+      "epoch": 186.77333333333334,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029367300283521984,
+      "loss": 0.4829,
+      "step": 70040
+    },
+    {
+      "epoch": 186.8,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.00029367119591322863,
+      "loss": 0.4707,
+      "step": 70050
+    },
+    {
+      "epoch": 186.82666666666665,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.000293669388738816,
+      "loss": 0.4642,
+      "step": 70060
+    },
+    {
+      "epoch": 186.85333333333332,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029366758131198515,
+      "loss": 0.4735,
+      "step": 70070
+    },
+    {
+      "epoch": 186.88,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002936657736327393,
+      "loss": 0.4614,
+      "step": 70080
+    },
+    {
+      "epoch": 186.90666666666667,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002936639657010815,
+      "loss": 0.4648,
+      "step": 70090
+    },
+    {
+      "epoch": 186.93333333333334,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029366215751701504,
+      "loss": 0.4664,
+      "step": 70100
+    },
+    {
+      "epoch": 186.96,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.000293660349080543,
+      "loss": 0.4638,
+      "step": 70110
+    },
+    {
+      "epoch": 186.98666666666668,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029365854039166867,
+      "loss": 0.4775,
+      "step": 70120
+    },
+    {
+      "epoch": 187.0,
+      "eval_loss": 0.4807330071926117,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6059,
+      "eval_samples_per_second": 1.509,
+      "eval_steps_per_second": 0.094,
+      "step": 70125
+    },
+    {
+      "epoch": 187.01333333333332,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029365673145039515,
+      "loss": 0.4674,
+      "step": 70130
+    },
+    {
+      "epoch": 187.04,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029365492225672565,
+      "loss": 0.4909,
+      "step": 70140
+    },
+    {
+      "epoch": 187.06666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002936531128106633,
+      "loss": 0.4785,
+      "step": 70150
+    },
+    {
+      "epoch": 187.09333333333333,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029365130311221135,
+      "loss": 0.4726,
+      "step": 70160
+    },
+    {
+      "epoch": 187.12,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029364949316137295,
+      "loss": 0.4699,
+      "step": 70170
+    },
+    {
+      "epoch": 187.14666666666668,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002936476829581513,
+      "loss": 0.4781,
+      "step": 70180
+    },
+    {
+      "epoch": 187.17333333333335,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002936458725025495,
+      "loss": 0.4691,
+      "step": 70190
+    },
+    {
+      "epoch": 187.2,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029364406179457084,
+      "loss": 0.4671,
+      "step": 70200
+    },
+    {
+      "epoch": 187.22666666666666,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002936422508342184,
+      "loss": 0.4652,
+      "step": 70210
+    },
+    {
+      "epoch": 187.25333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029364043962149547,
+      "loss": 0.4648,
+      "step": 70220
+    },
+    {
+      "epoch": 187.28,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029363862815640513,
+      "loss": 0.4785,
+      "step": 70230
+    },
+    {
+      "epoch": 187.30666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029363681643895063,
+      "loss": 0.4594,
+      "step": 70240
+    },
+    {
+      "epoch": 187.33333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002936350044691351,
+      "loss": 0.4751,
+      "step": 70250
+    },
+    {
+      "epoch": 187.36,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002936331922469618,
+      "loss": 0.4773,
+      "step": 70260
+    },
+    {
+      "epoch": 187.38666666666666,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002936313797724338,
+      "loss": 0.4689,
+      "step": 70270
+    },
+    {
+      "epoch": 187.41333333333333,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002936295670455544,
+      "loss": 0.4733,
+      "step": 70280
+    },
+    {
+      "epoch": 187.44,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002936277540663267,
+      "loss": 0.476,
+      "step": 70290
+    },
+    {
+      "epoch": 187.46666666666667,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.00029362594083475396,
+      "loss": 0.486,
+      "step": 70300
+    },
+    {
+      "epoch": 187.49333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029362412735083927,
+      "loss": 0.4773,
+      "step": 70310
+    },
+    {
+      "epoch": 187.52,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002936223136145859,
+      "loss": 0.4716,
+      "step": 70320
+    },
+    {
+      "epoch": 187.54666666666665,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.000293620499625997,
+      "loss": 0.4681,
+      "step": 70330
+    },
+    {
+      "epoch": 187.57333333333332,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029361868538507584,
+      "loss": 0.4677,
+      "step": 70340
+    },
+    {
+      "epoch": 187.6,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002936168708918254,
+      "loss": 0.4608,
+      "step": 70350
+    },
+    {
+      "epoch": 187.62666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002936150561462491,
+      "loss": 0.4482,
+      "step": 70360
+    },
+    {
+      "epoch": 187.65333333333334,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00029361324114835006,
+      "loss": 0.4683,
+      "step": 70370
+    },
+    {
+      "epoch": 187.68,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029361142589813137,
+      "loss": 0.4585,
+      "step": 70380
+    },
+    {
+      "epoch": 187.70666666666668,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029360961039559633,
+      "loss": 0.4595,
+      "step": 70390
+    },
+    {
+      "epoch": 187.73333333333332,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029360779464074804,
+      "loss": 0.4783,
+      "step": 70400
+    },
+    {
+      "epoch": 187.76,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.00029360597863358975,
+      "loss": 0.4803,
+      "step": 70410
+    },
+    {
+      "epoch": 187.78666666666666,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029360416237412466,
+      "loss": 0.4778,
+      "step": 70420
+    },
+    {
+      "epoch": 187.81333333333333,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.0002936023458623559,
+      "loss": 0.4639,
+      "step": 70430
+    },
+    {
+      "epoch": 187.84,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029360052909828676,
+      "loss": 0.4691,
+      "step": 70440
+    },
+    {
+      "epoch": 187.86666666666667,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029359871208192034,
+      "loss": 0.4685,
+      "step": 70450
+    },
+    {
+      "epoch": 187.89333333333335,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00029359689481325986,
+      "loss": 0.4598,
+      "step": 70460
+    },
+    {
+      "epoch": 187.92,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002935950772923085,
+      "loss": 0.4704,
+      "step": 70470
+    },
+    {
+      "epoch": 187.94666666666666,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002935932595190695,
+      "loss": 0.463,
+      "step": 70480
+    },
+    {
+      "epoch": 187.97333333333333,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0002935914414935461,
+      "loss": 0.4743,
+      "step": 70490
+    },
+    {
+      "epoch": 188.0,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029358962321574135,
+      "loss": 0.4627,
+      "step": 70500
+    },
+    {
+      "epoch": 188.0,
+      "eval_loss": 0.4792870581150055,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6616,
+      "eval_samples_per_second": 1.501,
+      "eval_steps_per_second": 0.094,
+      "step": 70500
+    },
+    {
+      "epoch": 188.02666666666667,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0002935878046856585,
+      "loss": 0.4831,
+      "step": 70510
+    },
+    {
+      "epoch": 188.05333333333334,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0002935859859033008,
+      "loss": 0.4885,
+      "step": 70520
+    },
+    {
+      "epoch": 188.08,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029358416686867133,
+      "loss": 0.4757,
+      "step": 70530
+    },
+    {
+      "epoch": 188.10666666666665,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029358234758177344,
+      "loss": 0.4705,
+      "step": 70540
+    },
+    {
+      "epoch": 188.13333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029358052804261017,
+      "loss": 0.4726,
+      "step": 70550
+    },
+    {
+      "epoch": 188.16,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002935787082511849,
+      "loss": 0.4763,
+      "step": 70560
+    },
+    {
+      "epoch": 188.18666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002935768882075006,
+      "loss": 0.4684,
+      "step": 70570
+    },
+    {
+      "epoch": 188.21333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002935750679115607,
+      "loss": 0.4633,
+      "step": 70580
+    },
+    {
+      "epoch": 188.24,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002935732473633682,
+      "loss": 0.4677,
+      "step": 70590
+    },
+    {
+      "epoch": 188.26666666666668,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002935714265629264,
+      "loss": 0.4717,
+      "step": 70600
+    },
+    {
+      "epoch": 188.29333333333332,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0002935696055102385,
+      "loss": 0.4656,
+      "step": 70610
+    },
+    {
+      "epoch": 188.32,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029356778420530774,
+      "loss": 0.4689,
+      "step": 70620
+    },
+    {
+      "epoch": 188.34666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002935659626481372,
+      "loss": 0.4769,
+      "step": 70630
+    },
+    {
+      "epoch": 188.37333333333333,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002935641408387302,
+      "loss": 0.473,
+      "step": 70640
+    },
+    {
+      "epoch": 188.4,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029356231877708983,
+      "loss": 0.4701,
+      "step": 70650
+    },
+    {
+      "epoch": 188.42666666666668,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029356049646321933,
+      "loss": 0.4765,
+      "step": 70660
+    },
+    {
+      "epoch": 188.45333333333335,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029355867389712194,
+      "loss": 0.4778,
+      "step": 70670
+    },
+    {
+      "epoch": 188.48,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029355685107880085,
+      "loss": 0.4844,
+      "step": 70680
+    },
+    {
+      "epoch": 188.50666666666666,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029355502800825926,
+      "loss": 0.4741,
+      "step": 70690
+    },
+    {
+      "epoch": 188.53333333333333,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002935532046855004,
+      "loss": 0.4682,
+      "step": 70700
+    },
+    {
+      "epoch": 188.56,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002935513811105274,
+      "loss": 0.4685,
+      "step": 70710
+    },
+    {
+      "epoch": 188.58666666666667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029354955728334343,
+      "loss": 0.4646,
+      "step": 70720
+    },
+    {
+      "epoch": 188.61333333333334,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002935477332039519,
+      "loss": 0.4513,
+      "step": 70730
+    },
+    {
+      "epoch": 188.64,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0002935459088723558,
+      "loss": 0.4563,
+      "step": 70740
+    },
+    {
+      "epoch": 188.66666666666666,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029354408428855847,
+      "loss": 0.4716,
+      "step": 70750
+    },
+    {
+      "epoch": 188.69333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.000293542259452563,
+      "loss": 0.4543,
+      "step": 70760
+    },
+    {
+      "epoch": 188.72,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002935404343643727,
+      "loss": 0.4715,
+      "step": 70770
+    },
+    {
+      "epoch": 188.74666666666667,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0002935386090239907,
+      "loss": 0.4759,
+      "step": 70780
+    },
+    {
+      "epoch": 188.77333333333334,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029353678343142027,
+      "loss": 0.4834,
+      "step": 70790
+    },
+    {
+      "epoch": 188.8,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002935349575866646,
+      "loss": 0.4699,
+      "step": 70800
+    },
+    {
+      "epoch": 188.82666666666665,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002935331314897269,
+      "loss": 0.4645,
+      "step": 70810
+    },
+    {
+      "epoch": 188.85333333333332,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029353130514061035,
+      "loss": 0.4745,
+      "step": 70820
+    },
+    {
+      "epoch": 188.88,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029352947853931814,
+      "loss": 0.4622,
+      "step": 70830
+    },
+    {
+      "epoch": 188.90666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002935276516858535,
+      "loss": 0.4649,
+      "step": 70840
+    },
+    {
+      "epoch": 188.93333333333334,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00029352582458021974,
+      "loss": 0.466,
+      "step": 70850
+    },
+    {
+      "epoch": 188.96,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029352399722241994,
+      "loss": 0.464,
+      "step": 70860
+    },
+    {
+      "epoch": 188.98666666666668,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002935221696124574,
+      "loss": 0.4774,
+      "step": 70870
+    },
+    {
+      "epoch": 189.0,
+      "eval_loss": 0.4771125018596649,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8842,
+      "eval_samples_per_second": 1.619,
+      "eval_steps_per_second": 0.101,
+      "step": 70875
+    },
+    {
+      "epoch": 189.01333333333332,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002935203417503352,
+      "loss": 0.4674,
+      "step": 70880
+    },
+    {
+      "epoch": 189.04,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002935185136360567,
+      "loss": 0.4918,
+      "step": 70890
+    },
+    {
+      "epoch": 189.06666666666666,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.000293516685269625,
+      "loss": 0.4791,
+      "step": 70900
+    },
+    {
+      "epoch": 189.09333333333333,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00029351485665104343,
+      "loss": 0.4733,
+      "step": 70910
+    },
+    {
+      "epoch": 189.12,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029351302778031507,
+      "loss": 0.4701,
+      "step": 70920
+    },
+    {
+      "epoch": 189.14666666666668,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029351119865744324,
+      "loss": 0.4776,
+      "step": 70930
+    },
+    {
+      "epoch": 189.17333333333335,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029350936928243114,
+      "loss": 0.4685,
+      "step": 70940
+    },
+    {
+      "epoch": 189.2,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002935075396552819,
+      "loss": 0.4671,
+      "step": 70950
+    },
+    {
+      "epoch": 189.22666666666666,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002935057097759988,
+      "loss": 0.4654,
+      "step": 70960
+    },
+    {
+      "epoch": 189.25333333333333,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00029350387964458506,
+      "loss": 0.4646,
+      "step": 70970
+    },
+    {
+      "epoch": 189.28,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.0002935020492610439,
+      "loss": 0.4783,
+      "step": 70980
+    },
+    {
+      "epoch": 189.30666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029350021862537847,
+      "loss": 0.4595,
+      "step": 70990
+    },
+    {
+      "epoch": 189.33333333333334,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029349838773759203,
+      "loss": 0.4753,
+      "step": 71000
+    },
+    {
+      "epoch": 189.36,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029349655659768783,
+      "loss": 0.4767,
+      "step": 71010
+    },
+    {
+      "epoch": 189.38666666666666,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029349472520566905,
+      "loss": 0.4685,
+      "step": 71020
+    },
+    {
+      "epoch": 189.41333333333333,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0002934928935615389,
+      "loss": 0.4733,
+      "step": 71030
+    },
+    {
+      "epoch": 189.44,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029349106166530063,
+      "loss": 0.4756,
+      "step": 71040
+    },
+    {
+      "epoch": 189.46666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029348922951695744,
+      "loss": 0.4856,
+      "step": 71050
+    },
+    {
+      "epoch": 189.49333333333334,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029348739711651257,
+      "loss": 0.4773,
+      "step": 71060
+    },
+    {
+      "epoch": 189.52,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002934855644639692,
+      "loss": 0.4711,
+      "step": 71070
+    },
+    {
+      "epoch": 189.54666666666665,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002934837315593305,
+      "loss": 0.4676,
+      "step": 71080
+    },
+    {
+      "epoch": 189.57333333333332,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002934818984025998,
+      "loss": 0.4676,
+      "step": 71090
+    },
+    {
+      "epoch": 189.6,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002934800649937803,
+      "loss": 0.4615,
+      "step": 71100
+    },
+    {
+      "epoch": 189.62666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029347823133287523,
+      "loss": 0.4481,
+      "step": 71110
+    },
+    {
+      "epoch": 189.65333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002934763974198877,
+      "loss": 0.4673,
+      "step": 71120
+    },
+    {
+      "epoch": 189.68,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002934745632548211,
+      "loss": 0.4589,
+      "step": 71130
+    },
+    {
+      "epoch": 189.70666666666668,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002934727288376786,
+      "loss": 0.4599,
+      "step": 71140
+    },
+    {
+      "epoch": 189.73333333333332,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002934708941684633,
+      "loss": 0.4789,
+      "step": 71150
+    },
+    {
+      "epoch": 189.76,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.00029346905924717854,
+      "loss": 0.4793,
+      "step": 71160
+    },
+    {
+      "epoch": 189.78666666666666,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00029346722407382753,
+      "loss": 0.4784,
+      "step": 71170
+    },
+    {
+      "epoch": 189.81333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029346538864841344,
+      "loss": 0.4631,
+      "step": 71180
+    },
+    {
+      "epoch": 189.84,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002934635529709396,
+      "loss": 0.4696,
+      "step": 71190
+    },
+    {
+      "epoch": 189.86666666666667,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002934617170414092,
+      "loss": 0.469,
+      "step": 71200
+    },
+    {
+      "epoch": 189.89333333333335,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002934598808598253,
+      "loss": 0.46,
+      "step": 71210
+    },
+    {
+      "epoch": 189.92,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002934580444261913,
+      "loss": 0.47,
+      "step": 71220
+    },
+    {
+      "epoch": 189.94666666666666,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029345620774051044,
+      "loss": 0.4627,
+      "step": 71230
+    },
+    {
+      "epoch": 189.97333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00029345437080278595,
+      "loss": 0.4739,
+      "step": 71240
+    },
+    {
+      "epoch": 190.0,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029345253361302094,
+      "loss": 0.4633,
+      "step": 71250
+    },
+    {
+      "epoch": 190.0,
+      "eval_loss": 0.47591617703437805,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.0874,
+      "eval_samples_per_second": 1.586,
+      "eval_steps_per_second": 0.099,
+      "step": 71250
+    },
+    {
+      "epoch": 190.02666666666667,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002934506961712187,
+      "loss": 0.4825,
+      "step": 71260
+    },
+    {
+      "epoch": 190.05333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029344885847738247,
+      "loss": 0.4886,
+      "step": 71270
+    },
+    {
+      "epoch": 190.08,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002934470205315154,
+      "loss": 0.4766,
+      "step": 71280
+    },
+    {
+      "epoch": 190.10666666666665,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002934451823336209,
+      "loss": 0.4708,
+      "step": 71290
+    },
+    {
+      "epoch": 190.13333333333333,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000293443343883702,
+      "loss": 0.4718,
+      "step": 71300
+    },
+    {
+      "epoch": 190.16,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002934415051817621,
+      "loss": 0.4762,
+      "step": 71310
+    },
+    {
+      "epoch": 190.18666666666667,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002934396662278043,
+      "loss": 0.4684,
+      "step": 71320
+    },
+    {
+      "epoch": 190.21333333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002934378270218319,
+      "loss": 0.4631,
+      "step": 71330
+    },
+    {
+      "epoch": 190.24,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002934359875638481,
+      "loss": 0.4662,
+      "step": 71340
+    },
+    {
+      "epoch": 190.26666666666668,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029343414785385613,
+      "loss": 0.4718,
+      "step": 71350
+    },
+    {
+      "epoch": 190.29333333333332,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029343230789185924,
+      "loss": 0.4656,
+      "step": 71360
+    },
+    {
+      "epoch": 190.32,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029343046767786065,
+      "loss": 0.4687,
+      "step": 71370
+    },
+    {
+      "epoch": 190.34666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029342862721186365,
+      "loss": 0.4776,
+      "step": 71380
+    },
+    {
+      "epoch": 190.37333333333333,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002934267864938714,
+      "loss": 0.4734,
+      "step": 71390
+    },
+    {
+      "epoch": 190.4,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029342494552388713,
+      "loss": 0.4692,
+      "step": 71400
+    },
+    {
+      "epoch": 190.42666666666668,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002934231043019141,
+      "loss": 0.476,
+      "step": 71410
+    },
+    {
+      "epoch": 190.45333333333335,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029342126282795556,
+      "loss": 0.4789,
+      "step": 71420
+    },
+    {
+      "epoch": 190.48,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029341942110201475,
+      "loss": 0.485,
+      "step": 71430
+    },
+    {
+      "epoch": 190.50666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002934175791240949,
+      "loss": 0.4742,
+      "step": 71440
+    },
+    {
+      "epoch": 190.53333333333333,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002934157368941992,
+      "loss": 0.4682,
+      "step": 71450
+    },
+    {
+      "epoch": 190.56,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029341389441233095,
+      "loss": 0.4692,
+      "step": 71460
+    },
+    {
+      "epoch": 190.58666666666667,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.00029341205167849336,
+      "loss": 0.4642,
+      "step": 71470
+    },
+    {
+      "epoch": 190.61333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002934102086926896,
+      "loss": 0.4508,
+      "step": 71480
+    },
+    {
+      "epoch": 190.64,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002934083654549231,
+      "loss": 0.4568,
+      "step": 71490
+    },
+    {
+      "epoch": 190.66666666666666,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0002934065219651969,
+      "loss": 0.4708,
+      "step": 71500
+    },
+    {
+      "epoch": 190.69333333333333,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002934046782235143,
+      "loss": 0.4543,
+      "step": 71510
+    },
+    {
+      "epoch": 190.72,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029340283422987854,
+      "loss": 0.4712,
+      "step": 71520
+    },
+    {
+      "epoch": 190.74666666666667,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002934009899842929,
+      "loss": 0.4757,
+      "step": 71530
+    },
+    {
+      "epoch": 190.77333333333334,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002933991454867606,
+      "loss": 0.4825,
+      "step": 71540
+    },
+    {
+      "epoch": 190.8,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029339730073728483,
+      "loss": 0.4694,
+      "step": 71550
+    },
+    {
+      "epoch": 190.82666666666665,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002933954557358689,
+      "loss": 0.4643,
+      "step": 71560
+    },
+    {
+      "epoch": 190.85333333333332,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.000293393610482516,
+      "loss": 0.4734,
+      "step": 71570
+    },
+    {
+      "epoch": 190.88,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029339176497722945,
+      "loss": 0.4614,
+      "step": 71580
+    },
+    {
+      "epoch": 190.90666666666667,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002933899192200124,
+      "loss": 0.4643,
+      "step": 71590
+    },
+    {
+      "epoch": 190.93333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029338807321086814,
+      "loss": 0.4658,
+      "step": 71600
+    },
+    {
+      "epoch": 190.96,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002933862269497999,
+      "loss": 0.4639,
+      "step": 71610
+    },
+    {
+      "epoch": 190.98666666666668,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029338438043681096,
+      "loss": 0.4778,
+      "step": 71620
+    },
+    {
+      "epoch": 191.0,
+      "eval_loss": 0.4782630205154419,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2059,
+      "eval_samples_per_second": 1.568,
+      "eval_steps_per_second": 0.098,
+      "step": 71625
+    },
+    {
+      "epoch": 191.01333333333332,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.00029338253367190454,
+      "loss": 0.4669,
+      "step": 71630
+    },
+    {
+      "epoch": 191.04,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.00029338068665508385,
+      "loss": 0.4909,
+      "step": 71640
+    },
+    {
+      "epoch": 191.06666666666666,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002933788393863521,
+      "loss": 0.4784,
+      "step": 71650
+    },
+    {
+      "epoch": 191.09333333333333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002933769918657127,
+      "loss": 0.4735,
+      "step": 71660
+    },
+    {
+      "epoch": 191.12,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029337514409316873,
+      "loss": 0.4692,
+      "step": 71670
+    },
+    {
+      "epoch": 191.14666666666668,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002933732960687235,
+      "loss": 0.4786,
+      "step": 71680
+    },
+    {
+      "epoch": 191.17333333333335,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002933714477923803,
+      "loss": 0.469,
+      "step": 71690
+    },
+    {
+      "epoch": 191.2,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029336959926414235,
+      "loss": 0.4678,
+      "step": 71700
+    },
+    {
+      "epoch": 191.22666666666666,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002933677504840128,
+      "loss": 0.4646,
+      "step": 71710
+    },
+    {
+      "epoch": 191.25333333333333,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029336590145199505,
+      "loss": 0.4643,
+      "step": 71720
+    },
+    {
+      "epoch": 191.28,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002933640521680922,
+      "loss": 0.4782,
+      "step": 71730
+    },
+    {
+      "epoch": 191.30666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002933622026323077,
+      "loss": 0.4595,
+      "step": 71740
+    },
+    {
+      "epoch": 191.33333333333334,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002933603528446446,
+      "loss": 0.4742,
+      "step": 71750
+    },
+    {
+      "epoch": 191.36,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002933585028051062,
+      "loss": 0.4768,
+      "step": 71760
+    },
+    {
+      "epoch": 191.38666666666666,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.0002933566525136959,
+      "loss": 0.4691,
+      "step": 71770
+    },
+    {
+      "epoch": 191.41333333333333,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029335480197041673,
+      "loss": 0.4726,
+      "step": 71780
+    },
+    {
+      "epoch": 191.44,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029335295117527204,
+      "loss": 0.476,
+      "step": 71790
+    },
+    {
+      "epoch": 191.46666666666667,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002933511001282651,
+      "loss": 0.4857,
+      "step": 71800
+    },
+    {
+      "epoch": 191.49333333333334,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002933492488293992,
+      "loss": 0.477,
+      "step": 71810
+    },
+    {
+      "epoch": 191.52,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029334739727867747,
+      "loss": 0.4708,
+      "step": 71820
+    },
+    {
+      "epoch": 191.54666666666665,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002933455454761033,
+      "loss": 0.4681,
+      "step": 71830
+    },
+    {
+      "epoch": 191.57333333333332,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029334369342167983,
+      "loss": 0.4674,
+      "step": 71840
+    },
+    {
+      "epoch": 191.6,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0002933418411154103,
+      "loss": 0.4619,
+      "step": 71850
+    },
+    {
+      "epoch": 191.62666666666667,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002933399885572981,
+      "loss": 0.4487,
+      "step": 71860
+    },
+    {
+      "epoch": 191.65333333333334,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029333813574734634,
+      "loss": 0.4678,
+      "step": 71870
+    },
+    {
+      "epoch": 191.68,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002933362826855584,
+      "loss": 0.4592,
+      "step": 71880
+    },
+    {
+      "epoch": 191.70666666666668,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002933344293719375,
+      "loss": 0.4601,
+      "step": 71890
+    },
+    {
+      "epoch": 191.73333333333332,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002933325758064868,
+      "loss": 0.4786,
+      "step": 71900
+    },
+    {
+      "epoch": 191.76,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029333072198920964,
+      "loss": 0.4803,
+      "step": 71910
+    },
+    {
+      "epoch": 191.78666666666666,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002933288679201093,
+      "loss": 0.4782,
+      "step": 71920
+    },
+    {
+      "epoch": 191.81333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.000293327013599189,
+      "loss": 0.4639,
+      "step": 71930
+    },
+    {
+      "epoch": 191.84,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029332515902645204,
+      "loss": 0.4692,
+      "step": 71940
+    },
+    {
+      "epoch": 191.86666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002933233042019016,
+      "loss": 0.4683,
+      "step": 71950
+    },
+    {
+      "epoch": 191.89333333333335,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029332144912554094,
+      "loss": 0.4592,
+      "step": 71960
+    },
+    {
+      "epoch": 191.92,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002933195937973734,
+      "loss": 0.47,
+      "step": 71970
+    },
+    {
+      "epoch": 191.94666666666666,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0002933177382174022,
+      "loss": 0.4636,
+      "step": 71980
+    },
+    {
+      "epoch": 191.97333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002933158823856306,
+      "loss": 0.4733,
+      "step": 71990
+    },
+    {
+      "epoch": 192.0,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029331402630206185,
+      "loss": 0.463,
+      "step": 72000
+    },
+    {
+      "epoch": 192.0,
+      "eval_loss": 0.47730663418769836,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9336,
+      "eval_samples_per_second": 1.611,
+      "eval_steps_per_second": 0.101,
+      "step": 72000
+    },
+    {
+      "epoch": 192.02666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029331216996669924,
+      "loss": 0.4827,
+      "step": 72010
+    },
+    {
+      "epoch": 192.05333333333334,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.000293310313379546,
+      "loss": 0.4883,
+      "step": 72020
+    },
+    {
+      "epoch": 192.08,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029330845654060544,
+      "loss": 0.4756,
+      "step": 72030
+    },
+    {
+      "epoch": 192.10666666666665,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029330659944988075,
+      "loss": 0.4704,
+      "step": 72040
+    },
+    {
+      "epoch": 192.13333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002933047421073752,
+      "loss": 0.4721,
+      "step": 72050
+    },
+    {
+      "epoch": 192.16,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029330288451309217,
+      "loss": 0.4758,
+      "step": 72060
+    },
+    {
+      "epoch": 192.18666666666667,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002933010266670348,
+      "loss": 0.469,
+      "step": 72070
+    },
+    {
+      "epoch": 192.21333333333334,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029329916856920635,
+      "loss": 0.463,
+      "step": 72080
+    },
+    {
+      "epoch": 192.24,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002932973102196101,
+      "loss": 0.467,
+      "step": 72090
+    },
+    {
+      "epoch": 192.26666666666668,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0002932954516182494,
+      "loss": 0.4706,
+      "step": 72100
+    },
+    {
+      "epoch": 192.29333333333332,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029329359276512743,
+      "loss": 0.4658,
+      "step": 72110
+    },
+    {
+      "epoch": 192.32,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029329173366024757,
+      "loss": 0.4696,
+      "step": 72120
+    },
+    {
+      "epoch": 192.34666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029328987430361287,
+      "loss": 0.4771,
+      "step": 72130
+    },
+    {
+      "epoch": 192.37333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002932880146952268,
+      "loss": 0.4733,
+      "step": 72140
+    },
+    {
+      "epoch": 192.4,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002932861548350925,
+      "loss": 0.4699,
+      "step": 72150
+    },
+    {
+      "epoch": 192.42666666666668,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029328429472321333,
+      "loss": 0.4758,
+      "step": 72160
+    },
+    {
+      "epoch": 192.45333333333335,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002932824343595926,
+      "loss": 0.4781,
+      "step": 72170
+    },
+    {
+      "epoch": 192.48,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002932805737442333,
+      "loss": 0.4846,
+      "step": 72180
+    },
+    {
+      "epoch": 192.50666666666666,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000293278712877139,
+      "loss": 0.4738,
+      "step": 72190
+    },
+    {
+      "epoch": 192.53333333333333,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002932768517583129,
+      "loss": 0.4675,
+      "step": 72200
+    },
+    {
+      "epoch": 192.56,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029327499038775824,
+      "loss": 0.4687,
+      "step": 72210
+    },
+    {
+      "epoch": 192.58666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029327312876547824,
+      "loss": 0.4643,
+      "step": 72220
+    },
+    {
+      "epoch": 192.61333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029327126689147616,
+      "loss": 0.4508,
+      "step": 72230
+    },
+    {
+      "epoch": 192.64,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029326940476575545,
+      "loss": 0.4564,
+      "step": 72240
+    },
+    {
+      "epoch": 192.66666666666666,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.00029326754238831915,
+      "loss": 0.472,
+      "step": 72250
+    },
+    {
+      "epoch": 192.69333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002932656797591707,
+      "loss": 0.4548,
+      "step": 72260
+    },
+    {
+      "epoch": 192.72,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029326381687831324,
+      "loss": 0.4717,
+      "step": 72270
+    },
+    {
+      "epoch": 192.74666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029326195374575025,
+      "loss": 0.475,
+      "step": 72280
+    },
+    {
+      "epoch": 192.77333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029326009036148473,
+      "loss": 0.4826,
+      "step": 72290
+    },
+    {
+      "epoch": 192.8,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002932582267255202,
+      "loss": 0.4698,
+      "step": 72300
+    },
+    {
+      "epoch": 192.82666666666665,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0002932563628378598,
+      "loss": 0.4648,
+      "step": 72310
+    },
+    {
+      "epoch": 192.85333333333332,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.00029325449869850676,
+      "loss": 0.4731,
+      "step": 72320
+    },
+    {
+      "epoch": 192.88,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002932526343074645,
+      "loss": 0.4608,
+      "step": 72330
+    },
+    {
+      "epoch": 192.90666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002932507696647362,
+      "loss": 0.4646,
+      "step": 72340
+    },
+    {
+      "epoch": 192.93333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029324890477032514,
+      "loss": 0.4655,
+      "step": 72350
+    },
+    {
+      "epoch": 192.96,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029324703962423467,
+      "loss": 0.4639,
+      "step": 72360
+    },
+    {
+      "epoch": 192.98666666666668,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.000293245174226468,
+      "loss": 0.4784,
+      "step": 72370
+    },
+    {
+      "epoch": 193.0,
+      "eval_loss": 0.47962862253189087,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.096,
+      "eval_samples_per_second": 1.585,
+      "eval_steps_per_second": 0.099,
+      "step": 72375
+    },
+    {
+      "epoch": 193.01333333333332,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002932433085770284,
+      "loss": 0.4667,
+      "step": 72380
+    },
+    {
+      "epoch": 193.04,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029324144267591916,
+      "loss": 0.4909,
+      "step": 72390
+    },
+    {
+      "epoch": 193.06666666666666,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.00029323957652314356,
+      "loss": 0.4788,
+      "step": 72400
+    },
+    {
+      "epoch": 193.09333333333333,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029323771011870495,
+      "loss": 0.4733,
+      "step": 72410
+    },
+    {
+      "epoch": 193.12,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029323584346260643,
+      "loss": 0.4699,
+      "step": 72420
+    },
+    {
+      "epoch": 193.14666666666668,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029323397655485147,
+      "loss": 0.4782,
+      "step": 72430
+    },
+    {
+      "epoch": 193.17333333333335,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002932321093954433,
+      "loss": 0.4695,
+      "step": 72440
+    },
+    {
+      "epoch": 193.2,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002932302419843851,
+      "loss": 0.467,
+      "step": 72450
+    },
+    {
+      "epoch": 193.22666666666666,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002932283743216803,
+      "loss": 0.4652,
+      "step": 72460
+    },
+    {
+      "epoch": 193.25333333333333,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029322650640733203,
+      "loss": 0.4645,
+      "step": 72470
+    },
+    {
+      "epoch": 193.28,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002932246382413437,
+      "loss": 0.4787,
+      "step": 72480
+    },
+    {
+      "epoch": 193.30666666666667,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0002932227698237185,
+      "loss": 0.4599,
+      "step": 72490
+    },
+    {
+      "epoch": 193.33333333333334,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002932209011544598,
+      "loss": 0.475,
+      "step": 72500
+    },
+    {
+      "epoch": 193.36,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029321903223357085,
+      "loss": 0.4775,
+      "step": 72510
+    },
+    {
+      "epoch": 193.38666666666666,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029321716306105486,
+      "loss": 0.4684,
+      "step": 72520
+    },
+    {
+      "epoch": 193.41333333333333,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029321529363691523,
+      "loss": 0.4731,
+      "step": 72530
+    },
+    {
+      "epoch": 193.44,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029321342396115515,
+      "loss": 0.4753,
+      "step": 72540
+    },
+    {
+      "epoch": 193.46666666666667,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00029321155403377793,
+      "loss": 0.4861,
+      "step": 72550
+    },
+    {
+      "epoch": 193.49333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002932096838547869,
+      "loss": 0.477,
+      "step": 72560
+    },
+    {
+      "epoch": 193.52,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002932078134241853,
+      "loss": 0.4717,
+      "step": 72570
+    },
+    {
+      "epoch": 193.54666666666665,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002932059427419764,
+      "loss": 0.4683,
+      "step": 72580
+    },
+    {
+      "epoch": 193.57333333333332,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029320407180816356,
+      "loss": 0.4676,
+      "step": 72590
+    },
+    {
+      "epoch": 193.6,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029320220062275,
+      "loss": 0.4609,
+      "step": 72600
+    },
+    {
+      "epoch": 193.62666666666667,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002932003291857391,
+      "loss": 0.4478,
+      "step": 72610
+    },
+    {
+      "epoch": 193.65333333333334,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.000293198457497134,
+      "loss": 0.4681,
+      "step": 72620
+    },
+    {
+      "epoch": 193.68,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002931965855569381,
+      "loss": 0.4593,
+      "step": 72630
+    },
+    {
+      "epoch": 193.70666666666668,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.00029319471336515464,
+      "loss": 0.4589,
+      "step": 72640
+    },
+    {
+      "epoch": 193.73333333333332,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029319284092178696,
+      "loss": 0.4783,
+      "step": 72650
+    },
+    {
+      "epoch": 193.76,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00029319096822683825,
+      "loss": 0.4794,
+      "step": 72660
+    },
+    {
+      "epoch": 193.78666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002931890952803119,
+      "loss": 0.4785,
+      "step": 72670
+    },
+    {
+      "epoch": 193.81333333333333,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002931872220822112,
+      "loss": 0.4634,
+      "step": 72680
+    },
+    {
+      "epoch": 193.84,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002931853486325394,
+      "loss": 0.4689,
+      "step": 72690
+    },
+    {
+      "epoch": 193.86666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002931834749312998,
+      "loss": 0.4685,
+      "step": 72700
+    },
+    {
+      "epoch": 193.89333333333335,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00029318160097849564,
+      "loss": 0.4595,
+      "step": 72710
+    },
+    {
+      "epoch": 193.92,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002931797267741303,
+      "loss": 0.4706,
+      "step": 72720
+    },
+    {
+      "epoch": 193.94666666666666,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029317785231820705,
+      "loss": 0.4625,
+      "step": 72730
+    },
+    {
+      "epoch": 193.97333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029317597761072914,
+      "loss": 0.4745,
+      "step": 72740
+    },
+    {
+      "epoch": 194.0,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002931741026516999,
+      "loss": 0.463,
+      "step": 72750
+    },
+    {
+      "epoch": 194.0,
+      "eval_loss": 0.4789677858352661,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.3169,
+      "eval_samples_per_second": 1.717,
+      "eval_steps_per_second": 0.107,
+      "step": 72750
+    },
+    {
+      "epoch": 194.02666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002931722274411227,
+      "loss": 0.4826,
+      "step": 72760
+    },
+    {
+      "epoch": 194.05333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002931703519790007,
+      "loss": 0.4883,
+      "step": 72770
+    },
+    {
+      "epoch": 194.08,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002931684762653372,
+      "loss": 0.4752,
+      "step": 72780
+    },
+    {
+      "epoch": 194.10666666666665,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002931666003001356,
+      "loss": 0.471,
+      "step": 72790
+    },
+    {
+      "epoch": 194.13333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029316472408339903,
+      "loss": 0.4721,
+      "step": 72800
+    },
+    {
+      "epoch": 194.16,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029316284761513105,
+      "loss": 0.4762,
+      "step": 72810
+    },
+    {
+      "epoch": 194.18666666666667,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0002931609708953347,
+      "loss": 0.4679,
+      "step": 72820
+    },
+    {
+      "epoch": 194.21333333333334,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002931590939240134,
+      "loss": 0.4637,
+      "step": 72830
+    },
+    {
+      "epoch": 194.24,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029315721670117046,
+      "loss": 0.4671,
+      "step": 72840
+    },
+    {
+      "epoch": 194.26666666666668,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029315533922680915,
+      "loss": 0.471,
+      "step": 72850
+    },
+    {
+      "epoch": 194.29333333333332,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029315346150093273,
+      "loss": 0.4661,
+      "step": 72860
+    },
+    {
+      "epoch": 194.32,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029315158352354455,
+      "loss": 0.4689,
+      "step": 72870
+    },
+    {
+      "epoch": 194.34666666666666,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029314970529464786,
+      "loss": 0.477,
+      "step": 72880
+    },
+    {
+      "epoch": 194.37333333333333,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029314782681424603,
+      "loss": 0.473,
+      "step": 72890
+    },
+    {
+      "epoch": 194.4,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0002931459480823423,
+      "loss": 0.4702,
+      "step": 72900
+    },
+    {
+      "epoch": 194.42666666666668,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00029314406909894,
+      "loss": 0.4762,
+      "step": 72910
+    },
+    {
+      "epoch": 194.45333333333335,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029314218986404244,
+      "loss": 0.4781,
+      "step": 72920
+    },
+    {
+      "epoch": 194.48,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002931403103776529,
+      "loss": 0.4845,
+      "step": 72930
+    },
+    {
+      "epoch": 194.50666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002931384306397747,
+      "loss": 0.4734,
+      "step": 72940
+    },
+    {
+      "epoch": 194.53333333333333,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002931365506504111,
+      "loss": 0.4684,
+      "step": 72950
+    },
+    {
+      "epoch": 194.56,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0002931346704095655,
+      "loss": 0.4688,
+      "step": 72960
+    },
+    {
+      "epoch": 194.58666666666667,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002931327899172411,
+      "loss": 0.4648,
+      "step": 72970
+    },
+    {
+      "epoch": 194.61333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002931309091734412,
+      "loss": 0.4509,
+      "step": 72980
+    },
+    {
+      "epoch": 194.64,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002931290281781692,
+      "loss": 0.4557,
+      "step": 72990
+    },
+    {
+      "epoch": 194.66666666666666,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00029312714693142836,
+      "loss": 0.4714,
+      "step": 73000
+    },
+    {
+      "epoch": 194.69333333333333,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0002931252654332219,
+      "loss": 0.4542,
+      "step": 73010
+    },
+    {
+      "epoch": 194.72,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002931233836835533,
+      "loss": 0.471,
+      "step": 73020
+    },
+    {
+      "epoch": 194.74666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029312150168242567,
+      "loss": 0.4755,
+      "step": 73030
+    },
+    {
+      "epoch": 194.77333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002931196194298425,
+      "loss": 0.4824,
+      "step": 73040
+    },
+    {
+      "epoch": 194.8,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029311773692580696,
+      "loss": 0.47,
+      "step": 73050
+    },
+    {
+      "epoch": 194.82666666666665,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029311585417032246,
+      "loss": 0.4646,
+      "step": 73060
+    },
+    {
+      "epoch": 194.85333333333332,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002931139711633922,
+      "loss": 0.4733,
+      "step": 73070
+    },
+    {
+      "epoch": 194.88,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0002931120879050195,
+      "loss": 0.4619,
+      "step": 73080
+    },
+    {
+      "epoch": 194.90666666666667,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002931102043952078,
+      "loss": 0.464,
+      "step": 73090
+    },
+    {
+      "epoch": 194.93333333333334,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002931083206339604,
+      "loss": 0.4658,
+      "step": 73100
+    },
+    {
+      "epoch": 194.96,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0002931064366212804,
+      "loss": 0.4639,
+      "step": 73110
+    },
+    {
+      "epoch": 194.98666666666668,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002931045523571713,
+      "loss": 0.4778,
+      "step": 73120
+    },
+    {
+      "epoch": 195.0,
+      "eval_loss": 0.47874346375465393,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.6037,
+      "eval_samples_per_second": 1.509,
+      "eval_steps_per_second": 0.094,
+      "step": 73125
+    },
+    {
+      "epoch": 195.01333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002931026678416363,
+      "loss": 0.4665,
+      "step": 73130
+    },
+    {
+      "epoch": 195.04,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029310078307467885,
+      "loss": 0.4907,
+      "step": 73140
+    },
+    {
+      "epoch": 195.06666666666666,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002930988980563021,
+      "loss": 0.4784,
+      "step": 73150
+    },
+    {
+      "epoch": 195.09333333333333,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.00029309701278650946,
+      "loss": 0.4726,
+      "step": 73160
+    },
+    {
+      "epoch": 195.12,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029309512726530425,
+      "loss": 0.4691,
+      "step": 73170
+    },
+    {
+      "epoch": 195.14666666666668,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029309324149268974,
+      "loss": 0.4784,
+      "step": 73180
+    },
+    {
+      "epoch": 195.17333333333335,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029309135546866925,
+      "loss": 0.4692,
+      "step": 73190
+    },
+    {
+      "epoch": 195.2,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.00029308946919324606,
+      "loss": 0.4678,
+      "step": 73200
+    },
+    {
+      "epoch": 195.22666666666666,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029308758266642355,
+      "loss": 0.4647,
+      "step": 73210
+    },
+    {
+      "epoch": 195.25333333333333,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029308569588820503,
+      "loss": 0.465,
+      "step": 73220
+    },
+    {
+      "epoch": 195.28,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002930838088585938,
+      "loss": 0.4783,
+      "step": 73230
+    },
+    {
+      "epoch": 195.30666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00029308192157759315,
+      "loss": 0.4592,
+      "step": 73240
+    },
+    {
+      "epoch": 195.33333333333334,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002930800340452064,
+      "loss": 0.4747,
+      "step": 73250
+    },
+    {
+      "epoch": 195.36,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029307814626143694,
+      "loss": 0.477,
+      "step": 73260
+    },
+    {
+      "epoch": 195.38666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.000293076258226288,
+      "loss": 0.4689,
+      "step": 73270
+    },
+    {
+      "epoch": 195.41333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029307436993976284,
+      "loss": 0.4733,
+      "step": 73280
+    },
+    {
+      "epoch": 195.44,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029307248140186495,
+      "loss": 0.4763,
+      "step": 73290
+    },
+    {
+      "epoch": 195.46666666666667,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.00029307059261259754,
+      "loss": 0.4854,
+      "step": 73300
+    },
+    {
+      "epoch": 195.49333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029306870357196393,
+      "loss": 0.4777,
+      "step": 73310
+    },
+    {
+      "epoch": 195.52,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002930668142799675,
+      "loss": 0.4717,
+      "step": 73320
+    },
+    {
+      "epoch": 195.54666666666665,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002930649247366115,
+      "loss": 0.4681,
+      "step": 73330
+    },
+    {
+      "epoch": 195.57333333333332,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002930630349418993,
+      "loss": 0.4673,
+      "step": 73340
+    },
+    {
+      "epoch": 195.6,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002930611448958342,
+      "loss": 0.4608,
+      "step": 73350
+    },
+    {
+      "epoch": 195.62666666666667,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029305925459841947,
+      "loss": 0.4479,
+      "step": 73360
+    },
+    {
+      "epoch": 195.65333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002930573640496585,
+      "loss": 0.4666,
+      "step": 73370
+    },
+    {
+      "epoch": 195.68,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029305547324955464,
+      "loss": 0.4589,
+      "step": 73380
+    },
+    {
+      "epoch": 195.70666666666668,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002930535821981111,
+      "loss": 0.4599,
+      "step": 73390
+    },
+    {
+      "epoch": 195.73333333333332,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002930516908953313,
+      "loss": 0.4787,
+      "step": 73400
+    },
+    {
+      "epoch": 195.76,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029304979934121847,
+      "loss": 0.4789,
+      "step": 73410
+    },
+    {
+      "epoch": 195.78666666666666,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.000293047907535776,
+      "loss": 0.4782,
+      "step": 73420
+    },
+    {
+      "epoch": 195.81333333333333,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029304601547900733,
+      "loss": 0.4634,
+      "step": 73430
+    },
+    {
+      "epoch": 195.84,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0002930441231709156,
+      "loss": 0.4693,
+      "step": 73440
+    },
+    {
+      "epoch": 195.86666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002930422306115041,
+      "loss": 0.4684,
+      "step": 73450
+    },
+    {
+      "epoch": 195.89333333333335,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029304033780077634,
+      "loss": 0.4598,
+      "step": 73460
+    },
+    {
+      "epoch": 195.92,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029303844473873555,
+      "loss": 0.4706,
+      "step": 73470
+    },
+    {
+      "epoch": 195.94666666666666,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.000293036551425385,
+      "loss": 0.4625,
+      "step": 73480
+    },
+    {
+      "epoch": 195.97333333333333,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029303465786072815,
+      "loss": 0.4736,
+      "step": 73490
+    },
+    {
+      "epoch": 196.0,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0002930327640447682,
+      "loss": 0.463,
+      "step": 73500
+    },
+    {
+      "epoch": 196.0,
+      "eval_loss": 0.4790534973144531,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.5986,
+      "eval_samples_per_second": 1.667,
+      "eval_steps_per_second": 0.104,
+      "step": 73500
+    },
+    {
+      "epoch": 196.02666666666667,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029303086997750855,
+      "loss": 0.4823,
+      "step": 73510
+    },
+    {
+      "epoch": 196.05333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029302897565895253,
+      "loss": 0.4882,
+      "step": 73520
+    },
+    {
+      "epoch": 196.08,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00029302708108910343,
+      "loss": 0.4752,
+      "step": 73530
+    },
+    {
+      "epoch": 196.10666666666665,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002930251862679646,
+      "loss": 0.4704,
+      "step": 73540
+    },
+    {
+      "epoch": 196.13333333333333,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029302329119553936,
+      "loss": 0.4717,
+      "step": 73550
+    },
+    {
+      "epoch": 196.16,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00029302139587183106,
+      "loss": 0.4759,
+      "step": 73560
+    },
+    {
+      "epoch": 196.18666666666667,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.00029301950029684296,
+      "loss": 0.4691,
+      "step": 73570
+    },
+    {
+      "epoch": 196.21333333333334,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029301760447057853,
+      "loss": 0.4639,
+      "step": 73580
+    },
+    {
+      "epoch": 196.24,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00029301570839304094,
+      "loss": 0.4671,
+      "step": 73590
+    },
+    {
+      "epoch": 196.26666666666668,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029301381206423363,
+      "loss": 0.4707,
+      "step": 73600
+    },
+    {
+      "epoch": 196.29333333333332,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002930119154841599,
+      "loss": 0.4663,
+      "step": 73610
+    },
+    {
+      "epoch": 196.32,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002930100186528231,
+      "loss": 0.4694,
+      "step": 73620
+    },
+    {
+      "epoch": 196.34666666666666,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029300812157022647,
+      "loss": 0.4773,
+      "step": 73630
+    },
+    {
+      "epoch": 196.37333333333333,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0002930062242363735,
+      "loss": 0.4729,
+      "step": 73640
+    },
+    {
+      "epoch": 196.4,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002930043266512674,
+      "loss": 0.4705,
+      "step": 73650
+    },
+    {
+      "epoch": 196.42666666666668,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00029300242881491153,
+      "loss": 0.4761,
+      "step": 73660
+    },
+    {
+      "epoch": 196.45333333333335,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002930005307273093,
+      "loss": 0.478,
+      "step": 73670
+    },
+    {
+      "epoch": 196.48,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002929986323884639,
+      "loss": 0.4841,
+      "step": 73680
+    },
+    {
+      "epoch": 196.50666666666666,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029299673379837884,
+      "loss": 0.4746,
+      "step": 73690
+    },
+    {
+      "epoch": 196.53333333333333,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00029299483495705736,
+      "loss": 0.4678,
+      "step": 73700
+    },
+    {
+      "epoch": 196.56,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002929929358645027,
+      "loss": 0.4689,
+      "step": 73710
+    },
+    {
+      "epoch": 196.58666666666667,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029299103652071835,
+      "loss": 0.465,
+      "step": 73720
+    },
+    {
+      "epoch": 196.61333333333334,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002929891369257076,
+      "loss": 0.451,
+      "step": 73730
+    },
+    {
+      "epoch": 196.64,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029298723707947376,
+      "loss": 0.4557,
+      "step": 73740
+    },
+    {
+      "epoch": 196.66666666666666,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029298533698202025,
+      "loss": 0.4713,
+      "step": 73750
+    },
+    {
+      "epoch": 196.69333333333333,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002929834366333503,
+      "loss": 0.4543,
+      "step": 73760
+    },
+    {
+      "epoch": 196.72,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0002929815360334673,
+      "loss": 0.4713,
+      "step": 73770
+    },
+    {
+      "epoch": 196.74666666666667,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0002929796351823746,
+      "loss": 0.4757,
+      "step": 73780
+    },
+    {
+      "epoch": 196.77333333333334,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002929777340800755,
+      "loss": 0.4838,
+      "step": 73790
+    },
+    {
+      "epoch": 196.8,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002929758327265734,
+      "loss": 0.4702,
+      "step": 73800
+    },
+    {
+      "epoch": 196.82666666666665,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002929739311218716,
+      "loss": 0.465,
+      "step": 73810
+    },
+    {
+      "epoch": 196.85333333333332,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002929720292659734,
+      "loss": 0.4735,
+      "step": 73820
+    },
+    {
+      "epoch": 196.88,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029297012715888226,
+      "loss": 0.4614,
+      "step": 73830
+    },
+    {
+      "epoch": 196.90666666666667,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0002929682248006014,
+      "loss": 0.465,
+      "step": 73840
+    },
+    {
+      "epoch": 196.93333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002929663221911342,
+      "loss": 0.4654,
+      "step": 73850
+    },
+    {
+      "epoch": 196.96,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00029296441933048407,
+      "loss": 0.4636,
+      "step": 73860
+    },
+    {
+      "epoch": 196.98666666666668,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029296251621865427,
+      "loss": 0.4778,
+      "step": 73870
+    },
+    {
+      "epoch": 197.0,
+      "eval_loss": 0.47821304202079773,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.7882,
+      "eval_samples_per_second": 1.635,
+      "eval_steps_per_second": 0.102,
+      "step": 73875
+    },
+    {
+      "epoch": 197.01333333333332,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00029296061285564816,
+      "loss": 0.4665,
+      "step": 73880
+    },
+    {
+      "epoch": 197.04,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029295870924146917,
+      "loss": 0.4918,
+      "step": 73890
+    },
+    {
+      "epoch": 197.06666666666666,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0002929568053761205,
+      "loss": 0.478,
+      "step": 73900
+    },
+    {
+      "epoch": 197.09333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002929549012596056,
+      "loss": 0.4735,
+      "step": 73910
+    },
+    {
+      "epoch": 197.12,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002929529968919277,
+      "loss": 0.4694,
+      "step": 73920
+    },
+    {
+      "epoch": 197.14666666666668,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002929510922730903,
+      "loss": 0.4782,
+      "step": 73930
+    },
+    {
+      "epoch": 197.17333333333335,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029294918740309665,
+      "loss": 0.4687,
+      "step": 73940
+    },
+    {
+      "epoch": 197.2,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029294728228195015,
+      "loss": 0.4667,
+      "step": 73950
+    },
+    {
+      "epoch": 197.22666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002929453769096541,
+      "loss": 0.4646,
+      "step": 73960
+    },
+    {
+      "epoch": 197.25333333333333,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.00029294347128621187,
+      "loss": 0.4654,
+      "step": 73970
+    },
+    {
+      "epoch": 197.28,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002929415654116268,
+      "loss": 0.4789,
+      "step": 73980
+    },
+    {
+      "epoch": 197.30666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002929396592859022,
+      "loss": 0.4594,
+      "step": 73990
+    },
+    {
+      "epoch": 197.33333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002929377529090415,
+      "loss": 0.4752,
+      "step": 74000
+    },
+    {
+      "epoch": 197.36,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.00029293584628104803,
+      "loss": 0.4767,
+      "step": 74010
+    },
+    {
+      "epoch": 197.38666666666666,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002929339394019251,
+      "loss": 0.4686,
+      "step": 74020
+    },
+    {
+      "epoch": 197.41333333333333,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002929320322716761,
+      "loss": 0.4734,
+      "step": 74030
+    },
+    {
+      "epoch": 197.44,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0002929301248903043,
+      "loss": 0.4758,
+      "step": 74040
+    },
+    {
+      "epoch": 197.46666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002929282172578132,
+      "loss": 0.4853,
+      "step": 74050
+    },
+    {
+      "epoch": 197.49333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.000292926309374206,
+      "loss": 0.4766,
+      "step": 74060
+    },
+    {
+      "epoch": 197.52,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029292440123948615,
+      "loss": 0.4707,
+      "step": 74070
+    },
+    {
+      "epoch": 197.54666666666665,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002929224928536569,
+      "loss": 0.4683,
+      "step": 74080
+    },
+    {
+      "epoch": 197.57333333333332,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002929205842167218,
+      "loss": 0.4677,
+      "step": 74090
+    },
+    {
+      "epoch": 197.6,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.000292918675328684,
+      "loss": 0.4606,
+      "step": 74100
+    },
+    {
+      "epoch": 197.62666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029291676618954687,
+      "loss": 0.4483,
+      "step": 74110
+    },
+    {
+      "epoch": 197.65333333333334,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029291485679931387,
+      "loss": 0.467,
+      "step": 74120
+    },
+    {
+      "epoch": 197.68,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029291294715798837,
+      "loss": 0.4598,
+      "step": 74130
+    },
+    {
+      "epoch": 197.70666666666668,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002929110372655736,
+      "loss": 0.4596,
+      "step": 74140
+    },
+    {
+      "epoch": 197.73333333333332,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029290912712207296,
+      "loss": 0.4786,
+      "step": 74150
+    },
+    {
+      "epoch": 197.76,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.00029290721672748987,
+      "loss": 0.4795,
+      "step": 74160
+    },
+    {
+      "epoch": 197.78666666666666,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0002929053060818276,
+      "loss": 0.4776,
+      "step": 74170
+    },
+    {
+      "epoch": 197.81333333333333,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0002929033951850896,
+      "loss": 0.465,
+      "step": 74180
+    },
+    {
+      "epoch": 197.84,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0002929014840372791,
+      "loss": 0.473,
+      "step": 74190
+    },
+    {
+      "epoch": 197.86666666666667,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0002928995726383996,
+      "loss": 0.4711,
+      "step": 74200
+    },
+    {
+      "epoch": 197.89333333333335,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0002928976609884543,
+      "loss": 0.4607,
+      "step": 74210
+    },
+    {
+      "epoch": 197.92,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029289574908744674,
+      "loss": 0.4703,
+      "step": 74220
+    },
+    {
+      "epoch": 197.94666666666666,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002928938369353802,
+      "loss": 0.4629,
+      "step": 74230
+    },
+    {
+      "epoch": 197.97333333333333,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.000292891924532258,
+      "loss": 0.4745,
+      "step": 74240
+    },
+    {
+      "epoch": 198.0,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0002928900118780835,
+      "loss": 0.4626,
+      "step": 74250
+    },
+    {
+      "epoch": 198.0,
+      "eval_loss": 0.47928979992866516,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2462,
+      "eval_samples_per_second": 1.562,
+      "eval_steps_per_second": 0.098,
+      "step": 74250
+    },
+    {
+      "epoch": 198.02666666666667,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029288809897286006,
+      "loss": 0.4829,
+      "step": 74260
+    },
+    {
+      "epoch": 198.05333333333334,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029288618581659113,
+      "loss": 0.4887,
+      "step": 74270
+    },
+    {
+      "epoch": 198.08,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029288427240928,
+      "loss": 0.4753,
+      "step": 74280
+    },
+    {
+      "epoch": 198.10666666666665,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029288235875093006,
+      "loss": 0.4704,
+      "step": 74290
+    },
+    {
+      "epoch": 198.13333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029288044484154465,
+      "loss": 0.472,
+      "step": 74300
+    },
+    {
+      "epoch": 198.16,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002928785306811271,
+      "loss": 0.4765,
+      "step": 74310
+    },
+    {
+      "epoch": 198.18666666666667,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0002928766162696808,
+      "loss": 0.4678,
+      "step": 74320
+    },
+    {
+      "epoch": 198.21333333333334,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029287470160720917,
+      "loss": 0.4629,
+      "step": 74330
+    },
+    {
+      "epoch": 198.24,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002928727866937155,
+      "loss": 0.4674,
+      "step": 74340
+    },
+    {
+      "epoch": 198.26666666666668,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029287087152920317,
+      "loss": 0.4721,
+      "step": 74350
+    },
+    {
+      "epoch": 198.29333333333332,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002928689561136756,
+      "loss": 0.4651,
+      "step": 74360
+    },
+    {
+      "epoch": 198.32,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002928670404471361,
+      "loss": 0.4684,
+      "step": 74370
+    },
+    {
+      "epoch": 198.34666666666666,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.00029286512452958804,
+      "loss": 0.4774,
+      "step": 74380
+    },
+    {
+      "epoch": 198.37333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002928632083610348,
+      "loss": 0.4731,
+      "step": 74390
+    },
+    {
+      "epoch": 198.4,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029286129194147967,
+      "loss": 0.4694,
+      "step": 74400
+    },
+    {
+      "epoch": 198.42666666666668,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002928593752709262,
+      "loss": 0.4761,
+      "step": 74410
+    },
+    {
+      "epoch": 198.45333333333335,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029285745834937754,
+      "loss": 0.4777,
+      "step": 74420
+    },
+    {
+      "epoch": 198.48,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029285554117683724,
+      "loss": 0.4846,
+      "step": 74430
+    },
+    {
+      "epoch": 198.50666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002928536237533086,
+      "loss": 0.4744,
+      "step": 74440
+    },
+    {
+      "epoch": 198.53333333333333,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.00029285170607879494,
+      "loss": 0.4672,
+      "step": 74450
+    },
+    {
+      "epoch": 198.56,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00029284978815329965,
+      "loss": 0.4685,
+      "step": 74460
+    },
+    {
+      "epoch": 198.58666666666667,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029284786997682614,
+      "loss": 0.465,
+      "step": 74470
+    },
+    {
+      "epoch": 198.61333333333334,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0002928459515493778,
+      "loss": 0.4499,
+      "step": 74480
+    },
+    {
+      "epoch": 198.64,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.00029284403287095786,
+      "loss": 0.456,
+      "step": 74490
+    },
+    {
+      "epoch": 198.66666666666666,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002928421139415699,
+      "loss": 0.4716,
+      "step": 74500
+    },
+    {
+      "epoch": 198.69333333333333,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029284019476121713,
+      "loss": 0.4545,
+      "step": 74510
+    },
+    {
+      "epoch": 198.72,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029283827532990294,
+      "loss": 0.4713,
+      "step": 74520
+    },
+    {
+      "epoch": 198.74666666666667,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002928363556476308,
+      "loss": 0.4758,
+      "step": 74530
+    },
+    {
+      "epoch": 198.77333333333334,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.00029283443571440396,
+      "loss": 0.4826,
+      "step": 74540
+    },
+    {
+      "epoch": 198.8,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0002928325155302259,
+      "loss": 0.4706,
+      "step": 74550
+    },
+    {
+      "epoch": 198.82666666666665,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.00029283059509509993,
+      "loss": 0.4646,
+      "step": 74560
+    },
+    {
+      "epoch": 198.85333333333332,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029282867440902946,
+      "loss": 0.4736,
+      "step": 74570
+    },
+    {
+      "epoch": 198.88,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029282675347201784,
+      "loss": 0.4613,
+      "step": 74580
+    },
+    {
+      "epoch": 198.90666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002928248322840684,
+      "loss": 0.4638,
+      "step": 74590
+    },
+    {
+      "epoch": 198.93333333333334,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002928229108451846,
+      "loss": 0.4662,
+      "step": 74600
+    },
+    {
+      "epoch": 198.96,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029282098915536976,
+      "loss": 0.4646,
+      "step": 74610
+    },
+    {
+      "epoch": 198.98666666666668,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0002928190672146273,
+      "loss": 0.4773,
+      "step": 74620
+    },
+    {
+      "epoch": 199.0,
+      "eval_loss": 0.47776076197624207,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9108,
+      "eval_samples_per_second": 1.614,
+      "eval_steps_per_second": 0.101,
+      "step": 74625
+    },
+    {
+      "epoch": 199.01333333333332,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002928171450229606,
+      "loss": 0.4669,
+      "step": 74630
+    },
+    {
+      "epoch": 199.04,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029281522258037295,
+      "loss": 0.4913,
+      "step": 74640
+    },
+    {
+      "epoch": 199.06666666666666,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.00029281329988686784,
+      "loss": 0.4783,
+      "step": 74650
+    },
+    {
+      "epoch": 199.09333333333333,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002928113769424486,
+      "loss": 0.4737,
+      "step": 74660
+    },
+    {
+      "epoch": 199.12,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029280945374711854,
+      "loss": 0.47,
+      "step": 74670
+    },
+    {
+      "epoch": 199.14666666666668,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029280753030088113,
+      "loss": 0.4777,
+      "step": 74680
+    },
+    {
+      "epoch": 199.17333333333335,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029280560660373977,
+      "loss": 0.4689,
+      "step": 74690
+    },
+    {
+      "epoch": 199.2,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029280368265569777,
+      "loss": 0.4675,
+      "step": 74700
+    },
+    {
+      "epoch": 199.22666666666666,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00029280175845675853,
+      "loss": 0.4645,
+      "step": 74710
+    },
+    {
+      "epoch": 199.25333333333333,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029279983400692537,
+      "loss": 0.4649,
+      "step": 74720
+    },
+    {
+      "epoch": 199.28,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002927979093062018,
+      "loss": 0.4784,
+      "step": 74730
+    },
+    {
+      "epoch": 199.30666666666667,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029279598435459116,
+      "loss": 0.4593,
+      "step": 74740
+    },
+    {
+      "epoch": 199.33333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029279405915209677,
+      "loss": 0.4742,
+      "step": 74750
+    },
+    {
+      "epoch": 199.36,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029279213369872207,
+      "loss": 0.4768,
+      "step": 74760
+    },
+    {
+      "epoch": 199.38666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002927902079944704,
+      "loss": 0.4682,
+      "step": 74770
+    },
+    {
+      "epoch": 199.41333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029278828203934516,
+      "loss": 0.4733,
+      "step": 74780
+    },
+    {
+      "epoch": 199.44,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002927863558333498,
+      "loss": 0.4756,
+      "step": 74790
+    },
+    {
+      "epoch": 199.46666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002927844293764876,
+      "loss": 0.4861,
+      "step": 74800
+    },
+    {
+      "epoch": 199.49333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.000292782502668762,
+      "loss": 0.476,
+      "step": 74810
+    },
+    {
+      "epoch": 199.52,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029278057571017633,
+      "loss": 0.471,
+      "step": 74820
+    },
+    {
+      "epoch": 199.54666666666665,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029277864850073406,
+      "loss": 0.4683,
+      "step": 74830
+    },
+    {
+      "epoch": 199.57333333333332,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0002927767210404386,
+      "loss": 0.4676,
+      "step": 74840
+    },
+    {
+      "epoch": 199.6,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029277479332929314,
+      "loss": 0.4602,
+      "step": 74850
+    },
+    {
+      "epoch": 199.62666666666667,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002927728653673013,
+      "loss": 0.448,
+      "step": 74860
+    },
+    {
+      "epoch": 199.65333333333334,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002927709371544663,
+      "loss": 0.4674,
+      "step": 74870
+    },
+    {
+      "epoch": 199.68,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029276900869079165,
+      "loss": 0.459,
+      "step": 74880
+    },
+    {
+      "epoch": 199.70666666666668,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002927670799762807,
+      "loss": 0.4599,
+      "step": 74890
+    },
+    {
+      "epoch": 199.73333333333332,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002927651510109368,
+      "loss": 0.4786,
+      "step": 74900
+    },
+    {
+      "epoch": 199.76,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029276322179476336,
+      "loss": 0.4789,
+      "step": 74910
+    },
+    {
+      "epoch": 199.78666666666666,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002927612923277637,
+      "loss": 0.4784,
+      "step": 74920
+    },
+    {
+      "epoch": 199.81333333333333,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002927593626099414,
+      "loss": 0.4634,
+      "step": 74930
+    },
+    {
+      "epoch": 199.84,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002927574326412996,
+      "loss": 0.4694,
+      "step": 74940
+    },
+    {
+      "epoch": 199.86666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029275550242184193,
+      "loss": 0.4685,
+      "step": 74950
+    },
+    {
+      "epoch": 199.89333333333335,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029275357195157164,
+      "loss": 0.4598,
+      "step": 74960
+    },
+    {
+      "epoch": 199.92,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029275164123049216,
+      "loss": 0.4703,
+      "step": 74970
+    },
+    {
+      "epoch": 199.94666666666666,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029274971025860684,
+      "loss": 0.4625,
+      "step": 74980
+    },
+    {
+      "epoch": 199.97333333333333,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00029274777903591916,
+      "loss": 0.4737,
+      "step": 74990
+    },
+    {
+      "epoch": 200.0,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002927458475624324,
+      "loss": 0.4624,
+      "step": 75000
+    },
+    {
+      "epoch": 200.0,
+      "eval_loss": 0.47762927412986755,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.3295,
+      "eval_samples_per_second": 1.549,
+      "eval_steps_per_second": 0.097,
+      "step": 75000
+    },
+    {
+      "epoch": 200.02666666666667,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002927439158381501,
+      "loss": 0.483,
+      "step": 75010
+    },
+    {
+      "epoch": 200.05333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029274198386307554,
+      "loss": 0.4882,
+      "step": 75020
+    },
+    {
+      "epoch": 200.08,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029274005163721213,
+      "loss": 0.4755,
+      "step": 75030
+    },
+    {
+      "epoch": 200.10666666666665,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029273811916056327,
+      "loss": 0.471,
+      "step": 75040
+    },
+    {
+      "epoch": 200.13333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029273618643313237,
+      "loss": 0.4726,
+      "step": 75050
+    },
+    {
+      "epoch": 200.16,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002927342534549228,
+      "loss": 0.4768,
+      "step": 75060
+    },
+    {
+      "epoch": 200.18666666666667,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.000292732320225938,
+      "loss": 0.4684,
+      "step": 75070
+    },
+    {
+      "epoch": 200.21333333333334,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00029273038674618136,
+      "loss": 0.4631,
+      "step": 75080
+    },
+    {
+      "epoch": 200.24,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029272845301565626,
+      "loss": 0.4672,
+      "step": 75090
+    },
+    {
+      "epoch": 200.26666666666668,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002927265190343661,
+      "loss": 0.472,
+      "step": 75100
+    },
+    {
+      "epoch": 200.29333333333332,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002927245848023142,
+      "loss": 0.4651,
+      "step": 75110
+    },
+    {
+      "epoch": 200.32,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0002927226503195041,
+      "loss": 0.469,
+      "step": 75120
+    },
+    {
+      "epoch": 200.34666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029272071558593914,
+      "loss": 0.4771,
+      "step": 75130
+    },
+    {
+      "epoch": 200.37333333333333,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029271878060162265,
+      "loss": 0.4726,
+      "step": 75140
+    },
+    {
+      "epoch": 200.4,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029271684536655816,
+      "loss": 0.4694,
+      "step": 75150
+    },
+    {
+      "epoch": 200.42666666666668,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029271490988074894,
+      "loss": 0.4759,
+      "step": 75160
+    },
+    {
+      "epoch": 200.45333333333335,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002927129741441985,
+      "loss": 0.4783,
+      "step": 75170
+    },
+    {
+      "epoch": 200.48,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029271103815691016,
+      "loss": 0.4843,
+      "step": 75180
+    },
+    {
+      "epoch": 200.50666666666666,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002927091019188874,
+      "loss": 0.4742,
+      "step": 75190
+    },
+    {
+      "epoch": 200.53333333333333,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00029270716543013353,
+      "loss": 0.4685,
+      "step": 75200
+    },
+    {
+      "epoch": 200.56,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.000292705228690652,
+      "loss": 0.4693,
+      "step": 75210
+    },
+    {
+      "epoch": 200.58666666666667,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029270329170044625,
+      "loss": 0.4647,
+      "step": 75220
+    },
+    {
+      "epoch": 200.61333333333334,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002927013544595196,
+      "loss": 0.4507,
+      "step": 75230
+    },
+    {
+      "epoch": 200.64,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029269941696787553,
+      "loss": 0.4557,
+      "step": 75240
+    },
+    {
+      "epoch": 200.66666666666666,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002926974792255174,
+      "loss": 0.471,
+      "step": 75250
+    },
+    {
+      "epoch": 200.69333333333333,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029269554123244866,
+      "loss": 0.4542,
+      "step": 75260
+    },
+    {
+      "epoch": 200.72,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002926936029886726,
+      "loss": 0.4716,
+      "step": 75270
+    },
+    {
+      "epoch": 200.74666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002926916644941928,
+      "loss": 0.4756,
+      "step": 75280
+    },
+    {
+      "epoch": 200.77333333333334,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002926897257490125,
+      "loss": 0.4825,
+      "step": 75290
+    },
+    {
+      "epoch": 200.8,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002926877867531352,
+      "loss": 0.4702,
+      "step": 75300
+    },
+    {
+      "epoch": 200.82666666666665,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029268584750656427,
+      "loss": 0.4645,
+      "step": 75310
+    },
+    {
+      "epoch": 200.85333333333332,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002926839080093031,
+      "loss": 0.4732,
+      "step": 75320
+    },
+    {
+      "epoch": 200.88,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029268196826135525,
+      "loss": 0.4612,
+      "step": 75330
+    },
+    {
+      "epoch": 200.90666666666667,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029268002826272394,
+      "loss": 0.464,
+      "step": 75340
+    },
+    {
+      "epoch": 200.93333333333334,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0002926780880134127,
+      "loss": 0.4661,
+      "step": 75350
+    },
+    {
+      "epoch": 200.96,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0002926761475134248,
+      "loss": 0.4644,
+      "step": 75360
+    },
+    {
+      "epoch": 200.98666666666668,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.00029267420676276374,
+      "loss": 0.4775,
+      "step": 75370
+    },
+    {
+      "epoch": 201.0,
+      "eval_loss": 0.4787846803665161,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 11.2928,
+      "eval_samples_per_second": 1.417,
+      "eval_steps_per_second": 0.089,
+      "step": 75375
+    },
+    {
+      "epoch": 201.01333333333332,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029267226576143294,
+      "loss": 0.4674,
+      "step": 75380
+    },
+    {
+      "epoch": 201.04,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002926703245094358,
+      "loss": 0.4915,
+      "step": 75390
+    },
+    {
+      "epoch": 201.06666666666666,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0002926683830067757,
+      "loss": 0.4789,
+      "step": 75400
+    },
+    {
+      "epoch": 201.09333333333333,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002926664412534561,
+      "loss": 0.4734,
+      "step": 75410
+    },
+    {
+      "epoch": 201.12,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029266449924948045,
+      "loss": 0.47,
+      "step": 75420
+    },
+    {
+      "epoch": 201.14666666666668,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029266255699485205,
+      "loss": 0.4779,
+      "step": 75430
+    },
+    {
+      "epoch": 201.17333333333335,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029266061448957433,
+      "loss": 0.4689,
+      "step": 75440
+    },
+    {
+      "epoch": 201.2,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029265867173365075,
+      "loss": 0.4671,
+      "step": 75450
+    },
+    {
+      "epoch": 201.22666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002926567287270847,
+      "loss": 0.4653,
+      "step": 75460
+    },
+    {
+      "epoch": 201.25333333333333,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029265478546987967,
+      "loss": 0.4645,
+      "step": 75470
+    },
+    {
+      "epoch": 201.28,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029265284196203895,
+      "loss": 0.4787,
+      "step": 75480
+    },
+    {
+      "epoch": 201.30666666666667,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029265089820356603,
+      "loss": 0.4588,
+      "step": 75490
+    },
+    {
+      "epoch": 201.33333333333334,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002926489541944643,
+      "loss": 0.4753,
+      "step": 75500
+    },
+    {
+      "epoch": 201.36,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0002926470099347371,
+      "loss": 0.4768,
+      "step": 75510
+    },
+    {
+      "epoch": 201.38666666666666,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.00029264506542438804,
+      "loss": 0.4689,
+      "step": 75520
+    },
+    {
+      "epoch": 201.41333333333333,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0002926431206634204,
+      "loss": 0.4733,
+      "step": 75530
+    },
+    {
+      "epoch": 201.44,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002926411756518376,
+      "loss": 0.4765,
+      "step": 75540
+    },
+    {
+      "epoch": 201.46666666666667,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002926392303896431,
+      "loss": 0.4855,
+      "step": 75550
+    },
+    {
+      "epoch": 201.49333333333334,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029263728487684027,
+      "loss": 0.4767,
+      "step": 75560
+    },
+    {
+      "epoch": 201.52,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00029263533911343256,
+      "loss": 0.4712,
+      "step": 75570
+    },
+    {
+      "epoch": 201.54666666666665,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029263339309942344,
+      "loss": 0.4683,
+      "step": 75580
+    },
+    {
+      "epoch": 201.57333333333332,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002926314468348162,
+      "loss": 0.4672,
+      "step": 75590
+    },
+    {
+      "epoch": 201.6,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.00029262950031961433,
+      "loss": 0.4607,
+      "step": 75600
+    },
+    {
+      "epoch": 201.62666666666667,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.00029262755355382127,
+      "loss": 0.4481,
+      "step": 75610
+    },
+    {
+      "epoch": 201.65333333333334,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002926256065374404,
+      "loss": 0.4674,
+      "step": 75620
+    },
+    {
+      "epoch": 201.68,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00029262365927047523,
+      "loss": 0.4583,
+      "step": 75630
+    },
+    {
+      "epoch": 201.70666666666668,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029262171175292907,
+      "loss": 0.4601,
+      "step": 75640
+    },
+    {
+      "epoch": 201.73333333333332,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002926197639848053,
+      "loss": 0.4784,
+      "step": 75650
+    },
+    {
+      "epoch": 201.76,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029261781596610757,
+      "loss": 0.4801,
+      "step": 75660
+    },
+    {
+      "epoch": 201.78666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002926158676968391,
+      "loss": 0.478,
+      "step": 75670
+    },
+    {
+      "epoch": 201.81333333333333,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002926139191770034,
+      "loss": 0.4641,
+      "step": 75680
+    },
+    {
+      "epoch": 201.84,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002926119704066038,
+      "loss": 0.4696,
+      "step": 75690
+    },
+    {
+      "epoch": 201.86666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029261002138564386,
+      "loss": 0.4687,
+      "step": 75700
+    },
+    {
+      "epoch": 201.89333333333335,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00029260807211412687,
+      "loss": 0.4593,
+      "step": 75710
+    },
+    {
+      "epoch": 201.92,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029260612259205635,
+      "loss": 0.4706,
+      "step": 75720
+    },
+    {
+      "epoch": 201.94666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002926041728194357,
+      "loss": 0.4622,
+      "step": 75730
+    },
+    {
+      "epoch": 201.97333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029260222279626835,
+      "loss": 0.4738,
+      "step": 75740
+    },
+    {
+      "epoch": 202.0,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002926002725225577,
+      "loss": 0.4626,
+      "step": 75750
+    },
+    {
+      "epoch": 202.0,
+      "eval_loss": 0.4782838225364685,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.7972,
+      "eval_samples_per_second": 1.482,
+      "eval_steps_per_second": 0.093,
+      "step": 75750
+    },
+    {
+      "epoch": 202.02666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002925983219983072,
+      "loss": 0.483,
+      "step": 75760
+    },
+    {
+      "epoch": 202.05333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029259637122352024,
+      "loss": 0.4883,
+      "step": 75770
+    },
+    {
+      "epoch": 202.08,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002925944201982003,
+      "loss": 0.4755,
+      "step": 75780
+    },
+    {
+      "epoch": 202.10666666666665,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002925924689223508,
+      "loss": 0.4707,
+      "step": 75790
+    },
+    {
+      "epoch": 202.13333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029259051739597515,
+      "loss": 0.472,
+      "step": 75800
+    },
+    {
+      "epoch": 202.16,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029258856561907673,
+      "loss": 0.4762,
+      "step": 75810
+    },
+    {
+      "epoch": 202.18666666666667,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029258661359165904,
+      "loss": 0.4682,
+      "step": 75820
+    },
+    {
+      "epoch": 202.21333333333334,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029258466131372554,
+      "loss": 0.4632,
+      "step": 75830
+    },
+    {
+      "epoch": 202.24,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002925827087852796,
+      "loss": 0.4678,
+      "step": 75840
+    },
+    {
+      "epoch": 202.26666666666668,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002925807560063246,
+      "loss": 0.472,
+      "step": 75850
+    },
+    {
+      "epoch": 202.29333333333332,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0002925788029768641,
+      "loss": 0.4659,
+      "step": 75860
+    },
+    {
+      "epoch": 202.32,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002925768496969014,
+      "loss": 0.469,
+      "step": 75870
+    },
+    {
+      "epoch": 202.34666666666666,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029257489616644003,
+      "loss": 0.4774,
+      "step": 75880
+    },
+    {
+      "epoch": 202.37333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002925729423854834,
+      "loss": 0.4736,
+      "step": 75890
+    },
+    {
+      "epoch": 202.4,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002925709883540349,
+      "loss": 0.4697,
+      "step": 75900
+    },
+    {
+      "epoch": 202.42666666666668,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.000292569034072098,
+      "loss": 0.4755,
+      "step": 75910
+    },
+    {
+      "epoch": 202.45333333333335,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029256707953967615,
+      "loss": 0.4776,
+      "step": 75920
+    },
+    {
+      "epoch": 202.48,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029256512475677273,
+      "loss": 0.4843,
+      "step": 75930
+    },
+    {
+      "epoch": 202.50666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029256316972339124,
+      "loss": 0.474,
+      "step": 75940
+    },
+    {
+      "epoch": 202.53333333333333,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002925612144395351,
+      "loss": 0.4684,
+      "step": 75950
+    },
+    {
+      "epoch": 202.56,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002925592589052076,
+      "loss": 0.4688,
+      "step": 75960
+    },
+    {
+      "epoch": 202.58666666666667,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029255730312041244,
+      "loss": 0.4647,
+      "step": 75970
+    },
+    {
+      "epoch": 202.61333333333334,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00029255534708515283,
+      "loss": 0.4511,
+      "step": 75980
+    },
+    {
+      "epoch": 202.64,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00029255339079943234,
+      "loss": 0.4562,
+      "step": 75990
+    },
+    {
+      "epoch": 202.66666666666666,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029255143426325437,
+      "loss": 0.4718,
+      "step": 76000
+    },
+    {
+      "epoch": 202.69333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002925494774766223,
+      "loss": 0.4546,
+      "step": 76010
+    },
+    {
+      "epoch": 202.72,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029254752043953965,
+      "loss": 0.4711,
+      "step": 76020
+    },
+    {
+      "epoch": 202.74666666666667,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002925455631520098,
+      "loss": 0.4752,
+      "step": 76030
+    },
+    {
+      "epoch": 202.77333333333334,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0002925436056140362,
+      "loss": 0.4828,
+      "step": 76040
+    },
+    {
+      "epoch": 202.8,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00029254164782562236,
+      "loss": 0.4701,
+      "step": 76050
+    },
+    {
+      "epoch": 202.82666666666665,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002925396897867716,
+      "loss": 0.4646,
+      "step": 76060
+    },
+    {
+      "epoch": 202.85333333333332,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.00029253773149748747,
+      "loss": 0.4736,
+      "step": 76070
+    },
+    {
+      "epoch": 202.88,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002925357729577733,
+      "loss": 0.4611,
+      "step": 76080
+    },
+    {
+      "epoch": 202.90666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002925338141676327,
+      "loss": 0.4641,
+      "step": 76090
+    },
+    {
+      "epoch": 202.93333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029253185512706894,
+      "loss": 0.466,
+      "step": 76100
+    },
+    {
+      "epoch": 202.96,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002925298958360855,
+      "loss": 0.4636,
+      "step": 76110
+    },
+    {
+      "epoch": 202.98666666666668,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002925279362946859,
+      "loss": 0.478,
+      "step": 76120
+    },
+    {
+      "epoch": 203.0,
+      "eval_loss": 0.4781941771507263,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.5103,
+      "eval_samples_per_second": 1.522,
+      "eval_steps_per_second": 0.095,
+      "step": 76125
+    },
+    {
+      "epoch": 203.01333333333332,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.00029252597650287354,
+      "loss": 0.4664,
+      "step": 76130
+    },
+    {
+      "epoch": 203.04,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002925240164606518,
+      "loss": 0.4904,
+      "step": 76140
+    },
+    {
+      "epoch": 203.06666666666666,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.00029252205616802425,
+      "loss": 0.4791,
+      "step": 76150
+    },
+    {
+      "epoch": 203.09333333333333,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002925200956249942,
+      "loss": 0.4733,
+      "step": 76160
+    },
+    {
+      "epoch": 203.12,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002925181348315652,
+      "loss": 0.4697,
+      "step": 76170
+    },
+    {
+      "epoch": 203.14666666666668,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029251617378774064,
+      "loss": 0.4775,
+      "step": 76180
+    },
+    {
+      "epoch": 203.17333333333335,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029251421249352393,
+      "loss": 0.4685,
+      "step": 76190
+    },
+    {
+      "epoch": 203.2,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00029251225094891864,
+      "loss": 0.467,
+      "step": 76200
+    },
+    {
+      "epoch": 203.22666666666666,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002925102891539281,
+      "loss": 0.4645,
+      "step": 76210
+    },
+    {
+      "epoch": 203.25333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002925083271085558,
+      "loss": 0.4651,
+      "step": 76220
+    },
+    {
+      "epoch": 203.28,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029250636481280523,
+      "loss": 0.4787,
+      "step": 76230
+    },
+    {
+      "epoch": 203.30666666666667,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029250440226667973,
+      "loss": 0.4601,
+      "step": 76240
+    },
+    {
+      "epoch": 203.33333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00029250243947018283,
+      "loss": 0.4743,
+      "step": 76250
+    },
+    {
+      "epoch": 203.36,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.000292500476423318,
+      "loss": 0.4769,
+      "step": 76260
+    },
+    {
+      "epoch": 203.38666666666666,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029249851312608863,
+      "loss": 0.4686,
+      "step": 76270
+    },
+    {
+      "epoch": 203.41333333333333,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002924965495784981,
+      "loss": 0.4727,
+      "step": 76280
+    },
+    {
+      "epoch": 203.44,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00029249458578055,
+      "loss": 0.4764,
+      "step": 76290
+    },
+    {
+      "epoch": 203.46666666666667,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.00029249262173224776,
+      "loss": 0.4856,
+      "step": 76300
+    },
+    {
+      "epoch": 203.49333333333334,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002924906574335948,
+      "loss": 0.4769,
+      "step": 76310
+    },
+    {
+      "epoch": 203.52,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029248869288459457,
+      "loss": 0.4712,
+      "step": 76320
+    },
+    {
+      "epoch": 203.54666666666665,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029248672808525046,
+      "loss": 0.4685,
+      "step": 76330
+    },
+    {
+      "epoch": 203.57333333333332,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.000292484763035566,
+      "loss": 0.467,
+      "step": 76340
+    },
+    {
+      "epoch": 203.6,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002924827977355447,
+      "loss": 0.4602,
+      "step": 76350
+    },
+    {
+      "epoch": 203.62666666666667,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002924808321851899,
+      "loss": 0.4484,
+      "step": 76360
+    },
+    {
+      "epoch": 203.65333333333334,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002924788663845051,
+      "loss": 0.4676,
+      "step": 76370
+    },
+    {
+      "epoch": 203.68,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002924769003334937,
+      "loss": 0.4592,
+      "step": 76380
+    },
+    {
+      "epoch": 203.70666666666668,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0002924749340321592,
+      "loss": 0.4601,
+      "step": 76390
+    },
+    {
+      "epoch": 203.73333333333332,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0002924729674805051,
+      "loss": 0.4785,
+      "step": 76400
+    },
+    {
+      "epoch": 203.76,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002924710006785348,
+      "loss": 0.4796,
+      "step": 76410
+    },
+    {
+      "epoch": 203.78666666666666,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002924690336262517,
+      "loss": 0.4782,
+      "step": 76420
+    },
+    {
+      "epoch": 203.81333333333333,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.00029246706632365943,
+      "loss": 0.4634,
+      "step": 76430
+    },
+    {
+      "epoch": 203.84,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029246509877076126,
+      "loss": 0.4696,
+      "step": 76440
+    },
+    {
+      "epoch": 203.86666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00029246313096756077,
+      "loss": 0.4683,
+      "step": 76450
+    },
+    {
+      "epoch": 203.89333333333335,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002924611629140613,
+      "loss": 0.4601,
+      "step": 76460
+    },
+    {
+      "epoch": 203.92,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029245919461026645,
+      "loss": 0.4704,
+      "step": 76470
+    },
+    {
+      "epoch": 203.94666666666666,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029245722605617956,
+      "loss": 0.4624,
+      "step": 76480
+    },
+    {
+      "epoch": 203.97333333333333,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002924552572518042,
+      "loss": 0.474,
+      "step": 76490
+    },
+    {
+      "epoch": 204.0,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002924532881971437,
+      "loss": 0.4632,
+      "step": 76500
+    },
+    {
+      "epoch": 204.0,
+      "eval_loss": 0.4801044166088104,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.9231,
+      "eval_samples_per_second": 1.612,
+      "eval_steps_per_second": 0.101,
+      "step": 76500
+    },
+    {
+      "epoch": 204.02666666666667,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002924513188922016,
+      "loss": 0.4826,
+      "step": 76510
+    },
+    {
+      "epoch": 204.05333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029244934933698134,
+      "loss": 0.4887,
+      "step": 76520
+    },
+    {
+      "epoch": 204.08,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002924473795314864,
+      "loss": 0.4756,
+      "step": 76530
+    },
+    {
+      "epoch": 204.10666666666665,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0002924454094757202,
+      "loss": 0.4697,
+      "step": 76540
+    },
+    {
+      "epoch": 204.13333333333333,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002924434391696863,
+      "loss": 0.4713,
+      "step": 76550
+    },
+    {
+      "epoch": 204.16,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029244146861338806,
+      "loss": 0.4767,
+      "step": 76560
+    },
+    {
+      "epoch": 204.18666666666667,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029243949780682893,
+      "loss": 0.4683,
+      "step": 76570
+    },
+    {
+      "epoch": 204.21333333333334,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002924375267500124,
+      "loss": 0.4637,
+      "step": 76580
+    },
+    {
+      "epoch": 204.24,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029243555544294204,
+      "loss": 0.4679,
+      "step": 76590
+    },
+    {
+      "epoch": 204.26666666666668,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.00029243358388562115,
+      "loss": 0.4718,
+      "step": 76600
+    },
+    {
+      "epoch": 204.29333333333332,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029243161207805323,
+      "loss": 0.4654,
+      "step": 76610
+    },
+    {
+      "epoch": 204.32,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00029242964002024186,
+      "loss": 0.4683,
+      "step": 76620
+    },
+    {
+      "epoch": 204.34666666666666,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002924276677121904,
+      "loss": 0.4771,
+      "step": 76630
+    },
+    {
+      "epoch": 204.37333333333333,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029242569515390234,
+      "loss": 0.4725,
+      "step": 76640
+    },
+    {
+      "epoch": 204.4,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002924237223453811,
+      "loss": 0.4703,
+      "step": 76650
+    },
+    {
+      "epoch": 204.42666666666668,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0002924217492866302,
+      "loss": 0.4754,
+      "step": 76660
+    },
+    {
+      "epoch": 204.45333333333335,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029241977597765315,
+      "loss": 0.4781,
+      "step": 76670
+    },
+    {
+      "epoch": 204.48,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0002924178024184533,
+      "loss": 0.4838,
+      "step": 76680
+    },
+    {
+      "epoch": 204.50666666666666,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002924158286090342,
+      "loss": 0.4737,
+      "step": 76690
+    },
+    {
+      "epoch": 204.53333333333333,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029241385454939934,
+      "loss": 0.4676,
+      "step": 76700
+    },
+    {
+      "epoch": 204.56,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002924118802395521,
+      "loss": 0.4679,
+      "step": 76710
+    },
+    {
+      "epoch": 204.58666666666667,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.000292409905679496,
+      "loss": 0.4651,
+      "step": 76720
+    },
+    {
+      "epoch": 204.61333333333334,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029240793086923457,
+      "loss": 0.4506,
+      "step": 76730
+    },
+    {
+      "epoch": 204.64,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0002924059558087711,
+      "loss": 0.456,
+      "step": 76740
+    },
+    {
+      "epoch": 204.66666666666666,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029240398049810924,
+      "loss": 0.471,
+      "step": 76750
+    },
+    {
+      "epoch": 204.69333333333333,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002924020049372524,
+      "loss": 0.4546,
+      "step": 76760
+    },
+    {
+      "epoch": 204.72,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029240002912620406,
+      "loss": 0.4711,
+      "step": 76770
+    },
+    {
+      "epoch": 204.74666666666667,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029239805306496765,
+      "loss": 0.4751,
+      "step": 76780
+    },
+    {
+      "epoch": 204.77333333333334,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00029239607675354666,
+      "loss": 0.4825,
+      "step": 76790
+    },
+    {
+      "epoch": 204.8,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002923941001919446,
+      "loss": 0.4697,
+      "step": 76800
+    },
+    {
+      "epoch": 204.82666666666665,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002923921233801649,
+      "loss": 0.465,
+      "step": 76810
+    },
+    {
+      "epoch": 204.85333333333332,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00029239014631821103,
+      "loss": 0.4733,
+      "step": 76820
+    },
+    {
+      "epoch": 204.88,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0002923881690060865,
+      "loss": 0.4615,
+      "step": 76830
+    },
+    {
+      "epoch": 204.90666666666667,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002923861914437948,
+      "loss": 0.464,
+      "step": 76840
+    },
+    {
+      "epoch": 204.93333333333334,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0002923842136313393,
+      "loss": 0.4658,
+      "step": 76850
+    },
+    {
+      "epoch": 204.96,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002923822355687236,
+      "loss": 0.4642,
+      "step": 76860
+    },
+    {
+      "epoch": 204.98666666666668,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002923802572559511,
+      "loss": 0.4777,
+      "step": 76870
+    },
+    {
+      "epoch": 205.0,
+      "eval_loss": 0.4777953624725342,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.4051,
+      "eval_samples_per_second": 1.538,
+      "eval_steps_per_second": 0.096,
+      "step": 76875
+    },
+    {
+      "epoch": 205.01333333333332,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029237827869302524,
+      "loss": 0.4664,
+      "step": 76880
+    },
+    {
+      "epoch": 205.04,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00029237629987994966,
+      "loss": 0.4905,
+      "step": 76890
+    },
+    {
+      "epoch": 205.06666666666666,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029237432081672765,
+      "loss": 0.4783,
+      "step": 76900
+    },
+    {
+      "epoch": 205.09333333333333,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002923723415033628,
+      "loss": 0.4725,
+      "step": 76910
+    },
+    {
+      "epoch": 205.12,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00029237036193985853,
+      "loss": 0.4696,
+      "step": 76920
+    },
+    {
+      "epoch": 205.14666666666668,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029236838212621835,
+      "loss": 0.4788,
+      "step": 76930
+    },
+    {
+      "epoch": 205.17333333333335,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00029236640206244574,
+      "loss": 0.4689,
+      "step": 76940
+    },
+    {
+      "epoch": 205.2,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029236442174854416,
+      "loss": 0.4672,
+      "step": 76950
+    },
+    {
+      "epoch": 205.22666666666666,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002923624411845171,
+      "loss": 0.4655,
+      "step": 76960
+    },
+    {
+      "epoch": 205.25333333333333,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029236046037036805,
+      "loss": 0.465,
+      "step": 76970
+    },
+    {
+      "epoch": 205.28,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029235847930610045,
+      "loss": 0.4785,
+      "step": 76980
+    },
+    {
+      "epoch": 205.30666666666667,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002923564979917178,
+      "loss": 0.4592,
+      "step": 76990
+    },
+    {
+      "epoch": 205.33333333333334,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002923545164272236,
+      "loss": 0.4754,
+      "step": 77000
+    },
+    {
+      "epoch": 205.36,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002923525346126213,
+      "loss": 0.4771,
+      "step": 77010
+    },
+    {
+      "epoch": 205.38666666666666,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002923505525479145,
+      "loss": 0.4689,
+      "step": 77020
+    },
+    {
+      "epoch": 205.41333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002923485702331065,
+      "loss": 0.4732,
+      "step": 77030
+    },
+    {
+      "epoch": 205.44,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0002923465876682009,
+      "loss": 0.4758,
+      "step": 77040
+    },
+    {
+      "epoch": 205.46666666666667,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002923446048532011,
+      "loss": 0.4848,
+      "step": 77050
+    },
+    {
+      "epoch": 205.49333333333334,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002923426217881107,
+      "loss": 0.477,
+      "step": 77060
+    },
+    {
+      "epoch": 205.52,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002923406384729331,
+      "loss": 0.471,
+      "step": 77070
+    },
+    {
+      "epoch": 205.54666666666665,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029233865490767177,
+      "loss": 0.4677,
+      "step": 77080
+    },
+    {
+      "epoch": 205.57333333333332,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0002923366710923302,
+      "loss": 0.4673,
+      "step": 77090
+    },
+    {
+      "epoch": 205.6,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029233468702691203,
+      "loss": 0.461,
+      "step": 77100
+    },
+    {
+      "epoch": 205.62666666666667,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029233270271142054,
+      "loss": 0.4477,
+      "step": 77110
+    },
+    {
+      "epoch": 205.65333333333334,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002923307181458593,
+      "loss": 0.4675,
+      "step": 77120
+    },
+    {
+      "epoch": 205.68,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002923287333302318,
+      "loss": 0.4589,
+      "step": 77130
+    },
+    {
+      "epoch": 205.70666666666668,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002923267482645415,
+      "loss": 0.4595,
+      "step": 77140
+    },
+    {
+      "epoch": 205.73333333333332,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.00029232476294879193,
+      "loss": 0.4778,
+      "step": 77150
+    },
+    {
+      "epoch": 205.76,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002923227773829865,
+      "loss": 0.4792,
+      "step": 77160
+    },
+    {
+      "epoch": 205.78666666666666,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002923207915671288,
+      "loss": 0.4774,
+      "step": 77170
+    },
+    {
+      "epoch": 205.81333333333333,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029231880550122227,
+      "loss": 0.4634,
+      "step": 77180
+    },
+    {
+      "epoch": 205.84,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029231681918527044,
+      "loss": 0.4692,
+      "step": 77190
+    },
+    {
+      "epoch": 205.86666666666667,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0002923148326192767,
+      "loss": 0.4681,
+      "step": 77200
+    },
+    {
+      "epoch": 205.89333333333335,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029231284580324464,
+      "loss": 0.4599,
+      "step": 77210
+    },
+    {
+      "epoch": 205.92,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002923108587371777,
+      "loss": 0.4704,
+      "step": 77220
+    },
+    {
+      "epoch": 205.94666666666666,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029230887142107936,
+      "loss": 0.4624,
+      "step": 77230
+    },
+    {
+      "epoch": 205.97333333333333,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00029230688385495325,
+      "loss": 0.4739,
+      "step": 77240
+    },
+    {
+      "epoch": 206.0,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00029230489603880263,
+      "loss": 0.4626,
+      "step": 77250
+    },
+    {
+      "epoch": 206.0,
+      "eval_loss": 0.47875362634658813,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.8448,
+      "eval_samples_per_second": 1.625,
+      "eval_steps_per_second": 0.102,
+      "step": 77250
+    },
+    {
+      "epoch": 206.02666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00029230290797263114,
+      "loss": 0.4827,
+      "step": 77260
+    },
+    {
+      "epoch": 206.05333333333334,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002923009196564422,
+      "loss": 0.488,
+      "step": 77270
+    },
+    {
+      "epoch": 206.08,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029229893109023944,
+      "loss": 0.4754,
+      "step": 77280
+    },
+    {
+      "epoch": 206.10666666666665,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002922969422740262,
+      "loss": 0.4707,
+      "step": 77290
+    },
+    {
+      "epoch": 206.13333333333333,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029229495320780605,
+      "loss": 0.4717,
+      "step": 77300
+    },
+    {
+      "epoch": 206.16,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002922929638915825,
+      "loss": 0.4768,
+      "step": 77310
+    },
+    {
+      "epoch": 206.18666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029229097432535895,
+      "loss": 0.4683,
+      "step": 77320
+    },
+    {
+      "epoch": 206.21333333333334,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.00029228898450913896,
+      "loss": 0.4633,
+      "step": 77330
+    },
+    {
+      "epoch": 206.24,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029228699444292606,
+      "loss": 0.467,
+      "step": 77340
+    },
+    {
+      "epoch": 206.26666666666668,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029228500412672374,
+      "loss": 0.4712,
+      "step": 77350
+    },
+    {
+      "epoch": 206.29333333333332,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002922830135605354,
+      "loss": 0.466,
+      "step": 77360
+    },
+    {
+      "epoch": 206.32,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029228102274436464,
+      "loss": 0.4692,
+      "step": 77370
+    },
+    {
+      "epoch": 206.34666666666666,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00029227903167821495,
+      "loss": 0.4771,
+      "step": 77380
+    },
+    {
+      "epoch": 206.37333333333333,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029227704036208983,
+      "loss": 0.4729,
+      "step": 77390
+    },
+    {
+      "epoch": 206.4,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002922750487959926,
+      "loss": 0.4699,
+      "step": 77400
+    },
+    {
+      "epoch": 206.42666666666668,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002922730569799271,
+      "loss": 0.476,
+      "step": 77410
+    },
+    {
+      "epoch": 206.45333333333335,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002922710649138965,
+      "loss": 0.4777,
+      "step": 77420
+    },
+    {
+      "epoch": 206.48,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0002922690725979045,
+      "loss": 0.485,
+      "step": 77430
+    },
+    {
+      "epoch": 206.50666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00029226708003195456,
+      "loss": 0.4745,
+      "step": 77440
+    },
+    {
+      "epoch": 206.53333333333333,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00029226508721605013,
+      "loss": 0.4676,
+      "step": 77450
+    },
+    {
+      "epoch": 206.56,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00029226309415019476,
+      "loss": 0.4687,
+      "step": 77460
+    },
+    {
+      "epoch": 206.58666666666667,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00029226110083439193,
+      "loss": 0.4649,
+      "step": 77470
+    },
+    {
+      "epoch": 206.61333333333334,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00029225910726864515,
+      "loss": 0.4507,
+      "step": 77480
+    },
+    {
+      "epoch": 206.64,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002922571134529579,
+      "loss": 0.4555,
+      "step": 77490
+    },
+    {
+      "epoch": 206.66666666666666,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0002922551193873337,
+      "loss": 0.4719,
+      "step": 77500
+    },
+    {
+      "epoch": 206.69333333333333,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029225312507177604,
+      "loss": 0.4543,
+      "step": 77510
+    },
+    {
+      "epoch": 206.72,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002922511305062885,
+      "loss": 0.4715,
+      "step": 77520
+    },
+    {
+      "epoch": 206.74666666666667,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002922491356908744,
+      "loss": 0.4756,
+      "step": 77530
+    },
+    {
+      "epoch": 206.77333333333334,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002922471406255375,
+      "loss": 0.4832,
+      "step": 77540
+    },
+    {
+      "epoch": 206.8,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002922451453102811,
+      "loss": 0.4698,
+      "step": 77550
+    },
+    {
+      "epoch": 206.82666666666665,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002922431497451088,
+      "loss": 0.4648,
+      "step": 77560
+    },
+    {
+      "epoch": 206.85333333333332,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0002922411539300241,
+      "loss": 0.4732,
+      "step": 77570
+    },
+    {
+      "epoch": 206.88,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0002922391578650305,
+      "loss": 0.4615,
+      "step": 77580
+    },
+    {
+      "epoch": 206.90666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0002922371615501315,
+      "loss": 0.4643,
+      "step": 77590
+    },
+    {
+      "epoch": 206.93333333333334,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00029223516498533053,
+      "loss": 0.4663,
+      "step": 77600
+    },
+    {
+      "epoch": 206.96,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00029223316817063126,
+      "loss": 0.4634,
+      "step": 77610
+    },
+    {
+      "epoch": 206.98666666666668,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029223117110603706,
+      "loss": 0.4772,
+      "step": 77620
+    },
+    {
+      "epoch": 207.0,
+      "eval_loss": 0.4780822694301605,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.47,
+      "eval_samples_per_second": 1.69,
+      "eval_steps_per_second": 0.106,
+      "step": 77625
+    },
+    {
+      "epoch": 207.01333333333332,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002922291737915515,
+      "loss": 0.4666,
+      "step": 77630
+    },
+    {
+      "epoch": 207.04,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029222717622717813,
+      "loss": 0.4908,
+      "step": 77640
+    },
+    {
+      "epoch": 207.06666666666666,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029222517841292036,
+      "loss": 0.4788,
+      "step": 77650
+    },
+    {
+      "epoch": 207.09333333333333,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00029222318034878174,
+      "loss": 0.4723,
+      "step": 77660
+    },
+    {
+      "epoch": 207.12,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00029222118203476585,
+      "loss": 0.4699,
+      "step": 77670
+    },
+    {
+      "epoch": 207.14666666666668,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.00029221918347087605,
+      "loss": 0.4777,
+      "step": 77680
+    },
+    {
+      "epoch": 207.17333333333335,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.000292217184657116,
+      "loss": 0.4685,
+      "step": 77690
+    },
+    {
+      "epoch": 207.2,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029221518559348917,
+      "loss": 0.4667,
+      "step": 77700
+    },
+    {
+      "epoch": 207.22666666666666,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.000292213186279999,
+      "loss": 0.4651,
+      "step": 77710
+    },
+    {
+      "epoch": 207.25333333333333,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002922111867166491,
+      "loss": 0.4655,
+      "step": 77720
+    },
+    {
+      "epoch": 207.28,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029220918690344295,
+      "loss": 0.4781,
+      "step": 77730
+    },
+    {
+      "epoch": 207.30666666666667,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00029220718684038404,
+      "loss": 0.4597,
+      "step": 77740
+    },
+    {
+      "epoch": 207.33333333333334,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0002922051865274759,
+      "loss": 0.4747,
+      "step": 77750
+    },
+    {
+      "epoch": 207.36,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.000292203185964722,
+      "loss": 0.4767,
+      "step": 77760
+    },
+    {
+      "epoch": 207.38666666666666,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.00029220118515212596,
+      "loss": 0.4689,
+      "step": 77770
+    },
+    {
+      "epoch": 207.41333333333333,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.00029219918408969123,
+      "loss": 0.4729,
+      "step": 77780
+    },
+    {
+      "epoch": 207.44,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002921971827774213,
+      "loss": 0.4759,
+      "step": 77790
+    },
+    {
+      "epoch": 207.46666666666667,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00029219518121531974,
+      "loss": 0.4857,
+      "step": 77800
+    },
+    {
+      "epoch": 207.49333333333334,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.00029219317940339004,
+      "loss": 0.4769,
+      "step": 77810
+    },
+    {
+      "epoch": 207.52,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002921911773416357,
+      "loss": 0.4718,
+      "step": 77820
+    },
+    {
+      "epoch": 207.54666666666665,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029218917503006026,
+      "loss": 0.4678,
+      "step": 77830
+    },
+    {
+      "epoch": 207.57333333333332,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029218717246866724,
+      "loss": 0.4672,
+      "step": 77840
+    },
+    {
+      "epoch": 207.6,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029218516965746015,
+      "loss": 0.46,
+      "step": 77850
+    },
+    {
+      "epoch": 207.62666666666667,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00029218316659644254,
+      "loss": 0.4483,
+      "step": 77860
+    },
+    {
+      "epoch": 207.65333333333334,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00029218116328561786,
+      "loss": 0.4678,
+      "step": 77870
+    },
+    {
+      "epoch": 207.68,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00029217915972498974,
+      "loss": 0.4591,
+      "step": 77880
+    },
+    {
+      "epoch": 207.70666666666668,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.00029217715591456155,
+      "loss": 0.4587,
+      "step": 77890
+    },
+    {
+      "epoch": 207.73333333333332,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0002921751518543369,
+      "loss": 0.4782,
+      "step": 77900
+    },
+    {
+      "epoch": 207.76,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.00029217314754431933,
+      "loss": 0.4789,
+      "step": 77910
+    },
+    {
+      "epoch": 207.78666666666666,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002921711429845124,
+      "loss": 0.4776,
+      "step": 77920
+    },
+    {
+      "epoch": 207.81333333333333,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002921691381749195,
+      "loss": 0.4634,
+      "step": 77930
+    },
+    {
+      "epoch": 207.84,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002921671331155442,
+      "loss": 0.4698,
+      "step": 77940
+    },
+    {
+      "epoch": 207.86666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0002921651278063901,
+      "loss": 0.4691,
+      "step": 77950
+    },
+    {
+      "epoch": 207.89333333333335,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.0002921631222474606,
+      "loss": 0.4591,
+      "step": 77960
+    },
+    {
+      "epoch": 207.92,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0002921611164387593,
+      "loss": 0.4699,
+      "step": 77970
+    },
+    {
+      "epoch": 207.94666666666666,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029215911038028977,
+      "loss": 0.4631,
+      "step": 77980
+    },
+    {
+      "epoch": 207.97333333333333,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002921571040720554,
+      "loss": 0.4742,
+      "step": 77990
+    },
+    {
+      "epoch": 208.0,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0002921550975140598,
+      "loss": 0.4633,
+      "step": 78000
+    },
+    {
+      "epoch": 208.0,
+      "eval_loss": 0.4784125089645386,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 9.5043,
+      "eval_samples_per_second": 1.683,
+      "eval_steps_per_second": 0.105,
+      "step": 78000
+    },
+    {
+      "epoch": 208.02666666666667,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002921530907063065,
+      "loss": 0.4831,
+      "step": 78010
+    },
+    {
+      "epoch": 208.05333333333334,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002921510836487991,
+      "loss": 0.4887,
+      "step": 78020
+    },
+    {
+      "epoch": 208.08,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0002921490763415409,
+      "loss": 0.4754,
+      "step": 78030
+    },
+    {
+      "epoch": 208.10666666666665,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00029214706878453566,
+      "loss": 0.4708,
+      "step": 78040
+    },
+    {
+      "epoch": 208.13333333333333,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002921450609777868,
+      "loss": 0.4719,
+      "step": 78050
+    },
+    {
+      "epoch": 208.16,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002921430529212978,
+      "loss": 0.477,
+      "step": 78060
+    },
+    {
+      "epoch": 208.18666666666667,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002921410446150723,
+      "loss": 0.4683,
+      "step": 78070
+    },
+    {
+      "epoch": 208.21333333333334,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002921390360591138,
+      "loss": 0.4636,
+      "step": 78080
+    },
+    {
+      "epoch": 208.24,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00029213702725342574,
+      "loss": 0.467,
+      "step": 78090
+    },
+    {
+      "epoch": 208.26666666666668,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0002921350181980118,
+      "loss": 0.4718,
+      "step": 78100
+    },
+    {
+      "epoch": 208.29333333333332,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00029213300889287534,
+      "loss": 0.4657,
+      "step": 78110
+    },
+    {
+      "epoch": 208.32,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00029213099933802003,
+      "loss": 0.4696,
+      "step": 78120
+    },
+    {
+      "epoch": 208.34666666666666,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0002921289895334493,
+      "loss": 0.477,
+      "step": 78130
+    },
+    {
+      "epoch": 208.37333333333333,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0002921269794791668,
+      "loss": 0.4725,
+      "step": 78140
+    },
+    {
+      "epoch": 208.4,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029212496917517593,
+      "loss": 0.4697,
+      "step": 78150
+    },
+    {
+      "epoch": 208.42666666666668,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00029212295862148034,
+      "loss": 0.4755,
+      "step": 78160
+    },
+    {
+      "epoch": 208.45333333333335,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0002921209478180834,
+      "loss": 0.4786,
+      "step": 78170
+    },
+    {
+      "epoch": 208.48,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.00029211893676498883,
+      "loss": 0.485,
+      "step": 78180
+    },
+    {
+      "epoch": 208.50666666666666,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002921169254622,
+      "loss": 0.4737,
+      "step": 78190
+    },
+    {
+      "epoch": 208.53333333333333,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029211491390972064,
+      "loss": 0.4679,
+      "step": 78200
+    },
+    {
+      "epoch": 208.56,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0002921129021075541,
+      "loss": 0.4692,
+      "step": 78210
+    },
+    {
+      "epoch": 208.58666666666667,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000292110890055704,
+      "loss": 0.4646,
+      "step": 78220
+    },
+    {
+      "epoch": 208.61333333333334,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00029210887775417383,
+      "loss": 0.4506,
+      "step": 78230
+    },
+    {
+      "epoch": 208.64,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00029210686520296714,
+      "loss": 0.4564,
+      "step": 78240
+    },
+    {
+      "epoch": 208.66666666666666,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0002921048524020875,
+      "loss": 0.4708,
+      "step": 78250
+    },
+    {
+      "epoch": 208.69333333333333,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029210283935153844,
+      "loss": 0.4551,
+      "step": 78260
+    },
+    {
+      "epoch": 208.72,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002921008260513235,
+      "loss": 0.4712,
+      "step": 78270
+    },
+    {
+      "epoch": 208.74666666666667,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029209881250144614,
+      "loss": 0.4752,
+      "step": 78280
+    },
+    {
+      "epoch": 208.77333333333334,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.00029209679870191005,
+      "loss": 0.4824,
+      "step": 78290
+    },
+    {
+      "epoch": 208.8,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00029209478465271857,
+      "loss": 0.4701,
+      "step": 78300
+    },
+    {
+      "epoch": 208.82666666666665,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002920927703538754,
+      "loss": 0.4646,
+      "step": 78310
+    },
+    {
+      "epoch": 208.85333333333332,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.000292090755805384,
+      "loss": 0.4729,
+      "step": 78320
+    },
+    {
+      "epoch": 208.88,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00029208874100724794,
+      "loss": 0.4617,
+      "step": 78330
+    },
+    {
+      "epoch": 208.90666666666667,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029208672595947076,
+      "loss": 0.4643,
+      "step": 78340
+    },
+    {
+      "epoch": 208.93333333333334,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00029208471066205597,
+      "loss": 0.4659,
+      "step": 78350
+    },
+    {
+      "epoch": 208.96,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00029208269511500715,
+      "loss": 0.4637,
+      "step": 78360
+    },
+    {
+      "epoch": 208.98666666666668,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00029208067931832784,
+      "loss": 0.4776,
+      "step": 78370
+    },
+    {
+      "epoch": 209.0,
+      "eval_loss": 0.47980669140815735,
+      "eval_model_preparation_time": 0.0016,
+      "eval_runtime": 10.2353,
+      "eval_samples_per_second": 1.563,
+      "eval_steps_per_second": 0.098,
+      "step": 78375
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 750000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2000,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}