diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,14821 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6299212598425197,
+  "eval_steps": 20,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00031496062992125983,
+      "grad_norm": NaN,
+      "learning_rate": 1e-05,
+      "loss": 0.6279,
+      "step": 1
+    },
+    {
+      "epoch": 0.0006299212598425197,
+      "grad_norm": NaN,
+      "learning_rate": 1e-05,
+      "loss": 0.7073,
+      "step": 2
+    },
+    {
+      "epoch": 0.0009448818897637795,
+      "grad_norm": Infinity,
+      "learning_rate": 1e-05,
+      "loss": 0.4724,
+      "step": 3
+    },
+    {
+      "epoch": 0.0012598425196850393,
+      "grad_norm": Infinity,
+      "learning_rate": 1e-05,
+      "loss": 0.8548,
+      "step": 4
+    },
+    {
+      "epoch": 0.0015748031496062992,
+      "grad_norm": 241.3546600341797,
+      "learning_rate": 9.99999842519685e-06,
+      "loss": 0.9812,
+      "step": 5
+    },
+    {
+      "epoch": 0.001889763779527559,
+      "grad_norm": 68.62373352050781,
+      "learning_rate": 9.999996850393701e-06,
+      "loss": 0.6851,
+      "step": 6
+    },
+    {
+      "epoch": 0.002204724409448819,
+      "grad_norm": 45.99556350708008,
+      "learning_rate": 9.999995275590552e-06,
+      "loss": 0.7491,
+      "step": 7
+    },
+    {
+      "epoch": 0.0025196850393700786,
+      "grad_norm": 27.5998592376709,
+      "learning_rate": 9.999993700787403e-06,
+      "loss": 0.6691,
+      "step": 8
+    },
+    {
+      "epoch": 0.0028346456692913387,
+      "grad_norm": 53.23931121826172,
+      "learning_rate": 9.999992125984252e-06,
+      "loss": 0.7518,
+      "step": 9
+    },
+    {
+      "epoch": 0.0031496062992125984,
+      "grad_norm": 21.858654022216797,
+      "learning_rate": 9.999990551181103e-06,
+      "loss": 0.6986,
+      "step": 10
+    },
+    {
+      "epoch": 0.0034645669291338585,
+      "grad_norm": 38.456905364990234,
+      "learning_rate": 9.999988976377953e-06,
+      "loss": 0.6296,
+      "step": 11
+    },
+    {
+      "epoch": 0.003779527559055118,
+      "grad_norm": 36.96352005004883,
+      "learning_rate": 9.999987401574804e-06,
+      "loss": 0.6674,
+      "step": 12
+    },
+    {
+      "epoch": 0.004094488188976378,
+      "grad_norm": 50.633941650390625,
+      "learning_rate": 9.999985826771655e-06,
+      "loss": 0.6214,
+      "step": 13
+    },
+    {
+      "epoch": 0.004409448818897638,
+      "grad_norm": 90.81790161132812,
+      "learning_rate": 9.999984251968506e-06,
+      "loss": 0.7667,
+      "step": 14
+    },
+    {
+      "epoch": 0.004724409448818898,
+      "grad_norm": 157.15757751464844,
+      "learning_rate": 9.999982677165355e-06,
+      "loss": 1.0109,
+      "step": 15
+    },
+    {
+      "epoch": 0.005039370078740157,
+      "grad_norm": 57.94607925415039,
+      "learning_rate": 9.999981102362206e-06,
+      "loss": 0.6623,
+      "step": 16
+    },
+    {
+      "epoch": 0.005354330708661417,
+      "grad_norm": 96.26383972167969,
+      "learning_rate": 9.999979527559057e-06,
+      "loss": 0.7511,
+      "step": 17
+    },
+    {
+      "epoch": 0.005669291338582677,
+      "grad_norm": 63.17537307739258,
+      "learning_rate": 9.999977952755906e-06,
+      "loss": 0.7292,
+      "step": 18
+    },
+    {
+      "epoch": 0.005984251968503937,
+      "grad_norm": 28.436891555786133,
+      "learning_rate": 9.999976377952757e-06,
+      "loss": 0.6612,
+      "step": 19
+    },
+    {
+      "epoch": 0.006299212598425197,
+      "grad_norm": 46.60204315185547,
+      "learning_rate": 9.999974803149607e-06,
+      "loss": 0.471,
+      "step": 20
+    },
+    {
+      "epoch": 0.006299212598425197,
+      "eval_loss": 0.6495372653007507,
+      "eval_runtime": 323.1698,
+      "eval_samples_per_second": 0.362,
+      "eval_steps_per_second": 0.362,
+      "step": 20
+    },
+    {
+      "epoch": 0.006614173228346456,
+      "grad_norm": 34.931636810302734,
+      "learning_rate": 9.999973228346457e-06,
+      "loss": 0.6546,
+      "step": 21
+    },
+    {
+      "epoch": 0.006929133858267717,
+      "grad_norm": 37.24106216430664,
+      "learning_rate": 9.999971653543308e-06,
+      "loss": 0.5786,
+      "step": 22
+    },
+    {
+      "epoch": 0.007244094488188977,
+      "grad_norm": 49.713348388671875,
+      "learning_rate": 9.99997007874016e-06,
+      "loss": 0.605,
+      "step": 23
+    },
+    {
+      "epoch": 0.007559055118110236,
+      "grad_norm": 54.44386291503906,
+      "learning_rate": 9.999968503937009e-06,
+      "loss": 0.3443,
+      "step": 24
+    },
+    {
+      "epoch": 0.007874015748031496,
+      "grad_norm": 79.05270385742188,
+      "learning_rate": 9.999966929133858e-06,
+      "loss": 0.6105,
+      "step": 25
+    },
+    {
+      "epoch": 0.008188976377952756,
+      "grad_norm": 77.4219741821289,
+      "learning_rate": 9.999965354330709e-06,
+      "loss": 0.864,
+      "step": 26
+    },
+    {
+      "epoch": 0.008503937007874015,
+      "grad_norm": 137.48190307617188,
+      "learning_rate": 9.99996377952756e-06,
+      "loss": 1.1885,
+      "step": 27
+    },
+    {
+      "epoch": 0.008818897637795276,
+      "grad_norm": 156.0934600830078,
+      "learning_rate": 9.999962204724411e-06,
+      "loss": 1.1261,
+      "step": 28
+    },
+    {
+      "epoch": 0.009133858267716535,
+      "grad_norm": 117.86957550048828,
+      "learning_rate": 9.99996062992126e-06,
+      "loss": 0.6776,
+      "step": 29
+    },
+    {
+      "epoch": 0.009448818897637795,
+      "grad_norm": 107.25189971923828,
+      "learning_rate": 9.999959055118111e-06,
+      "loss": 0.7123,
+      "step": 30
+    },
+    {
+      "epoch": 0.009763779527559056,
+      "grad_norm": 53.983299255371094,
+      "learning_rate": 9.99995748031496e-06,
+      "loss": 0.6213,
+      "step": 31
+    },
+    {
+      "epoch": 0.010078740157480314,
+      "grad_norm": 60.340389251708984,
+      "learning_rate": 9.999955905511812e-06,
+      "loss": 0.7145,
+      "step": 32
+    },
+    {
+      "epoch": 0.010393700787401575,
+      "grad_norm": 76.8556137084961,
+      "learning_rate": 9.999954330708663e-06,
+      "loss": 0.6812,
+      "step": 33
+    },
+    {
+      "epoch": 0.010708661417322834,
+      "grad_norm": 45.225807189941406,
+      "learning_rate": 9.999952755905514e-06,
+      "loss": 0.6361,
+      "step": 34
+    },
+    {
+      "epoch": 0.011023622047244094,
+      "grad_norm": 65.44268798828125,
+      "learning_rate": 9.999951181102363e-06,
+      "loss": 0.6421,
+      "step": 35
+    },
+    {
+      "epoch": 0.011338582677165355,
+      "grad_norm": 42.70692825317383,
+      "learning_rate": 9.999949606299212e-06,
+      "loss": 0.6809,
+      "step": 36
+    },
+    {
+      "epoch": 0.011653543307086614,
+      "grad_norm": 43.51832580566406,
+      "learning_rate": 9.999948031496063e-06,
+      "loss": 0.6612,
+      "step": 37
+    },
+    {
+      "epoch": 0.011968503937007874,
+      "grad_norm": 37.09170913696289,
+      "learning_rate": 9.999946456692914e-06,
+      "loss": 0.7119,
+      "step": 38
+    },
+    {
+      "epoch": 0.012283464566929133,
+      "grad_norm": 29.477069854736328,
+      "learning_rate": 9.999944881889765e-06,
+      "loss": 0.6933,
+      "step": 39
+    },
+    {
+      "epoch": 0.012598425196850394,
+      "grad_norm": 44.96734619140625,
+      "learning_rate": 9.999943307086614e-06,
+      "loss": 0.7328,
+      "step": 40
+    },
+    {
+      "epoch": 0.012598425196850394,
+      "eval_loss": 0.6671837568283081,
+      "eval_runtime": 309.18,
+      "eval_samples_per_second": 0.378,
+      "eval_steps_per_second": 0.378,
+      "step": 40
+    },
+    {
+      "epoch": 0.012913385826771654,
+      "grad_norm": 19.817779541015625,
+      "learning_rate": 9.999941732283465e-06,
+      "loss": 0.6673,
+      "step": 41
+    },
+    {
+      "epoch": 0.013228346456692913,
+      "grad_norm": 23.819435119628906,
+      "learning_rate": 9.999940157480316e-06,
+      "loss": 0.6966,
+      "step": 42
+    },
+    {
+      "epoch": 0.013543307086614173,
+      "grad_norm": 21.729511260986328,
+      "learning_rate": 9.999938582677167e-06,
+      "loss": 0.6791,
+      "step": 43
+    },
+    {
+      "epoch": 0.013858267716535434,
+      "grad_norm": 18.30646514892578,
+      "learning_rate": 9.999937007874017e-06,
+      "loss": 0.6786,
+      "step": 44
+    },
+    {
+      "epoch": 0.014173228346456693,
+      "grad_norm": 11.647773742675781,
+      "learning_rate": 9.999935433070866e-06,
+      "loss": 0.6956,
+      "step": 45
+    },
+    {
+      "epoch": 0.014488188976377953,
+      "grad_norm": 15.509359359741211,
+      "learning_rate": 9.999933858267717e-06,
+      "loss": 0.6616,
+      "step": 46
+    },
+    {
+      "epoch": 0.014803149606299212,
+      "grad_norm": 20.342838287353516,
+      "learning_rate": 9.999932283464568e-06,
+      "loss": 0.6184,
+      "step": 47
+    },
+    {
+      "epoch": 0.015118110236220473,
+      "grad_norm": 15.10333251953125,
+      "learning_rate": 9.999930708661419e-06,
+      "loss": 0.6671,
+      "step": 48
+    },
+    {
+      "epoch": 0.015433070866141733,
+      "grad_norm": 22.806962966918945,
+      "learning_rate": 9.999929133858268e-06,
+      "loss": 0.7039,
+      "step": 49
+    },
+    {
+      "epoch": 0.015748031496062992,
+      "grad_norm": 22.755117416381836,
+      "learning_rate": 9.99992755905512e-06,
+      "loss": 0.6512,
+      "step": 50
+    },
+    {
+      "epoch": 0.016062992125984252,
+      "grad_norm": 30.809261322021484,
+      "learning_rate": 9.999925984251969e-06,
+      "loss": 0.7041,
+      "step": 51
+    },
+    {
+      "epoch": 0.016377952755905513,
+      "grad_norm": 30.779508590698242,
+      "learning_rate": 9.99992440944882e-06,
+      "loss": 0.6321,
+      "step": 52
+    },
+    {
+      "epoch": 0.01669291338582677,
+      "grad_norm": 41.0311279296875,
+      "learning_rate": 9.99992283464567e-06,
+      "loss": 0.7655,
+      "step": 53
+    },
+    {
+      "epoch": 0.01700787401574803,
+      "grad_norm": 38.755794525146484,
+      "learning_rate": 9.999921259842522e-06,
+      "loss": 0.5663,
+      "step": 54
+    },
+    {
+      "epoch": 0.01732283464566929,
+      "grad_norm": 76.18267059326172,
+      "learning_rate": 9.99991968503937e-06,
+      "loss": 0.817,
+      "step": 55
+    },
+    {
+      "epoch": 0.01763779527559055,
+      "grad_norm": 64.29126739501953,
+      "learning_rate": 9.99991811023622e-06,
+      "loss": 0.7239,
+      "step": 56
+    },
+    {
+      "epoch": 0.017952755905511812,
+      "grad_norm": 64.24166107177734,
+      "learning_rate": 9.999916535433071e-06,
+      "loss": 0.5583,
+      "step": 57
+    },
+    {
+      "epoch": 0.01826771653543307,
+      "grad_norm": 35.76716232299805,
+      "learning_rate": 9.999914960629922e-06,
+      "loss": 0.6473,
+      "step": 58
+    },
+    {
+      "epoch": 0.01858267716535433,
+      "grad_norm": 31.879194259643555,
+      "learning_rate": 9.999913385826773e-06,
+      "loss": 0.5875,
+      "step": 59
+    },
+    {
+      "epoch": 0.01889763779527559,
+      "grad_norm": 43.807613372802734,
+      "learning_rate": 9.999911811023622e-06,
+      "loss": 0.6888,
+      "step": 60
+    },
+    {
+      "epoch": 0.01889763779527559,
+      "eval_loss": 0.6864338517189026,
+      "eval_runtime": 308.0649,
+      "eval_samples_per_second": 0.38,
+      "eval_steps_per_second": 0.38,
+      "step": 60
+    },
+    {
+      "epoch": 0.01921259842519685,
+      "grad_norm": 68.39881896972656,
+      "learning_rate": 9.999910236220473e-06,
+      "loss": 0.7587,
+      "step": 61
+    },
+    {
+      "epoch": 0.01952755905511811,
+      "grad_norm": 43.71537780761719,
+      "learning_rate": 9.999908661417323e-06,
+      "loss": 0.6054,
+      "step": 62
+    },
+    {
+      "epoch": 0.01984251968503937,
+      "grad_norm": 38.36960220336914,
+      "learning_rate": 9.999907086614175e-06,
+      "loss": 0.527,
+      "step": 63
+    },
+    {
+      "epoch": 0.02015748031496063,
+      "grad_norm": 53.25741958618164,
+      "learning_rate": 9.999905511811025e-06,
+      "loss": 0.8724,
+      "step": 64
+    },
+    {
+      "epoch": 0.02047244094488189,
+      "grad_norm": 38.55160140991211,
+      "learning_rate": 9.999903937007874e-06,
+      "loss": 0.6693,
+      "step": 65
+    },
+    {
+      "epoch": 0.02078740157480315,
+      "grad_norm": 36.05056381225586,
+      "learning_rate": 9.999902362204725e-06,
+      "loss": 0.6024,
+      "step": 66
+    },
+    {
+      "epoch": 0.02110236220472441,
+      "grad_norm": 83.4520492553711,
+      "learning_rate": 9.999900787401576e-06,
+      "loss": 0.6391,
+      "step": 67
+    },
+    {
+      "epoch": 0.021417322834645668,
+      "grad_norm": 23.22808837890625,
+      "learning_rate": 9.999899212598427e-06,
+      "loss": 0.6275,
+      "step": 68
+    },
+    {
+      "epoch": 0.021732283464566928,
+      "grad_norm": 42.370445251464844,
+      "learning_rate": 9.999897637795276e-06,
+      "loss": 0.3825,
+      "step": 69
+    },
+    {
+      "epoch": 0.02204724409448819,
+      "grad_norm": 38.64667892456055,
+      "learning_rate": 9.999896062992127e-06,
+      "loss": 0.6354,
+      "step": 70
+    },
+    {
+      "epoch": 0.02236220472440945,
+      "grad_norm": 44.61943054199219,
+      "learning_rate": 9.999894488188977e-06,
+      "loss": 0.6041,
+      "step": 71
+    },
+    {
+      "epoch": 0.02267716535433071,
+      "grad_norm": 36.52523422241211,
+      "learning_rate": 9.999892913385828e-06,
+      "loss": 0.5188,
+      "step": 72
+    },
+    {
+      "epoch": 0.022992125984251967,
+      "grad_norm": 38.4240608215332,
+      "learning_rate": 9.999891338582679e-06,
+      "loss": 0.612,
+      "step": 73
+    },
+    {
+      "epoch": 0.023307086614173227,
+      "grad_norm": 112.46929168701172,
+      "learning_rate": 9.99988976377953e-06,
+      "loss": 0.9316,
+      "step": 74
+    },
+    {
+      "epoch": 0.023622047244094488,
+      "grad_norm": 91.35350799560547,
+      "learning_rate": 9.999888188976379e-06,
+      "loss": 0.4754,
+      "step": 75
+    },
+    {
+      "epoch": 0.02393700787401575,
+      "grad_norm": 136.8651123046875,
+      "learning_rate": 9.999886614173228e-06,
+      "loss": 0.8443,
+      "step": 76
+    },
+    {
+      "epoch": 0.02425196850393701,
+      "grad_norm": 64.04878997802734,
+      "learning_rate": 9.999885039370079e-06,
+      "loss": 0.5659,
+      "step": 77
+    },
+    {
+      "epoch": 0.024566929133858266,
+      "grad_norm": 127.41741180419922,
+      "learning_rate": 9.99988346456693e-06,
+      "loss": 0.5924,
+      "step": 78
+    },
+    {
+      "epoch": 0.024881889763779527,
+      "grad_norm": 88.72442626953125,
+      "learning_rate": 9.999881889763781e-06,
+      "loss": 0.6118,
+      "step": 79
+    },
+    {
+      "epoch": 0.025196850393700787,
+      "grad_norm": 91.45403289794922,
+      "learning_rate": 9.99988031496063e-06,
+      "loss": 0.7566,
+      "step": 80
+    },
+    {
+      "epoch": 0.025196850393700787,
+      "eval_loss": 0.620968222618103,
+      "eval_runtime": 308.6891,
+      "eval_samples_per_second": 0.379,
+      "eval_steps_per_second": 0.379,
+      "step": 80
+    },
+    {
+      "epoch": 0.025511811023622048,
+      "grad_norm": 40.077823638916016,
+      "learning_rate": 9.999878740157481e-06,
+      "loss": 0.3255,
+      "step": 81
+    },
+    {
+      "epoch": 0.025826771653543308,
+      "grad_norm": 86.21344757080078,
+      "learning_rate": 9.99987716535433e-06,
+      "loss": 0.5621,
+      "step": 82
+    },
+    {
+      "epoch": 0.02614173228346457,
+      "grad_norm": 102.60726165771484,
+      "learning_rate": 9.999875590551182e-06,
+      "loss": 0.8275,
+      "step": 83
+    },
+    {
+      "epoch": 0.026456692913385826,
+      "grad_norm": 118.94241333007812,
+      "learning_rate": 9.999874015748033e-06,
+      "loss": 0.5316,
+      "step": 84
+    },
+    {
+      "epoch": 0.026771653543307086,
+      "grad_norm": 44.944576263427734,
+      "learning_rate": 9.999872440944882e-06,
+      "loss": 0.6057,
+      "step": 85
+    },
+    {
+      "epoch": 0.027086614173228347,
+      "grad_norm": 43.299503326416016,
+      "learning_rate": 9.999870866141733e-06,
+      "loss": 0.3967,
+      "step": 86
+    },
+    {
+      "epoch": 0.027401574803149607,
+      "grad_norm": 67.91696166992188,
+      "learning_rate": 9.999869291338584e-06,
+      "loss": 0.5094,
+      "step": 87
+    },
+    {
+      "epoch": 0.027716535433070868,
+      "grad_norm": 54.101783752441406,
+      "learning_rate": 9.999867716535435e-06,
+      "loss": 0.7315,
+      "step": 88
+    },
+    {
+      "epoch": 0.028031496062992125,
+      "grad_norm": 72.56822204589844,
+      "learning_rate": 9.999866141732284e-06,
+      "loss": 0.7512,
+      "step": 89
+    },
+    {
+      "epoch": 0.028346456692913385,
+      "grad_norm": 74.34241485595703,
+      "learning_rate": 9.999864566929135e-06,
+      "loss": 0.6363,
+      "step": 90
+    },
+    {
+      "epoch": 0.028661417322834646,
+      "grad_norm": 63.87611770629883,
+      "learning_rate": 9.999862992125984e-06,
+      "loss": 0.7365,
+      "step": 91
+    },
+    {
+      "epoch": 0.028976377952755906,
+      "grad_norm": 90.9892807006836,
+      "learning_rate": 9.999861417322835e-06,
+      "loss": 0.8111,
+      "step": 92
+    },
+    {
+      "epoch": 0.029291338582677167,
+      "grad_norm": 51.74814987182617,
+      "learning_rate": 9.999859842519686e-06,
+      "loss": 0.4766,
+      "step": 93
+    },
+    {
+      "epoch": 0.029606299212598424,
+      "grad_norm": 48.99016571044922,
+      "learning_rate": 9.999858267716537e-06,
+      "loss": 0.4001,
+      "step": 94
+    },
+    {
+      "epoch": 0.029921259842519685,
+      "grad_norm": 35.5272216796875,
+      "learning_rate": 9.999856692913387e-06,
+      "loss": 0.4208,
+      "step": 95
+    },
+    {
+      "epoch": 0.030236220472440945,
+      "grad_norm": 53.10519790649414,
+      "learning_rate": 9.999855118110236e-06,
+      "loss": 0.7011,
+      "step": 96
+    },
+    {
+      "epoch": 0.030551181102362206,
+      "grad_norm": 51.19492721557617,
+      "learning_rate": 9.999853543307087e-06,
+      "loss": 0.5737,
+      "step": 97
+    },
+    {
+      "epoch": 0.030866141732283466,
+      "grad_norm": 40.27799987792969,
+      "learning_rate": 9.999851968503938e-06,
+      "loss": 0.4889,
+      "step": 98
+    },
+    {
+      "epoch": 0.031181102362204723,
+      "grad_norm": 53.10594940185547,
+      "learning_rate": 9.999850393700789e-06,
+      "loss": 0.2341,
+      "step": 99
+    },
+    {
+      "epoch": 0.031496062992125984,
+      "grad_norm": 64.59747314453125,
+      "learning_rate": 9.999848818897638e-06,
+      "loss": 0.5309,
+      "step": 100
+    },
+    {
+      "epoch": 0.031496062992125984,
+      "eval_loss": 0.6238653659820557,
+      "eval_runtime": 308.8997,
+      "eval_samples_per_second": 0.379,
+      "eval_steps_per_second": 0.379,
+      "step": 100
+    },
+    {
+      "epoch": 0.03181102362204724,
+      "grad_norm": 56.20338821411133,
+      "learning_rate": 9.99984724409449e-06,
+      "loss": 0.5554,
+      "step": 101
+    },
+    {
+      "epoch": 0.032125984251968505,
+      "grad_norm": 78.87137603759766,
+      "learning_rate": 9.999845669291339e-06,
+      "loss": 0.6822,
+      "step": 102
+    },
+    {
+      "epoch": 0.03244094488188976,
+      "grad_norm": 76.1572036743164,
+      "learning_rate": 9.99984409448819e-06,
+      "loss": 0.4211,
+      "step": 103
+    },
+    {
+      "epoch": 0.032755905511811026,
+      "grad_norm": 53.714942932128906,
+      "learning_rate": 9.99984251968504e-06,
+      "loss": 0.2694,
+      "step": 104
+    },
+    {
+      "epoch": 0.03307086614173228,
+      "grad_norm": 100.88641357421875,
+      "learning_rate": 9.99984094488189e-06,
+      "loss": 0.6587,
+      "step": 105
+    },
+    {
+      "epoch": 0.03338582677165354,
+      "grad_norm": 59.46120071411133,
+      "learning_rate": 9.999839370078741e-06,
+      "loss": 0.3308,
+      "step": 106
+    },
+    {
+      "epoch": 0.033700787401574804,
+      "grad_norm": 142.22496032714844,
+      "learning_rate": 9.99983779527559e-06,
+      "loss": 0.9014,
+      "step": 107
+    },
+    {
+      "epoch": 0.03401574803149606,
+      "grad_norm": 116.70782470703125,
+      "learning_rate": 9.999836220472441e-06,
+      "loss": 0.7636,
+      "step": 108
+    },
+    {
+      "epoch": 0.034330708661417325,
+      "grad_norm": 122.21369171142578,
+      "learning_rate": 9.999834645669292e-06,
+      "loss": 0.6502,
+      "step": 109
+    },
+    {
+      "epoch": 0.03464566929133858,
+      "grad_norm": 146.69210815429688,
+      "learning_rate": 9.999833070866143e-06,
+      "loss": 1.088,
+      "step": 110
+    },
+    {
+      "epoch": 0.03496062992125984,
+      "grad_norm": 90.0801010131836,
+      "learning_rate": 9.999831496062992e-06,
+      "loss": 0.5323,
+      "step": 111
+    },
+    {
+      "epoch": 0.0352755905511811,
+      "grad_norm": 64.70466613769531,
+      "learning_rate": 9.999829921259843e-06,
+      "loss": 0.4768,
+      "step": 112
+    },
+    {
+      "epoch": 0.03559055118110236,
+      "grad_norm": 43.21613311767578,
+      "learning_rate": 9.999828346456694e-06,
+      "loss": 0.5867,
+      "step": 113
+    },
+    {
+      "epoch": 0.035905511811023624,
+      "grad_norm": 98.97393798828125,
+      "learning_rate": 9.999826771653545e-06,
+      "loss": 0.9022,
+      "step": 114
+    },
+    {
+      "epoch": 0.03622047244094488,
+      "grad_norm": 49.0715446472168,
+      "learning_rate": 9.999825196850395e-06,
+      "loss": 0.4157,
+      "step": 115
+    },
+    {
+      "epoch": 0.03653543307086614,
+      "grad_norm": 49.2851676940918,
+      "learning_rate": 9.999823622047244e-06,
+      "loss": 0.4487,
+      "step": 116
+    },
+    {
+      "epoch": 0.0368503937007874,
+      "grad_norm": 37.42869567871094,
+      "learning_rate": 9.999822047244095e-06,
+      "loss": 0.576,
+      "step": 117
+    },
+    {
+      "epoch": 0.03716535433070866,
+      "grad_norm": 43.0858154296875,
+      "learning_rate": 9.999820472440946e-06,
+      "loss": 0.6744,
+      "step": 118
+    },
+    {
+      "epoch": 0.037480314960629924,
+      "grad_norm": 51.691558837890625,
+      "learning_rate": 9.999818897637797e-06,
+      "loss": 0.4399,
+      "step": 119
+    },
+    {
+      "epoch": 0.03779527559055118,
+      "grad_norm": 48.11525344848633,
+      "learning_rate": 9.999817322834646e-06,
+      "loss": 0.7208,
+      "step": 120
+    },
+    {
+      "epoch": 0.03779527559055118,
+      "eval_loss": 0.6145237684249878,
+      "eval_runtime": 297.9679,
+      "eval_samples_per_second": 0.393,
+      "eval_steps_per_second": 0.393,
+      "step": 120
+    },
+    {
+      "epoch": 0.03811023622047244,
+      "grad_norm": 59.46645736694336,
+      "learning_rate": 9.999815748031497e-06,
+      "loss": 0.6338,
+      "step": 121
+    },
+    {
+      "epoch": 0.0384251968503937,
+      "grad_norm": 41.5179443359375,
+      "learning_rate": 9.999814173228347e-06,
+      "loss": 0.6118,
+      "step": 122
+    },
+    {
+      "epoch": 0.03874015748031496,
+      "grad_norm": 30.39054298400879,
+      "learning_rate": 9.999812598425198e-06,
+      "loss": 0.589,
+      "step": 123
+    },
+    {
+      "epoch": 0.03905511811023622,
+      "grad_norm": 47.73324966430664,
+      "learning_rate": 9.999811023622049e-06,
+      "loss": 0.4502,
+      "step": 124
+    },
+    {
+      "epoch": 0.03937007874015748,
+      "grad_norm": 63.116180419921875,
+      "learning_rate": 9.999809448818898e-06,
+      "loss": 0.5543,
+      "step": 125
+    },
+    {
+      "epoch": 0.03968503937007874,
+      "grad_norm": 48.51982879638672,
+      "learning_rate": 9.999807874015749e-06,
+      "loss": 0.3345,
+      "step": 126
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 70.61180114746094,
+      "learning_rate": 9.999806299212598e-06,
+      "loss": 0.408,
+      "step": 127
+    },
+    {
+      "epoch": 0.04031496062992126,
+      "grad_norm": 100.2572021484375,
+      "learning_rate": 9.999804724409449e-06,
+      "loss": 0.5952,
+      "step": 128
+    },
+    {
+      "epoch": 0.04062992125984252,
+      "grad_norm": 120.8499984741211,
+      "learning_rate": 9.9998031496063e-06,
+      "loss": 0.5751,
+      "step": 129
+    },
+    {
+      "epoch": 0.04094488188976378,
+      "grad_norm": 116.19609069824219,
+      "learning_rate": 9.999801574803151e-06,
+      "loss": 0.9086,
+      "step": 130
+    },
+    {
+      "epoch": 0.041259842519685036,
+      "grad_norm": 203.10231018066406,
+      "learning_rate": 9.9998e-06,
+      "loss": 0.907,
+      "step": 131
+    },
+    {
+      "epoch": 0.0415748031496063,
+      "grad_norm": 90.6951904296875,
+      "learning_rate": 9.999798425196851e-06,
+      "loss": 0.4064,
+      "step": 132
+    },
+    {
+      "epoch": 0.04188976377952756,
+      "grad_norm": 67.41472625732422,
+      "learning_rate": 9.999796850393702e-06,
+      "loss": 0.4828,
+      "step": 133
+    },
+    {
+      "epoch": 0.04220472440944882,
+      "grad_norm": 137.20547485351562,
+      "learning_rate": 9.999795275590553e-06,
+      "loss": 0.7428,
+      "step": 134
+    },
+    {
+      "epoch": 0.04251968503937008,
+      "grad_norm": 290.8530578613281,
+      "learning_rate": 9.999793700787403e-06,
+      "loss": 0.8161,
+      "step": 135
+    },
+    {
+      "epoch": 0.042834645669291335,
+      "grad_norm": 149.33602905273438,
+      "learning_rate": 9.999792125984252e-06,
+      "loss": 0.5009,
+      "step": 136
+    },
+    {
+      "epoch": 0.0431496062992126,
+      "grad_norm": 114.50115203857422,
+      "learning_rate": 9.999790551181103e-06,
+      "loss": 1.0042,
+      "step": 137
+    },
+    {
+      "epoch": 0.043464566929133856,
+      "grad_norm": 101.2292251586914,
+      "learning_rate": 9.999788976377954e-06,
+      "loss": 0.4708,
+      "step": 138
+    },
+    {
+      "epoch": 0.04377952755905512,
+      "grad_norm": 91.65269470214844,
+      "learning_rate": 9.999787401574805e-06,
+      "loss": 0.6723,
+      "step": 139
+    },
+    {
+      "epoch": 0.04409448818897638,
+      "grad_norm": 54.82379913330078,
+      "learning_rate": 9.999785826771654e-06,
+      "loss": 0.4811,
+      "step": 140
+    },
+    {
+      "epoch": 0.04409448818897638,
+      "eval_loss": 0.5761768221855164,
+      "eval_runtime": 296.7123,
+      "eval_samples_per_second": 0.394,
+      "eval_steps_per_second": 0.394,
+      "step": 140
+    },
+    {
+      "epoch": 0.044409448818897634,
+      "grad_norm": 30.785768508911133,
+      "learning_rate": 9.999784251968505e-06,
+      "loss": 0.2963,
+      "step": 141
+    },
+    {
+      "epoch": 0.0447244094488189,
+      "grad_norm": 44.203250885009766,
+      "learning_rate": 9.999782677165354e-06,
+      "loss": 0.6864,
+      "step": 142
+    },
+    {
+      "epoch": 0.045039370078740155,
+      "grad_norm": 67.31315612792969,
+      "learning_rate": 9.999781102362205e-06,
+      "loss": 0.6552,
+      "step": 143
+    },
+    {
+      "epoch": 0.04535433070866142,
+      "grad_norm": 36.43077087402344,
+      "learning_rate": 9.999779527559056e-06,
+      "loss": 0.6163,
+      "step": 144
+    },
+    {
+      "epoch": 0.04566929133858268,
+      "grad_norm": 81.87039184570312,
+      "learning_rate": 9.999777952755906e-06,
+      "loss": 1.0169,
+      "step": 145
+    },
+    {
+      "epoch": 0.045984251968503934,
+      "grad_norm": 32.918399810791016,
+      "learning_rate": 9.999776377952757e-06,
+      "loss": 0.4143,
+      "step": 146
+    },
+    {
+      "epoch": 0.0462992125984252,
+      "grad_norm": 40.705284118652344,
+      "learning_rate": 9.999774803149606e-06,
+      "loss": 0.5362,
+      "step": 147
+    },
+    {
+      "epoch": 0.046614173228346455,
+      "grad_norm": 25.83769416809082,
+      "learning_rate": 9.999773228346457e-06,
+      "loss": 0.4726,
+      "step": 148
+    },
+    {
+      "epoch": 0.04692913385826772,
+      "grad_norm": 51.383758544921875,
+      "learning_rate": 9.999771653543308e-06,
+      "loss": 0.6378,
+      "step": 149
+    },
+    {
+      "epoch": 0.047244094488188976,
+      "grad_norm": 59.23312759399414,
+      "learning_rate": 9.999770078740159e-06,
+      "loss": 0.4571,
+      "step": 150
+    },
+    {
+      "epoch": 0.04755905511811023,
+      "grad_norm": 32.09741973876953,
+      "learning_rate": 9.999768503937008e-06,
+      "loss": 0.3173,
+      "step": 151
+    },
+    {
+      "epoch": 0.0478740157480315,
+      "grad_norm": 40.37042999267578,
+      "learning_rate": 9.99976692913386e-06,
+      "loss": 0.6127,
+      "step": 152
+    },
+    {
+      "epoch": 0.048188976377952754,
+      "grad_norm": 50.806793212890625,
+      "learning_rate": 9.999765354330709e-06,
+      "loss": 0.6617,
+      "step": 153
+    },
+    {
+      "epoch": 0.04850393700787402,
+      "grad_norm": 42.13128662109375,
+      "learning_rate": 9.99976377952756e-06,
+      "loss": 0.4311,
+      "step": 154
+    },
+    {
+      "epoch": 0.048818897637795275,
+      "grad_norm": 51.54093933105469,
+      "learning_rate": 9.99976220472441e-06,
+      "loss": 0.7501,
+      "step": 155
+    },
+    {
+      "epoch": 0.04913385826771653,
+      "grad_norm": 71.40542602539062,
+      "learning_rate": 9.99976062992126e-06,
+      "loss": 1.0623,
+      "step": 156
+    },
+    {
+      "epoch": 0.049448818897637796,
+      "grad_norm": 23.955883026123047,
+      "learning_rate": 9.999759055118111e-06,
+      "loss": 0.118,
+      "step": 157
+    },
+    {
+      "epoch": 0.04976377952755905,
+      "grad_norm": 46.93206024169922,
+      "learning_rate": 9.999757480314962e-06,
+      "loss": 0.6759,
+      "step": 158
+    },
+    {
+      "epoch": 0.05007874015748032,
+      "grad_norm": 40.86898422241211,
+      "learning_rate": 9.999755905511813e-06,
+      "loss": 0.5849,
+      "step": 159
+    },
+    {
+      "epoch": 0.050393700787401574,
+      "grad_norm": 54.818450927734375,
+      "learning_rate": 9.999754330708662e-06,
+      "loss": 0.4963,
+      "step": 160
+    },
+    {
+      "epoch": 0.050393700787401574,
+      "eval_loss": 0.646256148815155,
+      "eval_runtime": 299.1934,
+      "eval_samples_per_second": 0.391,
+      "eval_steps_per_second": 0.391,
+      "step": 160
+    },
+    {
+      "epoch": 0.05070866141732284,
+      "grad_norm": 39.67280578613281,
+      "learning_rate": 9.999752755905513e-06,
+      "loss": 0.6,
+      "step": 161
+    },
+    {
+      "epoch": 0.051023622047244095,
+      "grad_norm": 44.99142074584961,
+      "learning_rate": 9.999751181102362e-06,
+      "loss": 0.4204,
+      "step": 162
+    },
+    {
+      "epoch": 0.05133858267716535,
+      "grad_norm": 47.1932373046875,
+      "learning_rate": 9.999749606299213e-06,
+      "loss": 0.5264,
+      "step": 163
+    },
+    {
+      "epoch": 0.051653543307086616,
+      "grad_norm": 59.98406219482422,
+      "learning_rate": 9.999748031496064e-06,
+      "loss": 0.5378,
+      "step": 164
+    },
+    {
+      "epoch": 0.05196850393700787,
+      "grad_norm": 54.76002883911133,
+      "learning_rate": 9.999746456692914e-06,
+      "loss": 0.4653,
+      "step": 165
+    },
+    {
+      "epoch": 0.05228346456692914,
+      "grad_norm": 65.97516632080078,
+      "learning_rate": 9.999744881889765e-06,
+      "loss": 0.493,
+      "step": 166
+    },
+    {
+      "epoch": 0.052598425196850394,
+      "grad_norm": 74.50453186035156,
+      "learning_rate": 9.999743307086614e-06,
+      "loss": 0.5024,
+      "step": 167
+    },
+    {
+      "epoch": 0.05291338582677165,
+      "grad_norm": 79.50423431396484,
+      "learning_rate": 9.999741732283465e-06,
+      "loss": 0.683,
+      "step": 168
+    },
+    {
+      "epoch": 0.053228346456692915,
+      "grad_norm": 30.747211456298828,
+      "learning_rate": 9.999740157480316e-06,
+      "loss": 0.1244,
+      "step": 169
+    },
+    {
+      "epoch": 0.05354330708661417,
+      "grad_norm": 127.78273010253906,
+      "learning_rate": 9.999738582677167e-06,
+      "loss": 1.0797,
+      "step": 170
+    },
+    {
+      "epoch": 0.053858267716535436,
+      "grad_norm": 33.981021881103516,
+      "learning_rate": 9.999737007874016e-06,
+      "loss": 0.1601,
+      "step": 171
+    },
+    {
+      "epoch": 0.054173228346456694,
+      "grad_norm": 89.67857360839844,
+      "learning_rate": 9.999735433070867e-06,
+      "loss": 0.5669,
+      "step": 172
+    },
+    {
+      "epoch": 0.05448818897637795,
+      "grad_norm": 44.71755599975586,
+      "learning_rate": 9.999733858267717e-06,
+      "loss": 0.2556,
+      "step": 173
+    },
+    {
+      "epoch": 0.054803149606299215,
+      "grad_norm": 63.42751693725586,
+      "learning_rate": 9.999732283464568e-06,
+      "loss": 0.353,
+      "step": 174
+    },
+    {
+      "epoch": 0.05511811023622047,
+      "grad_norm": 145.7510986328125,
+      "learning_rate": 9.999730708661419e-06,
+      "loss": 0.6861,
+      "step": 175
+    },
+    {
+      "epoch": 0.055433070866141736,
+      "grad_norm": 86.73828125,
+      "learning_rate": 9.999729133858268e-06,
+      "loss": 0.7471,
+      "step": 176
+    },
+    {
+      "epoch": 0.05574803149606299,
+      "grad_norm": 78.96038055419922,
+      "learning_rate": 9.999727559055119e-06,
+      "loss": 0.5997,
+      "step": 177
+    },
+    {
+      "epoch": 0.05606299212598425,
+      "grad_norm": 77.76657104492188,
+      "learning_rate": 9.999725984251968e-06,
+      "loss": 0.4139,
+      "step": 178
+    },
+    {
+      "epoch": 0.056377952755905514,
+      "grad_norm": 56.66273880004883,
+      "learning_rate": 9.999724409448819e-06,
+      "loss": 0.3804,
+      "step": 179
+    },
+    {
+      "epoch": 0.05669291338582677,
+      "grad_norm": 45.65275192260742,
+      "learning_rate": 9.99972283464567e-06,
+      "loss": 0.2486,
+      "step": 180
+    },
+    {
+      "epoch": 0.05669291338582677,
+      "eval_loss": 0.6303219795227051,
+      "eval_runtime": 297.0224,
+      "eval_samples_per_second": 0.394,
+      "eval_steps_per_second": 0.394,
+      "step": 180
+    },
+    {
+      "epoch": 0.057007874015748035,
+      "grad_norm": 110.07376098632812,
+      "learning_rate": 9.999721259842521e-06,
+      "loss": 0.9565,
+      "step": 181
+    },
+    {
+      "epoch": 0.05732283464566929,
+      "grad_norm": 83.31719970703125,
+      "learning_rate": 9.99971968503937e-06,
+      "loss": 0.7403,
+      "step": 182
+    },
+    {
+      "epoch": 0.05763779527559055,
+      "grad_norm": 74.98153686523438,
+      "learning_rate": 9.999718110236221e-06,
+      "loss": 0.9278,
+      "step": 183
+    },
+    {
+      "epoch": 0.05795275590551181,
+      "grad_norm": 86.86173248291016,
+      "learning_rate": 9.999716535433072e-06,
+      "loss": 0.5827,
+      "step": 184
+    },
+    {
+      "epoch": 0.05826771653543307,
+      "grad_norm": 33.1976318359375,
+      "learning_rate": 9.999714960629922e-06,
+      "loss": 0.3653,
+      "step": 185
+    },
+    {
+      "epoch": 0.058582677165354334,
+      "grad_norm": 80.26583862304688,
+      "learning_rate": 9.999713385826773e-06,
+      "loss": 0.4716,
+      "step": 186
+    },
+    {
+      "epoch": 0.05889763779527559,
+      "grad_norm": 58.74989700317383,
+      "learning_rate": 9.999711811023622e-06,
+      "loss": 0.3977,
+      "step": 187
+    },
+    {
+      "epoch": 0.05921259842519685,
+      "grad_norm": 75.13705444335938,
+      "learning_rate": 9.999710236220473e-06,
+      "loss": 0.595,
+      "step": 188
+    },
+    {
+      "epoch": 0.05952755905511811,
+      "grad_norm": 44.060882568359375,
+      "learning_rate": 9.999708661417324e-06,
+      "loss": 0.3714,
+      "step": 189
+    },
+    {
+      "epoch": 0.05984251968503937,
+      "grad_norm": 35.92017364501953,
+      "learning_rate": 9.999707086614175e-06,
+      "loss": 0.4465,
+      "step": 190
+    },
+    {
+      "epoch": 0.06015748031496063,
+      "grad_norm": 35.966800689697266,
+      "learning_rate": 9.999705511811024e-06,
+      "loss": 0.4875,
+      "step": 191
+    },
+    {
+      "epoch": 0.06047244094488189,
+      "grad_norm": 48.5458869934082,
+      "learning_rate": 9.999703937007875e-06,
+      "loss": 0.6907,
+      "step": 192
+    },
+    {
+      "epoch": 0.06078740157480315,
+      "grad_norm": 38.40484619140625,
+      "learning_rate": 9.999702362204725e-06,
+      "loss": 0.3482,
+      "step": 193
+    },
+    {
+      "epoch": 0.06110236220472441,
+      "grad_norm": 87.23228454589844,
+      "learning_rate": 9.999700787401576e-06,
+      "loss": 0.4824,
+      "step": 194
+    },
+    {
+      "epoch": 0.06141732283464567,
+      "grad_norm": 62.46897888183594,
+      "learning_rate": 9.999699212598427e-06,
+      "loss": 0.3294,
+      "step": 195
+    },
+    {
+      "epoch": 0.06173228346456693,
+      "grad_norm": 33.665218353271484,
+      "learning_rate": 9.999697637795276e-06,
+      "loss": 0.222,
+      "step": 196
+    },
+    {
+      "epoch": 0.06204724409448819,
+      "grad_norm": 91.13434600830078,
+      "learning_rate": 9.999696062992127e-06,
+      "loss": 0.9194,
+      "step": 197
+    },
+    {
+      "epoch": 0.06236220472440945,
+      "grad_norm": 31.107872009277344,
+      "learning_rate": 9.999694488188976e-06,
+      "loss": 0.2354,
+      "step": 198
+    },
+    {
+      "epoch": 0.06267716535433071,
+      "grad_norm": 99.7812728881836,
+      "learning_rate": 9.999692913385827e-06,
+      "loss": 0.6351,
+      "step": 199
+    },
+    {
+      "epoch": 0.06299212598425197,
+      "grad_norm": 41.42717361450195,
+      "learning_rate": 9.999691338582678e-06,
+      "loss": 0.1414,
+      "step": 200
+    },
+    {
+      "epoch": 0.06299212598425197,
+      "eval_loss": 0.6573231220245361,
+      "eval_runtime": 302.1543,
+      "eval_samples_per_second": 0.387,
+      "eval_steps_per_second": 0.387,
+      "step": 200
+    },
+    {
+      "epoch": 0.06330708661417322,
+      "grad_norm": 61.020408630371094,
+      "learning_rate": 9.999689763779529e-06,
+      "loss": 0.6242,
+      "step": 201
+    },
+    {
+      "epoch": 0.06362204724409448,
+      "grad_norm": 70.84980010986328,
+      "learning_rate": 9.999688188976378e-06,
+      "loss": 0.1907,
+      "step": 202
+    },
+    {
+      "epoch": 0.06393700787401575,
+      "grad_norm": 174.39080810546875,
+      "learning_rate": 9.99968661417323e-06,
+      "loss": 0.9131,
+      "step": 203
+    },
+    {
+      "epoch": 0.06425196850393701,
+      "grad_norm": 47.28941345214844,
+      "learning_rate": 9.99968503937008e-06,
+      "loss": 0.2425,
+      "step": 204
+    },
+    {
+      "epoch": 0.06456692913385827,
+      "grad_norm": 51.628211975097656,
+      "learning_rate": 9.99968346456693e-06,
+      "loss": 0.2873,
+      "step": 205
+    },
+    {
+      "epoch": 0.06488188976377952,
+      "grad_norm": 63.0713996887207,
+      "learning_rate": 9.99968188976378e-06,
+      "loss": 0.6194,
+      "step": 206
+    },
+    {
+      "epoch": 0.06519685039370078,
+      "grad_norm": 164.83543395996094,
+      "learning_rate": 9.99968031496063e-06,
+      "loss": 0.5043,
+      "step": 207
+    },
+    {
+      "epoch": 0.06551181102362205,
+      "grad_norm": 83.96135711669922,
+      "learning_rate": 9.999678740157481e-06,
+      "loss": 0.6309,
+      "step": 208
+    },
+    {
+      "epoch": 0.06582677165354331,
+      "grad_norm": 81.93275451660156,
+      "learning_rate": 9.999677165354332e-06,
+      "loss": 0.4175,
+      "step": 209
+    },
+    {
+      "epoch": 0.06614173228346457,
+      "grad_norm": 129.6193389892578,
+      "learning_rate": 9.999675590551183e-06,
+      "loss": 0.5378,
+      "step": 210
+    },
+    {
+      "epoch": 0.06645669291338582,
+      "grad_norm": 137.78428649902344,
+      "learning_rate": 9.999674015748032e-06,
+      "loss": 1.2212,
+      "step": 211
+    },
+    {
+      "epoch": 0.06677165354330708,
+      "grad_norm": 42.24091339111328,
+      "learning_rate": 9.999672440944883e-06,
+      "loss": 0.173,
+      "step": 212
+    },
+    {
+      "epoch": 0.06708661417322835,
+      "grad_norm": 68.79737091064453,
+      "learning_rate": 9.999670866141732e-06,
+      "loss": 0.1975,
+      "step": 213
+    },
+    {
+      "epoch": 0.06740157480314961,
+      "grad_norm": 125.35755920410156,
+      "learning_rate": 9.999669291338583e-06,
+      "loss": 0.7453,
+      "step": 214
+    },
+    {
+      "epoch": 0.06771653543307087,
+      "grad_norm": 56.881229400634766,
+      "learning_rate": 9.999667716535434e-06,
+      "loss": 0.3073,
+      "step": 215
+    },
+    {
+      "epoch": 0.06803149606299212,
+      "grad_norm": 121.92823028564453,
+      "learning_rate": 9.999666141732284e-06,
+      "loss": 1.0795,
+      "step": 216
+    },
+    {
+      "epoch": 0.06834645669291338,
+      "grad_norm": 44.86691665649414,
+      "learning_rate": 9.999664566929135e-06,
+      "loss": 0.4312,
+      "step": 217
+    },
+    {
+      "epoch": 0.06866141732283465,
+      "grad_norm": 142.288330078125,
+      "learning_rate": 9.999662992125984e-06,
+      "loss": 0.6631,
+      "step": 218
+    },
+    {
+      "epoch": 0.06897637795275591,
+      "grad_norm": 200.1629180908203,
+      "learning_rate": 9.999661417322835e-06,
+      "loss": 0.5105,
+      "step": 219
+    },
+    {
+      "epoch": 0.06929133858267716,
+      "grad_norm": 83.08853149414062,
+      "learning_rate": 9.999659842519686e-06,
+      "loss": 0.6537,
+      "step": 220
+    },
+    {
+      "epoch": 0.06929133858267716,
+      "eval_loss": 0.7070333957672119,
+      "eval_runtime": 307.2171,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 220
+    },
+    {
+      "epoch": 0.06960629921259842,
+      "grad_norm": 131.77316284179688,
+      "learning_rate": 9.999658267716537e-06,
+      "loss": 0.7263,
+      "step": 221
+    },
+    {
+      "epoch": 0.06992125984251968,
+      "grad_norm": 103.4114761352539,
+      "learning_rate": 9.999656692913386e-06,
+      "loss": 1.0884,
+      "step": 222
+    },
+    {
+      "epoch": 0.07023622047244095,
+      "grad_norm": 68.44525909423828,
+      "learning_rate": 9.999655118110237e-06,
+      "loss": 0.6249,
+      "step": 223
+    },
+    {
+      "epoch": 0.0705511811023622,
+      "grad_norm": 61.6135139465332,
+      "learning_rate": 9.999653543307087e-06,
+      "loss": 0.5604,
+      "step": 224
+    },
+    {
+      "epoch": 0.07086614173228346,
+      "grad_norm": 86.59762573242188,
+      "learning_rate": 9.999651968503938e-06,
+      "loss": 0.5263,
+      "step": 225
+    },
+    {
+      "epoch": 0.07118110236220472,
+      "grad_norm": 42.36429214477539,
+      "learning_rate": 9.999650393700789e-06,
+      "loss": 0.4644,
+      "step": 226
+    },
+    {
+      "epoch": 0.07149606299212598,
+      "grad_norm": 38.47148132324219,
+      "learning_rate": 9.999648818897638e-06,
+      "loss": 0.4286,
+      "step": 227
+    },
+    {
+      "epoch": 0.07181102362204725,
+      "grad_norm": 23.091997146606445,
+      "learning_rate": 9.999647244094489e-06,
+      "loss": 0.3635,
+      "step": 228
+    },
+    {
+      "epoch": 0.0721259842519685,
+      "grad_norm": 48.05474090576172,
+      "learning_rate": 9.99964566929134e-06,
+      "loss": 0.3296,
+      "step": 229
+    },
+    {
+      "epoch": 0.07244094488188976,
+      "grad_norm": 56.6866569519043,
+      "learning_rate": 9.99964409448819e-06,
+      "loss": 0.5982,
+      "step": 230
+    },
+    {
+      "epoch": 0.07275590551181102,
+      "grad_norm": 34.4522705078125,
+      "learning_rate": 9.99964251968504e-06,
+      "loss": 0.5496,
+      "step": 231
+    },
+    {
+      "epoch": 0.07307086614173228,
+      "grad_norm": 36.2459831237793,
+      "learning_rate": 9.999640944881891e-06,
+      "loss": 0.4069,
+      "step": 232
+    },
+    {
+      "epoch": 0.07338582677165355,
+      "grad_norm": 57.894195556640625,
+      "learning_rate": 9.99963937007874e-06,
+      "loss": 0.4789,
+      "step": 233
+    },
+    {
+      "epoch": 0.0737007874015748,
+      "grad_norm": 100.86152648925781,
+      "learning_rate": 9.999637795275591e-06,
+      "loss": 0.3101,
+      "step": 234
+    },
+    {
+      "epoch": 0.07401574803149606,
+      "grad_norm": 49.66980743408203,
+      "learning_rate": 9.999636220472442e-06,
+      "loss": 0.516,
+      "step": 235
+    },
+    {
+      "epoch": 0.07433070866141732,
+      "grad_norm": 52.82820510864258,
+      "learning_rate": 9.999634645669292e-06,
+      "loss": 0.3732,
+      "step": 236
+    },
+    {
+      "epoch": 0.07464566929133858,
+      "grad_norm": 56.593467712402344,
+      "learning_rate": 9.999633070866143e-06,
+      "loss": 0.3873,
+      "step": 237
+    },
+    {
+      "epoch": 0.07496062992125985,
+      "grad_norm": 20.434045791625977,
+      "learning_rate": 9.999631496062992e-06,
+      "loss": 0.2495,
+      "step": 238
+    },
+    {
+      "epoch": 0.0752755905511811,
+      "grad_norm": 65.34156799316406,
+      "learning_rate": 9.999629921259843e-06,
+      "loss": 0.6973,
+      "step": 239
+    },
+    {
+      "epoch": 0.07559055118110236,
+      "grad_norm": 32.1629638671875,
+      "learning_rate": 9.999628346456694e-06,
+      "loss": 0.3504,
+      "step": 240
+    },
+    {
+      "epoch": 0.07559055118110236,
+      "eval_loss": 0.7387034296989441,
+      "eval_runtime": 307.2747,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 240
+    },
+    {
+      "epoch": 0.07590551181102362,
+      "grad_norm": 78.75101470947266,
+      "learning_rate": 9.999626771653545e-06,
+      "loss": 0.5959,
+      "step": 241
+    },
+    {
+      "epoch": 0.07622047244094488,
+      "grad_norm": 52.39651870727539,
+      "learning_rate": 9.999625196850394e-06,
+      "loss": 0.3459,
+      "step": 242
+    },
+    {
+      "epoch": 0.07653543307086615,
+      "grad_norm": 100.4014663696289,
+      "learning_rate": 9.999623622047245e-06,
+      "loss": 1.4185,
+      "step": 243
+    },
+    {
+      "epoch": 0.0768503937007874,
+      "grad_norm": 63.76593780517578,
+      "learning_rate": 9.999622047244095e-06,
+      "loss": 0.2557,
+      "step": 244
+    },
+    {
+      "epoch": 0.07716535433070866,
+      "grad_norm": 141.5568084716797,
+      "learning_rate": 9.999620472440946e-06,
+      "loss": 0.4333,
+      "step": 245
+    },
+    {
+      "epoch": 0.07748031496062992,
+      "grad_norm": 104.44050598144531,
+      "learning_rate": 9.999618897637797e-06,
+      "loss": 0.5989,
+      "step": 246
+    },
+    {
+      "epoch": 0.07779527559055117,
+      "grad_norm": 43.31072998046875,
+      "learning_rate": 9.999617322834646e-06,
+      "loss": 0.579,
+      "step": 247
+    },
+    {
+      "epoch": 0.07811023622047245,
+      "grad_norm": 47.9522819519043,
+      "learning_rate": 9.999615748031497e-06,
+      "loss": 0.2994,
+      "step": 248
+    },
+    {
+      "epoch": 0.0784251968503937,
+      "grad_norm": 40.461368560791016,
+      "learning_rate": 9.999614173228346e-06,
+      "loss": 0.4243,
+      "step": 249
+    },
+    {
+      "epoch": 0.07874015748031496,
+      "grad_norm": 26.12702751159668,
+      "learning_rate": 9.999612598425197e-06,
+      "loss": 0.3175,
+      "step": 250
+    },
+    {
+      "epoch": 0.07905511811023622,
+      "grad_norm": 76.74534606933594,
+      "learning_rate": 9.999611023622048e-06,
+      "loss": 0.6386,
+      "step": 251
+    },
+    {
+      "epoch": 0.07937007874015747,
+      "grad_norm": 61.15847396850586,
+      "learning_rate": 9.999609448818899e-06,
+      "loss": 0.2725,
+      "step": 252
+    },
+    {
+      "epoch": 0.07968503937007874,
+      "grad_norm": 119.91480255126953,
+      "learning_rate": 9.999607874015748e-06,
+      "loss": 0.2381,
+      "step": 253
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 138.52313232421875,
+      "learning_rate": 9.9996062992126e-06,
+      "loss": 1.0807,
+      "step": 254
+    },
+    {
+      "epoch": 0.08031496062992126,
+      "grad_norm": 99.42451477050781,
+      "learning_rate": 9.99960472440945e-06,
+      "loss": 0.4007,
+      "step": 255
+    },
+    {
+      "epoch": 0.08062992125984252,
+      "grad_norm": 51.6858024597168,
+      "learning_rate": 9.9996031496063e-06,
+      "loss": 0.2963,
+      "step": 256
+    },
+    {
+      "epoch": 0.08094488188976377,
+      "grad_norm": 52.566734313964844,
+      "learning_rate": 9.99960157480315e-06,
+      "loss": 0.2889,
+      "step": 257
+    },
+    {
+      "epoch": 0.08125984251968504,
+      "grad_norm": 94.96017456054688,
+      "learning_rate": 9.9996e-06,
+      "loss": 1.354,
+      "step": 258
+    },
+    {
+      "epoch": 0.0815748031496063,
+      "grad_norm": 54.514915466308594,
+      "learning_rate": 9.999598425196851e-06,
+      "loss": 0.1296,
+      "step": 259
+    },
+    {
+      "epoch": 0.08188976377952756,
+      "grad_norm": 165.0517578125,
+      "learning_rate": 9.999596850393702e-06,
+      "loss": 0.7929,
+      "step": 260
+    },
+    {
+      "epoch": 0.08188976377952756,
+      "eval_loss": 0.767807126045227,
+      "eval_runtime": 307.1459,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 260
+    },
+    {
+      "epoch": 0.08220472440944881,
+      "grad_norm": 62.38746643066406,
+      "learning_rate": 9.999595275590553e-06,
+      "loss": 0.2907,
+      "step": 261
+    },
+    {
+      "epoch": 0.08251968503937007,
+      "grad_norm": 107.24059295654297,
+      "learning_rate": 9.999593700787402e-06,
+      "loss": 0.9582,
+      "step": 262
+    },
+    {
+      "epoch": 0.08283464566929134,
+      "grad_norm": 182.7991943359375,
+      "learning_rate": 9.999592125984253e-06,
+      "loss": 1.1002,
+      "step": 263
+    },
+    {
+      "epoch": 0.0831496062992126,
+      "grad_norm": 39.42921829223633,
+      "learning_rate": 9.999590551181102e-06,
+      "loss": 0.3591,
+      "step": 264
+    },
+    {
+      "epoch": 0.08346456692913386,
+      "grad_norm": 78.57293701171875,
+      "learning_rate": 9.999588976377953e-06,
+      "loss": 0.5395,
+      "step": 265
+    },
+    {
+      "epoch": 0.08377952755905511,
+      "grad_norm": 46.55572509765625,
+      "learning_rate": 9.999587401574804e-06,
+      "loss": 0.4244,
+      "step": 266
+    },
+    {
+      "epoch": 0.08409448818897637,
+      "grad_norm": 72.65052795410156,
+      "learning_rate": 9.999585826771654e-06,
+      "loss": 0.5564,
+      "step": 267
+    },
+    {
+      "epoch": 0.08440944881889764,
+      "grad_norm": 40.577850341796875,
+      "learning_rate": 9.999584251968505e-06,
+      "loss": 0.319,
+      "step": 268
+    },
+    {
+      "epoch": 0.0847244094488189,
+      "grad_norm": 60.15317916870117,
+      "learning_rate": 9.999582677165354e-06,
+      "loss": 0.2762,
+      "step": 269
+    },
+    {
+      "epoch": 0.08503937007874016,
+      "grad_norm": 64.16828918457031,
+      "learning_rate": 9.999581102362205e-06,
+      "loss": 0.5016,
+      "step": 270
+    },
+    {
+      "epoch": 0.08535433070866141,
+      "grad_norm": 62.69063949584961,
+      "learning_rate": 9.999579527559056e-06,
+      "loss": 0.4175,
+      "step": 271
+    },
+    {
+      "epoch": 0.08566929133858267,
+      "grad_norm": 49.130157470703125,
+      "learning_rate": 9.999577952755907e-06,
+      "loss": 0.3619,
+      "step": 272
+    },
+    {
+      "epoch": 0.08598425196850394,
+      "grad_norm": 71.22623443603516,
+      "learning_rate": 9.999576377952756e-06,
+      "loss": 0.3924,
+      "step": 273
+    },
+    {
+      "epoch": 0.0862992125984252,
+      "grad_norm": 77.7140884399414,
+      "learning_rate": 9.999574803149607e-06,
+      "loss": 0.4717,
+      "step": 274
+    },
+    {
+      "epoch": 0.08661417322834646,
+      "grad_norm": 78.33636474609375,
+      "learning_rate": 9.999573228346458e-06,
+      "loss": 0.5174,
+      "step": 275
+    },
+    {
+      "epoch": 0.08692913385826771,
+      "grad_norm": 48.37542724609375,
+      "learning_rate": 9.999571653543308e-06,
+      "loss": 0.2733,
+      "step": 276
+    },
+    {
+      "epoch": 0.08724409448818897,
+      "grad_norm": 57.93960189819336,
+      "learning_rate": 9.999570078740159e-06,
+      "loss": 0.6626,
+      "step": 277
+    },
+    {
+      "epoch": 0.08755905511811024,
+      "grad_norm": 58.80123519897461,
+      "learning_rate": 9.999568503937008e-06,
+      "loss": 0.4598,
+      "step": 278
+    },
+    {
+      "epoch": 0.0878740157480315,
+      "grad_norm": 49.037818908691406,
+      "learning_rate": 9.999566929133859e-06,
+      "loss": 0.4452,
+      "step": 279
+    },
+    {
+      "epoch": 0.08818897637795275,
+      "grad_norm": 62.81136703491211,
+      "learning_rate": 9.99956535433071e-06,
+      "loss": 0.348,
+      "step": 280
+    },
+    {
+      "epoch": 0.08818897637795275,
+      "eval_loss": 0.618428111076355,
+      "eval_runtime": 306.6914,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 280
+    },
+    {
+      "epoch": 0.08850393700787401,
+      "grad_norm": 61.92344284057617,
+      "learning_rate": 9.999563779527561e-06,
+      "loss": 0.2919,
+      "step": 281
+    },
+    {
+      "epoch": 0.08881889763779527,
+      "grad_norm": 75.2997817993164,
+      "learning_rate": 9.99956220472441e-06,
+      "loss": 0.4883,
+      "step": 282
+    },
+    {
+      "epoch": 0.08913385826771654,
+      "grad_norm": 34.474639892578125,
+      "learning_rate": 9.999560629921261e-06,
+      "loss": 0.1744,
+      "step": 283
+    },
+    {
+      "epoch": 0.0894488188976378,
+      "grad_norm": 79.71351623535156,
+      "learning_rate": 9.99955905511811e-06,
+      "loss": 0.4965,
+      "step": 284
+    },
+    {
+      "epoch": 0.08976377952755905,
+      "grad_norm": 127.44145202636719,
+      "learning_rate": 9.999557480314961e-06,
+      "loss": 0.7171,
+      "step": 285
+    },
+    {
+      "epoch": 0.09007874015748031,
+      "grad_norm": 57.06454086303711,
+      "learning_rate": 9.999555905511812e-06,
+      "loss": 0.5155,
+      "step": 286
+    },
+    {
+      "epoch": 0.09039370078740157,
+      "grad_norm": 86.23944854736328,
+      "learning_rate": 9.999554330708662e-06,
+      "loss": 0.613,
+      "step": 287
+    },
+    {
+      "epoch": 0.09070866141732284,
+      "grad_norm": 188.0777587890625,
+      "learning_rate": 9.999552755905513e-06,
+      "loss": 0.3885,
+      "step": 288
+    },
+    {
+      "epoch": 0.0910236220472441,
+      "grad_norm": 49.08794403076172,
+      "learning_rate": 9.999551181102362e-06,
+      "loss": 0.3903,
+      "step": 289
+    },
+    {
+      "epoch": 0.09133858267716535,
+      "grad_norm": 77.4734115600586,
+      "learning_rate": 9.999549606299213e-06,
+      "loss": 0.629,
+      "step": 290
+    },
+    {
+      "epoch": 0.09165354330708661,
+      "grad_norm": 31.26721954345703,
+      "learning_rate": 9.999548031496064e-06,
+      "loss": 0.2508,
+      "step": 291
+    },
+    {
+      "epoch": 0.09196850393700787,
+      "grad_norm": 59.19281768798828,
+      "learning_rate": 9.999546456692915e-06,
+      "loss": 0.1862,
+      "step": 292
+    },
+    {
+      "epoch": 0.09228346456692914,
+      "grad_norm": 97.84364318847656,
+      "learning_rate": 9.999544881889764e-06,
+      "loss": 0.8973,
+      "step": 293
+    },
+    {
+      "epoch": 0.0925984251968504,
+      "grad_norm": 94.82438659667969,
+      "learning_rate": 9.999543307086615e-06,
+      "loss": 0.8992,
+      "step": 294
+    },
+    {
+      "epoch": 0.09291338582677165,
+      "grad_norm": 57.59076690673828,
+      "learning_rate": 9.999541732283465e-06,
+      "loss": 0.6437,
+      "step": 295
+    },
+    {
+      "epoch": 0.09322834645669291,
+      "grad_norm": 37.8861198425293,
+      "learning_rate": 9.999540157480316e-06,
+      "loss": 0.4185,
+      "step": 296
+    },
+    {
+      "epoch": 0.09354330708661417,
+      "grad_norm": 28.94227409362793,
+      "learning_rate": 9.999538582677167e-06,
+      "loss": 0.285,
+      "step": 297
+    },
+    {
+      "epoch": 0.09385826771653544,
+      "grad_norm": 50.66032409667969,
+      "learning_rate": 9.999537007874016e-06,
+      "loss": 0.6087,
+      "step": 298
+    },
+    {
+      "epoch": 0.0941732283464567,
+      "grad_norm": 24.23774528503418,
+      "learning_rate": 9.999535433070867e-06,
+      "loss": 0.2489,
+      "step": 299
+    },
+    {
+      "epoch": 0.09448818897637795,
+      "grad_norm": 50.08018493652344,
+      "learning_rate": 9.999533858267718e-06,
+      "loss": 0.6251,
+      "step": 300
+    },
+    {
+      "epoch": 0.09448818897637795,
+      "eval_loss": 0.5396940112113953,
+      "eval_runtime": 294.9781,
+      "eval_samples_per_second": 0.397,
+      "eval_steps_per_second": 0.397,
+      "step": 300
+    },
+    {
+      "epoch": 0.09480314960629921,
+      "grad_norm": 87.98992919921875,
+      "learning_rate": 9.999532283464569e-06,
+      "loss": 0.6764,
+      "step": 301
+    },
+    {
+      "epoch": 0.09511811023622047,
+      "grad_norm": 47.72505187988281,
+      "learning_rate": 9.999530708661418e-06,
+      "loss": 0.754,
+      "step": 302
+    },
+    {
+      "epoch": 0.09543307086614174,
+      "grad_norm": 29.56645393371582,
+      "learning_rate": 9.999529133858269e-06,
+      "loss": 0.2754,
+      "step": 303
+    },
+    {
+      "epoch": 0.095748031496063,
+      "grad_norm": 66.290283203125,
+      "learning_rate": 9.999527559055118e-06,
+      "loss": 0.5663,
+      "step": 304
+    },
+    {
+      "epoch": 0.09606299212598425,
+      "grad_norm": 38.929725646972656,
+      "learning_rate": 9.99952598425197e-06,
+      "loss": 0.3187,
+      "step": 305
+    },
+    {
+      "epoch": 0.09637795275590551,
+      "grad_norm": 55.97653579711914,
+      "learning_rate": 9.99952440944882e-06,
+      "loss": 0.4682,
+      "step": 306
+    },
+    {
+      "epoch": 0.09669291338582676,
+      "grad_norm": 41.88676071166992,
+      "learning_rate": 9.99952283464567e-06,
+      "loss": 0.4863,
+      "step": 307
+    },
+    {
+      "epoch": 0.09700787401574804,
+      "grad_norm": 39.72370529174805,
+      "learning_rate": 9.99952125984252e-06,
+      "loss": 0.3445,
+      "step": 308
+    },
+    {
+      "epoch": 0.09732283464566929,
+      "grad_norm": 48.20722579956055,
+      "learning_rate": 9.99951968503937e-06,
+      "loss": 0.4213,
+      "step": 309
+    },
+    {
+      "epoch": 0.09763779527559055,
+      "grad_norm": 54.454715728759766,
+      "learning_rate": 9.999518110236221e-06,
+      "loss": 0.4397,
+      "step": 310
+    },
+    {
+      "epoch": 0.0979527559055118,
+      "grad_norm": 64.91082000732422,
+      "learning_rate": 9.999516535433072e-06,
+      "loss": 0.2707,
+      "step": 311
+    },
+    {
+      "epoch": 0.09826771653543306,
+      "grad_norm": 94.55459594726562,
+      "learning_rate": 9.999514960629923e-06,
+      "loss": 0.3693,
+      "step": 312
+    },
+    {
+      "epoch": 0.09858267716535433,
+      "grad_norm": 96.01959991455078,
+      "learning_rate": 9.999513385826772e-06,
+      "loss": 0.5796,
+      "step": 313
+    },
+    {
+      "epoch": 0.09889763779527559,
+      "grad_norm": 44.687355041503906,
+      "learning_rate": 9.999511811023623e-06,
+      "loss": 0.4414,
+      "step": 314
+    },
+    {
+      "epoch": 0.09921259842519685,
+      "grad_norm": 108.15480041503906,
+      "learning_rate": 9.999510236220473e-06,
+      "loss": 0.4173,
+      "step": 315
+    },
+    {
+      "epoch": 0.0995275590551181,
+      "grad_norm": 42.95850372314453,
+      "learning_rate": 9.999508661417324e-06,
+      "loss": 0.1891,
+      "step": 316
+    },
+    {
+      "epoch": 0.09984251968503936,
+      "grad_norm": 66.48217010498047,
+      "learning_rate": 9.999507086614174e-06,
+      "loss": 0.2298,
+      "step": 317
+    },
+    {
+      "epoch": 0.10015748031496063,
+      "grad_norm": 152.71719360351562,
+      "learning_rate": 9.999505511811024e-06,
+      "loss": 0.4982,
+      "step": 318
+    },
+    {
+      "epoch": 0.10047244094488189,
+      "grad_norm": 68.98497772216797,
+      "learning_rate": 9.999503937007875e-06,
+      "loss": 0.6307,
+      "step": 319
+    },
+    {
+      "epoch": 0.10078740157480315,
+      "grad_norm": 68.25215911865234,
+      "learning_rate": 9.999502362204724e-06,
+      "loss": 0.2943,
+      "step": 320
+    },
+    {
+      "epoch": 0.10078740157480315,
+      "eval_loss": 0.6371558904647827,
+      "eval_runtime": 297.2013,
+      "eval_samples_per_second": 0.394,
+      "eval_steps_per_second": 0.394,
+      "step": 320
+    },
+    {
+      "epoch": 0.1011023622047244,
+      "grad_norm": 64.41675567626953,
+      "learning_rate": 9.999500787401577e-06,
+      "loss": 0.3071,
+      "step": 321
+    },
+    {
+      "epoch": 0.10141732283464568,
+      "grad_norm": 146.19937133789062,
+      "learning_rate": 9.999499212598426e-06,
+      "loss": 0.1765,
+      "step": 322
+    },
+    {
+      "epoch": 0.10173228346456693,
+      "grad_norm": 81.2210464477539,
+      "learning_rate": 9.999497637795277e-06,
+      "loss": 0.854,
+      "step": 323
+    },
+    {
+      "epoch": 0.10204724409448819,
+      "grad_norm": 142.67649841308594,
+      "learning_rate": 9.999496062992126e-06,
+      "loss": 0.431,
+      "step": 324
+    },
+    {
+      "epoch": 0.10236220472440945,
+      "grad_norm": 23.75156593322754,
+      "learning_rate": 9.999494488188977e-06,
+      "loss": 0.0743,
+      "step": 325
+    },
+    {
+      "epoch": 0.1026771653543307,
+      "grad_norm": 154.65882873535156,
+      "learning_rate": 9.999492913385828e-06,
+      "loss": 0.7567,
+      "step": 326
+    },
+    {
+      "epoch": 0.10299212598425198,
+      "grad_norm": 284.99822998046875,
+      "learning_rate": 9.999491338582678e-06,
+      "loss": 1.2282,
+      "step": 327
+    },
+    {
+      "epoch": 0.10330708661417323,
+      "grad_norm": 87.0584945678711,
+      "learning_rate": 9.999489763779529e-06,
+      "loss": 1.0155,
+      "step": 328
+    },
+    {
+      "epoch": 0.10362204724409449,
+      "grad_norm": 54.77091598510742,
+      "learning_rate": 9.999488188976378e-06,
+      "loss": 0.2331,
+      "step": 329
+    },
+    {
+      "epoch": 0.10393700787401575,
+      "grad_norm": 93.18637084960938,
+      "learning_rate": 9.999486614173229e-06,
+      "loss": 0.5502,
+      "step": 330
+    },
+    {
+      "epoch": 0.104251968503937,
+      "grad_norm": 70.45845031738281,
+      "learning_rate": 9.99948503937008e-06,
+      "loss": 0.5354,
+      "step": 331
+    },
+    {
+      "epoch": 0.10456692913385827,
+      "grad_norm": 55.51031494140625,
+      "learning_rate": 9.999483464566931e-06,
+      "loss": 0.5554,
+      "step": 332
+    },
+    {
+      "epoch": 0.10488188976377953,
+      "grad_norm": 93.07254028320312,
+      "learning_rate": 9.99948188976378e-06,
+      "loss": 0.521,
+      "step": 333
+    },
+    {
+      "epoch": 0.10519685039370079,
+      "grad_norm": 31.241605758666992,
+      "learning_rate": 9.999480314960631e-06,
+      "loss": 0.2664,
+      "step": 334
+    },
+    {
+      "epoch": 0.10551181102362205,
+      "grad_norm": 54.83103942871094,
+      "learning_rate": 9.99947874015748e-06,
+      "loss": 0.4913,
+      "step": 335
+    },
+    {
+      "epoch": 0.1058267716535433,
+      "grad_norm": 55.371360778808594,
+      "learning_rate": 9.999477165354331e-06,
+      "loss": 0.5836,
+      "step": 336
+    },
+    {
+      "epoch": 0.10614173228346457,
+      "grad_norm": 77.42748260498047,
+      "learning_rate": 9.999475590551182e-06,
+      "loss": 0.4551,
+      "step": 337
+    },
+    {
+      "epoch": 0.10645669291338583,
+      "grad_norm": 52.34659194946289,
+      "learning_rate": 9.999474015748032e-06,
+      "loss": 0.3793,
+      "step": 338
+    },
+    {
+      "epoch": 0.10677165354330709,
+      "grad_norm": 69.15506744384766,
+      "learning_rate": 9.999472440944883e-06,
+      "loss": 0.4266,
+      "step": 339
+    },
+    {
+      "epoch": 0.10708661417322834,
+      "grad_norm": 59.99565887451172,
+      "learning_rate": 9.999470866141732e-06,
+      "loss": 0.3804,
+      "step": 340
+    },
+    {
+      "epoch": 0.10708661417322834,
+      "eval_loss": 0.5206155776977539,
+      "eval_runtime": 293.7622,
+      "eval_samples_per_second": 0.398,
+      "eval_steps_per_second": 0.398,
+      "step": 340
+    },
+    {
+      "epoch": 0.1074015748031496,
+      "grad_norm": 62.315391540527344,
+      "learning_rate": 9.999469291338583e-06,
+      "loss": 0.5939,
+      "step": 341
+    },
+    {
+      "epoch": 0.10771653543307087,
+      "grad_norm": 45.72246551513672,
+      "learning_rate": 9.999467716535434e-06,
+      "loss": 0.2962,
+      "step": 342
+    },
+    {
+      "epoch": 0.10803149606299213,
+      "grad_norm": 72.52598571777344,
+      "learning_rate": 9.999466141732285e-06,
+      "loss": 0.773,
+      "step": 343
+    },
+    {
+      "epoch": 0.10834645669291339,
+      "grad_norm": 50.78411102294922,
+      "learning_rate": 9.999464566929134e-06,
+      "loss": 0.4392,
+      "step": 344
+    },
+    {
+      "epoch": 0.10866141732283464,
+      "grad_norm": 70.54865264892578,
+      "learning_rate": 9.999462992125985e-06,
+      "loss": 0.5495,
+      "step": 345
+    },
+    {
+      "epoch": 0.1089763779527559,
+      "grad_norm": 42.886756896972656,
+      "learning_rate": 9.999461417322836e-06,
+      "loss": 0.3558,
+      "step": 346
+    },
+    {
+      "epoch": 0.10929133858267717,
+      "grad_norm": 61.35227584838867,
+      "learning_rate": 9.999459842519686e-06,
+      "loss": 0.583,
+      "step": 347
+    },
+    {
+      "epoch": 0.10960629921259843,
+      "grad_norm": 75.92544555664062,
+      "learning_rate": 9.999458267716537e-06,
+      "loss": 0.6637,
+      "step": 348
+    },
+    {
+      "epoch": 0.10992125984251969,
+      "grad_norm": 47.078548431396484,
+      "learning_rate": 9.999456692913386e-06,
+      "loss": 0.5437,
+      "step": 349
+    },
+    {
+      "epoch": 0.11023622047244094,
+      "grad_norm": 46.12405014038086,
+      "learning_rate": 9.999455118110237e-06,
+      "loss": 0.463,
+      "step": 350
+    },
+    {
+      "epoch": 0.1105511811023622,
+      "grad_norm": 49.5578727722168,
+      "learning_rate": 9.999453543307088e-06,
+      "loss": 0.2595,
+      "step": 351
+    },
+    {
+      "epoch": 0.11086614173228347,
+      "grad_norm": 42.03670883178711,
+      "learning_rate": 9.999451968503939e-06,
+      "loss": 0.3669,
+      "step": 352
+    },
+    {
+      "epoch": 0.11118110236220473,
+      "grad_norm": 55.1522102355957,
+      "learning_rate": 9.999450393700788e-06,
+      "loss": 0.5018,
+      "step": 353
+    },
+    {
+      "epoch": 0.11149606299212599,
+      "grad_norm": 125.36481475830078,
+      "learning_rate": 9.999448818897639e-06,
+      "loss": 0.4888,
+      "step": 354
+    },
+    {
+      "epoch": 0.11181102362204724,
+      "grad_norm": 81.62045288085938,
+      "learning_rate": 9.999447244094488e-06,
+      "loss": 0.4195,
+      "step": 355
+    },
+    {
+      "epoch": 0.1121259842519685,
+      "grad_norm": 85.13298797607422,
+      "learning_rate": 9.99944566929134e-06,
+      "loss": 0.4851,
+      "step": 356
+    },
+    {
+      "epoch": 0.11244094488188977,
+      "grad_norm": 245.27197265625,
+      "learning_rate": 9.99944409448819e-06,
+      "loss": 0.5387,
+      "step": 357
+    },
+    {
+      "epoch": 0.11275590551181103,
+      "grad_norm": 71.68444061279297,
+      "learning_rate": 9.99944251968504e-06,
+      "loss": 0.2865,
+      "step": 358
+    },
+    {
+      "epoch": 0.11307086614173228,
+      "grad_norm": 44.38494110107422,
+      "learning_rate": 9.99944094488189e-06,
+      "loss": 0.4339,
+      "step": 359
+    },
+    {
+      "epoch": 0.11338582677165354,
+      "grad_norm": 62.24411392211914,
+      "learning_rate": 9.99943937007874e-06,
+      "loss": 0.4798,
+      "step": 360
+    },
+    {
+      "epoch": 0.11338582677165354,
+      "eval_loss": 0.5544171929359436,
+      "eval_runtime": 293.2522,
+      "eval_samples_per_second": 0.399,
+      "eval_steps_per_second": 0.399,
+      "step": 360
+    },
+    {
+      "epoch": 0.1137007874015748,
+      "grad_norm": 97.97586059570312,
+      "learning_rate": 9.999437795275591e-06,
+      "loss": 0.8155,
+      "step": 361
+    },
+    {
+      "epoch": 0.11401574803149607,
+      "grad_norm": 101.96649169921875,
+      "learning_rate": 9.999436220472442e-06,
+      "loss": 0.3751,
+      "step": 362
+    },
+    {
+      "epoch": 0.11433070866141733,
+      "grad_norm": 109.22547912597656,
+      "learning_rate": 9.999434645669293e-06,
+      "loss": 0.9483,
+      "step": 363
+    },
+    {
+      "epoch": 0.11464566929133858,
+      "grad_norm": 101.05289459228516,
+      "learning_rate": 9.999433070866142e-06,
+      "loss": 0.4469,
+      "step": 364
+    },
+    {
+      "epoch": 0.11496062992125984,
+      "grad_norm": 97.29914855957031,
+      "learning_rate": 9.999431496062993e-06,
+      "loss": 0.4986,
+      "step": 365
+    },
+    {
+      "epoch": 0.1152755905511811,
+      "grad_norm": 52.88810729980469,
+      "learning_rate": 9.999429921259843e-06,
+      "loss": 0.239,
+      "step": 366
+    },
+    {
+      "epoch": 0.11559055118110237,
+      "grad_norm": 220.28375244140625,
+      "learning_rate": 9.999428346456694e-06,
+      "loss": 0.9343,
+      "step": 367
+    },
+    {
+      "epoch": 0.11590551181102363,
+      "grad_norm": 128.3485565185547,
+      "learning_rate": 9.999426771653545e-06,
+      "loss": 0.6493,
+      "step": 368
+    },
+    {
+      "epoch": 0.11622047244094488,
+      "grad_norm": 68.30548095703125,
+      "learning_rate": 9.999425196850394e-06,
+      "loss": 0.6118,
+      "step": 369
+    },
+    {
+      "epoch": 0.11653543307086614,
+      "grad_norm": 85.01860809326172,
+      "learning_rate": 9.999423622047245e-06,
+      "loss": 0.9292,
+      "step": 370
+    },
+    {
+      "epoch": 0.1168503937007874,
+      "grad_norm": 42.967952728271484,
+      "learning_rate": 9.999422047244096e-06,
+      "loss": 0.3607,
+      "step": 371
+    },
+    {
+      "epoch": 0.11716535433070867,
+      "grad_norm": 31.546159744262695,
+      "learning_rate": 9.999420472440947e-06,
+      "loss": 0.1924,
+      "step": 372
+    },
+    {
+      "epoch": 0.11748031496062993,
+      "grad_norm": 49.90913391113281,
+      "learning_rate": 9.999418897637796e-06,
+      "loss": 0.3681,
+      "step": 373
+    },
+    {
+      "epoch": 0.11779527559055118,
+      "grad_norm": 43.57588195800781,
+      "learning_rate": 9.999417322834647e-06,
+      "loss": 0.2731,
+      "step": 374
+    },
+    {
+      "epoch": 0.11811023622047244,
+      "grad_norm": 53.5254020690918,
+      "learning_rate": 9.999415748031496e-06,
+      "loss": 0.5743,
+      "step": 375
+    },
+    {
+      "epoch": 0.1184251968503937,
+      "grad_norm": 57.816184997558594,
+      "learning_rate": 9.999414173228347e-06,
+      "loss": 0.6925,
+      "step": 376
+    },
+    {
+      "epoch": 0.11874015748031497,
+      "grad_norm": 50.099021911621094,
+      "learning_rate": 9.999412598425198e-06,
+      "loss": 0.4436,
+      "step": 377
+    },
+    {
+      "epoch": 0.11905511811023622,
+      "grad_norm": 38.81980514526367,
+      "learning_rate": 9.999411023622048e-06,
+      "loss": 0.3597,
+      "step": 378
+    },
+    {
+      "epoch": 0.11937007874015748,
+      "grad_norm": 60.50627517700195,
+      "learning_rate": 9.999409448818899e-06,
+      "loss": 0.3588,
+      "step": 379
+    },
+    {
+      "epoch": 0.11968503937007874,
+      "grad_norm": 88.20054626464844,
+      "learning_rate": 9.999407874015748e-06,
+      "loss": 0.3192,
+      "step": 380
+    },
+    {
+      "epoch": 0.11968503937007874,
+      "eval_loss": 0.5467623472213745,
+      "eval_runtime": 292.898,
+      "eval_samples_per_second": 0.399,
+      "eval_steps_per_second": 0.399,
+      "step": 380
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 39.78269577026367,
+      "learning_rate": 9.999406299212599e-06,
+      "loss": 0.4447,
+      "step": 381
+    },
+    {
+      "epoch": 0.12031496062992127,
+      "grad_norm": 61.480377197265625,
+      "learning_rate": 9.99940472440945e-06,
+      "loss": 0.3143,
+      "step": 382
+    },
+    {
+      "epoch": 0.12062992125984252,
+      "grad_norm": 200.373779296875,
+      "learning_rate": 9.999403149606301e-06,
+      "loss": 1.3858,
+      "step": 383
+    },
+    {
+      "epoch": 0.12094488188976378,
+      "grad_norm": 162.78187561035156,
+      "learning_rate": 9.99940157480315e-06,
+      "loss": 0.6117,
+      "step": 384
+    },
+    {
+      "epoch": 0.12125984251968504,
+      "grad_norm": 42.38726043701172,
+      "learning_rate": 9.999400000000001e-06,
+      "loss": 0.3125,
+      "step": 385
+    },
+    {
+      "epoch": 0.1215748031496063,
+      "grad_norm": 47.82020568847656,
+      "learning_rate": 9.99939842519685e-06,
+      "loss": 0.3962,
+      "step": 386
+    },
+    {
+      "epoch": 0.12188976377952757,
+      "grad_norm": 106.18301391601562,
+      "learning_rate": 9.999396850393701e-06,
+      "loss": 0.6107,
+      "step": 387
+    },
+    {
+      "epoch": 0.12220472440944882,
+      "grad_norm": 57.39361572265625,
+      "learning_rate": 9.999395275590552e-06,
+      "loss": 0.2136,
+      "step": 388
+    },
+    {
+      "epoch": 0.12251968503937008,
+      "grad_norm": 32.14236068725586,
+      "learning_rate": 9.999393700787402e-06,
+      "loss": 0.0976,
+      "step": 389
+    },
+    {
+      "epoch": 0.12283464566929134,
+      "grad_norm": 13.914974212646484,
+      "learning_rate": 9.999392125984253e-06,
+      "loss": 0.0404,
+      "step": 390
+    },
+    {
+      "epoch": 0.1231496062992126,
+      "grad_norm": 67.5541000366211,
+      "learning_rate": 9.999390551181104e-06,
+      "loss": 0.5028,
+      "step": 391
+    },
+    {
+      "epoch": 0.12346456692913386,
+      "grad_norm": 108.35496520996094,
+      "learning_rate": 9.999388976377955e-06,
+      "loss": 0.686,
+      "step": 392
+    },
+    {
+      "epoch": 0.12377952755905512,
+      "grad_norm": 69.92194366455078,
+      "learning_rate": 9.999387401574804e-06,
+      "loss": 0.2082,
+      "step": 393
+    },
+    {
+      "epoch": 0.12409448818897638,
+      "grad_norm": 23.27518081665039,
+      "learning_rate": 9.999385826771655e-06,
+      "loss": 0.0539,
+      "step": 394
+    },
+    {
+      "epoch": 0.12440944881889764,
+      "grad_norm": 48.74710464477539,
+      "learning_rate": 9.999384251968504e-06,
+      "loss": 0.3422,
+      "step": 395
+    },
+    {
+      "epoch": 0.1247244094488189,
+      "grad_norm": 138.21983337402344,
+      "learning_rate": 9.999382677165355e-06,
+      "loss": 0.8565,
+      "step": 396
+    },
+    {
+      "epoch": 0.12503937007874016,
+      "grad_norm": 112.90072631835938,
+      "learning_rate": 9.999381102362206e-06,
+      "loss": 0.2024,
+      "step": 397
+    },
+    {
+      "epoch": 0.12535433070866142,
+      "grad_norm": 256.4333190917969,
+      "learning_rate": 9.999379527559056e-06,
+      "loss": 1.8303,
+      "step": 398
+    },
+    {
+      "epoch": 0.12566929133858268,
+      "grad_norm": 112.24787902832031,
+      "learning_rate": 9.999377952755907e-06,
+      "loss": 0.3992,
+      "step": 399
+    },
+    {
+      "epoch": 0.12598425196850394,
+      "grad_norm": 131.27490234375,
+      "learning_rate": 9.999376377952756e-06,
+      "loss": 0.5066,
+      "step": 400
+    },
+    {
+      "epoch": 0.12598425196850394,
+      "eval_loss": 0.6591930985450745,
+      "eval_runtime": 304.46,
+      "eval_samples_per_second": 0.384,
+      "eval_steps_per_second": 0.384,
+      "step": 400
+    },
+    {
+      "epoch": 0.1262992125984252,
+      "grad_norm": 194.88157653808594,
+      "learning_rate": 9.999374803149607e-06,
+      "loss": 1.5517,
+      "step": 401
+    },
+    {
+      "epoch": 0.12661417322834645,
+      "grad_norm": 57.20365524291992,
+      "learning_rate": 9.999373228346458e-06,
+      "loss": 0.4244,
+      "step": 402
+    },
+    {
+      "epoch": 0.1269291338582677,
+      "grad_norm": 36.50507736206055,
+      "learning_rate": 9.999371653543309e-06,
+      "loss": 0.1584,
+      "step": 403
+    },
+    {
+      "epoch": 0.12724409448818896,
+      "grad_norm": 133.4579620361328,
+      "learning_rate": 9.999370078740158e-06,
+      "loss": 1.333,
+      "step": 404
+    },
+    {
+      "epoch": 0.12755905511811025,
+      "grad_norm": 128.0843505859375,
+      "learning_rate": 9.999368503937009e-06,
+      "loss": 0.4923,
+      "step": 405
+    },
+    {
+      "epoch": 0.1278740157480315,
+      "grad_norm": 100.4643325805664,
+      "learning_rate": 9.999366929133858e-06,
+      "loss": 0.4229,
+      "step": 406
+    },
+    {
+      "epoch": 0.12818897637795276,
+      "grad_norm": 51.65610885620117,
+      "learning_rate": 9.99936535433071e-06,
+      "loss": 0.3144,
+      "step": 407
+    },
+    {
+      "epoch": 0.12850393700787402,
+      "grad_norm": 86.17060852050781,
+      "learning_rate": 9.99936377952756e-06,
+      "loss": 0.5492,
+      "step": 408
+    },
+    {
+      "epoch": 0.12881889763779528,
+      "grad_norm": 11.016338348388672,
+      "learning_rate": 9.99936220472441e-06,
+      "loss": 0.0502,
+      "step": 409
+    },
+    {
+      "epoch": 0.12913385826771653,
+      "grad_norm": 75.9980697631836,
+      "learning_rate": 9.99936062992126e-06,
+      "loss": 0.47,
+      "step": 410
+    },
+    {
+      "epoch": 0.1294488188976378,
+      "grad_norm": 64.87591552734375,
+      "learning_rate": 9.99935905511811e-06,
+      "loss": 0.2592,
+      "step": 411
+    },
+    {
+      "epoch": 0.12976377952755905,
+      "grad_norm": 109.087646484375,
+      "learning_rate": 9.999357480314961e-06,
+      "loss": 0.7454,
+      "step": 412
+    },
+    {
+      "epoch": 0.1300787401574803,
+      "grad_norm": 46.490875244140625,
+      "learning_rate": 9.999355905511812e-06,
+      "loss": 0.2048,
+      "step": 413
+    },
+    {
+      "epoch": 0.13039370078740156,
+      "grad_norm": 156.15184020996094,
+      "learning_rate": 9.999354330708663e-06,
+      "loss": 0.4924,
+      "step": 414
+    },
+    {
+      "epoch": 0.13070866141732285,
+      "grad_norm": 120.5556640625,
+      "learning_rate": 9.999352755905512e-06,
+      "loss": 0.8374,
+      "step": 415
+    },
+    {
+      "epoch": 0.1310236220472441,
+      "grad_norm": 151.4144744873047,
+      "learning_rate": 9.999351181102363e-06,
+      "loss": 0.6144,
+      "step": 416
+    },
+    {
+      "epoch": 0.13133858267716536,
+      "grad_norm": 90.7903060913086,
+      "learning_rate": 9.999349606299214e-06,
+      "loss": 0.2709,
+      "step": 417
+    },
+    {
+      "epoch": 0.13165354330708662,
+      "grad_norm": 27.181320190429688,
+      "learning_rate": 9.999348031496064e-06,
+      "loss": 0.3197,
+      "step": 418
+    },
+    {
+      "epoch": 0.13196850393700787,
+      "grad_norm": 98.53707885742188,
+      "learning_rate": 9.999346456692915e-06,
+      "loss": 0.8255,
+      "step": 419
+    },
+    {
+      "epoch": 0.13228346456692913,
+      "grad_norm": 34.432132720947266,
+      "learning_rate": 9.999344881889764e-06,
+      "loss": 0.1117,
+      "step": 420
+    },
+    {
+      "epoch": 0.13228346456692913,
+      "eval_loss": 0.535347580909729,
+      "eval_runtime": 306.3308,
+      "eval_samples_per_second": 0.382,
+      "eval_steps_per_second": 0.382,
+      "step": 420
+    },
+    {
+      "epoch": 0.1325984251968504,
+      "grad_norm": 42.67380142211914,
+      "learning_rate": 9.999343307086615e-06,
+      "loss": 0.3283,
+      "step": 421
+    },
+    {
+      "epoch": 0.13291338582677165,
+      "grad_norm": 71.41397857666016,
+      "learning_rate": 9.999341732283466e-06,
+      "loss": 0.295,
+      "step": 422
+    },
+    {
+      "epoch": 0.1332283464566929,
+      "grad_norm": 28.39377784729004,
+      "learning_rate": 9.999340157480317e-06,
+      "loss": 0.1143,
+      "step": 423
+    },
+    {
+      "epoch": 0.13354330708661416,
+      "grad_norm": 61.82292556762695,
+      "learning_rate": 9.999338582677166e-06,
+      "loss": 0.5727,
+      "step": 424
+    },
+    {
+      "epoch": 0.13385826771653545,
+      "grad_norm": 69.15302276611328,
+      "learning_rate": 9.999337007874017e-06,
+      "loss": 0.3849,
+      "step": 425
+    },
+    {
+      "epoch": 0.1341732283464567,
+      "grad_norm": 38.20407485961914,
+      "learning_rate": 9.999335433070866e-06,
+      "loss": 0.334,
+      "step": 426
+    },
+    {
+      "epoch": 0.13448818897637796,
+      "grad_norm": 25.310161590576172,
+      "learning_rate": 9.999333858267717e-06,
+      "loss": 0.1519,
+      "step": 427
+    },
+    {
+      "epoch": 0.13480314960629922,
+      "grad_norm": 28.452911376953125,
+      "learning_rate": 9.999332283464568e-06,
+      "loss": 0.2238,
+      "step": 428
+    },
+    {
+      "epoch": 0.13511811023622047,
+      "grad_norm": 127.5716552734375,
+      "learning_rate": 9.999330708661418e-06,
+      "loss": 1.3774,
+      "step": 429
+    },
+    {
+      "epoch": 0.13543307086614173,
+      "grad_norm": 60.44778060913086,
+      "learning_rate": 9.999329133858269e-06,
+      "loss": 0.7446,
+      "step": 430
+    },
+    {
+      "epoch": 0.135748031496063,
+      "grad_norm": 54.13089370727539,
+      "learning_rate": 9.999327559055118e-06,
+      "loss": 0.7198,
+      "step": 431
+    },
+    {
+      "epoch": 0.13606299212598424,
+      "grad_norm": 44.978485107421875,
+      "learning_rate": 9.999325984251969e-06,
+      "loss": 0.457,
+      "step": 432
+    },
+    {
+      "epoch": 0.1363779527559055,
+      "grad_norm": 78.32373809814453,
+      "learning_rate": 9.99932440944882e-06,
+      "loss": 0.5859,
+      "step": 433
+    },
+    {
+      "epoch": 0.13669291338582676,
+      "grad_norm": 132.4387969970703,
+      "learning_rate": 9.999322834645671e-06,
+      "loss": 0.4658,
+      "step": 434
+    },
+    {
+      "epoch": 0.13700787401574804,
+      "grad_norm": 14.64592170715332,
+      "learning_rate": 9.99932125984252e-06,
+      "loss": 0.0912,
+      "step": 435
+    },
+    {
+      "epoch": 0.1373228346456693,
+      "grad_norm": 57.1812858581543,
+      "learning_rate": 9.999319685039371e-06,
+      "loss": 0.4572,
+      "step": 436
+    },
+    {
+      "epoch": 0.13763779527559056,
+      "grad_norm": 46.73292541503906,
+      "learning_rate": 9.99931811023622e-06,
+      "loss": 0.4464,
+      "step": 437
+    },
+    {
+      "epoch": 0.13795275590551181,
+      "grad_norm": 21.240659713745117,
+      "learning_rate": 9.999316535433072e-06,
+      "loss": 0.1034,
+      "step": 438
+    },
+    {
+      "epoch": 0.13826771653543307,
+      "grad_norm": 46.250614166259766,
+      "learning_rate": 9.999314960629922e-06,
+      "loss": 0.2897,
+      "step": 439
+    },
+    {
+      "epoch": 0.13858267716535433,
+      "grad_norm": 73.32185363769531,
+      "learning_rate": 9.999313385826772e-06,
+      "loss": 0.5788,
+      "step": 440
+    },
+    {
+      "epoch": 0.13858267716535433,
+      "eval_loss": 0.5959511399269104,
+      "eval_runtime": 304.5113,
+      "eval_samples_per_second": 0.384,
+      "eval_steps_per_second": 0.384,
+      "step": 440
+    },
+    {
+      "epoch": 0.13889763779527559,
+      "grad_norm": 7.073561191558838,
+      "learning_rate": 9.999311811023623e-06,
+      "loss": 0.0235,
+      "step": 441
+    },
+    {
+      "epoch": 0.13921259842519684,
+      "grad_norm": 130.6144561767578,
+      "learning_rate": 9.999310236220474e-06,
+      "loss": 1.088,
+      "step": 442
+    },
+    {
+      "epoch": 0.1395275590551181,
+      "grad_norm": 105.4767837524414,
+      "learning_rate": 9.999308661417325e-06,
+      "loss": 1.3411,
+      "step": 443
+    },
+    {
+      "epoch": 0.13984251968503936,
+      "grad_norm": 45.18183898925781,
+      "learning_rate": 9.999307086614174e-06,
+      "loss": 0.2972,
+      "step": 444
+    },
+    {
+      "epoch": 0.14015748031496064,
+      "grad_norm": 84.5200424194336,
+      "learning_rate": 9.999305511811025e-06,
+      "loss": 0.3241,
+      "step": 445
+    },
+    {
+      "epoch": 0.1404724409448819,
+      "grad_norm": 33.606468200683594,
+      "learning_rate": 9.999303937007874e-06,
+      "loss": 0.3225,
+      "step": 446
+    },
+    {
+      "epoch": 0.14078740157480316,
+      "grad_norm": 156.7371063232422,
+      "learning_rate": 9.999302362204725e-06,
+      "loss": 0.8385,
+      "step": 447
+    },
+    {
+      "epoch": 0.1411023622047244,
+      "grad_norm": 51.86471176147461,
+      "learning_rate": 9.999300787401576e-06,
+      "loss": 0.3886,
+      "step": 448
+    },
+    {
+      "epoch": 0.14141732283464567,
+      "grad_norm": 98.36966705322266,
+      "learning_rate": 9.999299212598426e-06,
+      "loss": 0.3667,
+      "step": 449
+    },
+    {
+      "epoch": 0.14173228346456693,
+      "grad_norm": 163.55325317382812,
+      "learning_rate": 9.999297637795277e-06,
+      "loss": 0.9837,
+      "step": 450
+    },
+    {
+      "epoch": 0.14204724409448818,
+      "grad_norm": 52.89970779418945,
+      "learning_rate": 9.999296062992126e-06,
+      "loss": 0.3358,
+      "step": 451
+    },
+    {
+      "epoch": 0.14236220472440944,
+      "grad_norm": 71.64704132080078,
+      "learning_rate": 9.999294488188977e-06,
+      "loss": 0.7998,
+      "step": 452
+    },
+    {
+      "epoch": 0.1426771653543307,
+      "grad_norm": 19.4017276763916,
+      "learning_rate": 9.999292913385828e-06,
+      "loss": 0.0742,
+      "step": 453
+    },
+    {
+      "epoch": 0.14299212598425196,
+      "grad_norm": 89.31649017333984,
+      "learning_rate": 9.999291338582679e-06,
+      "loss": 0.557,
+      "step": 454
+    },
+    {
+      "epoch": 0.14330708661417324,
+      "grad_norm": 47.06895446777344,
+      "learning_rate": 9.999289763779528e-06,
+      "loss": 0.5652,
+      "step": 455
+    },
+    {
+      "epoch": 0.1436220472440945,
+      "grad_norm": 60.04147720336914,
+      "learning_rate": 9.99928818897638e-06,
+      "loss": 0.4523,
+      "step": 456
+    },
+    {
+      "epoch": 0.14393700787401575,
+      "grad_norm": 33.64701843261719,
+      "learning_rate": 9.999286614173228e-06,
+      "loss": 0.3136,
+      "step": 457
+    },
+    {
+      "epoch": 0.144251968503937,
+      "grad_norm": 24.181163787841797,
+      "learning_rate": 9.99928503937008e-06,
+      "loss": 0.1441,
+      "step": 458
+    },
+    {
+      "epoch": 0.14456692913385827,
+      "grad_norm": 121.68936157226562,
+      "learning_rate": 9.99928346456693e-06,
+      "loss": 0.9653,
+      "step": 459
+    },
+    {
+      "epoch": 0.14488188976377953,
+      "grad_norm": 48.23858642578125,
+      "learning_rate": 9.99928188976378e-06,
+      "loss": 0.4576,
+      "step": 460
+    },
+    {
+      "epoch": 0.14488188976377953,
+      "eval_loss": 0.6207642555236816,
+      "eval_runtime": 295.7863,
+      "eval_samples_per_second": 0.396,
+      "eval_steps_per_second": 0.396,
+      "step": 460
+    },
+    {
+      "epoch": 0.14519685039370078,
+      "grad_norm": 59.632354736328125,
+      "learning_rate": 9.99928031496063e-06,
+      "loss": 0.4868,
+      "step": 461
+    },
+    {
+      "epoch": 0.14551181102362204,
+      "grad_norm": 103.69953918457031,
+      "learning_rate": 9.999278740157482e-06,
+      "loss": 0.5991,
+      "step": 462
+    },
+    {
+      "epoch": 0.1458267716535433,
+      "grad_norm": 36.89337158203125,
+      "learning_rate": 9.999277165354333e-06,
+      "loss": 0.2331,
+      "step": 463
+    },
+    {
+      "epoch": 0.14614173228346455,
+      "grad_norm": 42.319114685058594,
+      "learning_rate": 9.999275590551182e-06,
+      "loss": 0.3162,
+      "step": 464
+    },
+    {
+      "epoch": 0.14645669291338584,
+      "grad_norm": 36.906063079833984,
+      "learning_rate": 9.999274015748033e-06,
+      "loss": 0.3396,
+      "step": 465
+    },
+    {
+      "epoch": 0.1467716535433071,
+      "grad_norm": 25.45098304748535,
+      "learning_rate": 9.999272440944882e-06,
+      "loss": 0.1806,
+      "step": 466
+    },
+    {
+      "epoch": 0.14708661417322835,
+      "grad_norm": 56.87234878540039,
+      "learning_rate": 9.999270866141733e-06,
+      "loss": 0.2911,
+      "step": 467
+    },
+    {
+      "epoch": 0.1474015748031496,
+      "grad_norm": 20.72125244140625,
+      "learning_rate": 9.999269291338584e-06,
+      "loss": 0.0734,
+      "step": 468
+    },
+    {
+      "epoch": 0.14771653543307087,
+      "grad_norm": 69.94297790527344,
+      "learning_rate": 9.999267716535434e-06,
+      "loss": 0.3137,
+      "step": 469
+    },
+    {
+      "epoch": 0.14803149606299212,
+      "grad_norm": 55.783897399902344,
+      "learning_rate": 9.999266141732285e-06,
+      "loss": 0.3248,
+      "step": 470
+    },
+    {
+      "epoch": 0.14834645669291338,
+      "grad_norm": 119.89505004882812,
+      "learning_rate": 9.999264566929134e-06,
+      "loss": 0.7729,
+      "step": 471
+    },
+    {
+      "epoch": 0.14866141732283464,
+      "grad_norm": 45.10006332397461,
+      "learning_rate": 9.999262992125985e-06,
+      "loss": 0.287,
+      "step": 472
+    },
+    {
+      "epoch": 0.1489763779527559,
+      "grad_norm": 146.70803833007812,
+      "learning_rate": 9.999261417322836e-06,
+      "loss": 1.0979,
+      "step": 473
+    },
+    {
+      "epoch": 0.14929133858267715,
+      "grad_norm": 156.35951232910156,
+      "learning_rate": 9.999259842519687e-06,
+      "loss": 0.8397,
+      "step": 474
+    },
+    {
+      "epoch": 0.14960629921259844,
+      "grad_norm": 99.72000122070312,
+      "learning_rate": 9.999258267716536e-06,
+      "loss": 0.3907,
+      "step": 475
+    },
+    {
+      "epoch": 0.1499212598425197,
+      "grad_norm": 119.9110336303711,
+      "learning_rate": 9.999256692913387e-06,
+      "loss": 0.8118,
+      "step": 476
+    },
+    {
+      "epoch": 0.15023622047244095,
+      "grad_norm": 160.22637939453125,
+      "learning_rate": 9.999255118110236e-06,
+      "loss": 0.6305,
+      "step": 477
+    },
+    {
+      "epoch": 0.1505511811023622,
+      "grad_norm": 53.594276428222656,
+      "learning_rate": 9.999253543307087e-06,
+      "loss": 0.2044,
+      "step": 478
+    },
+    {
+      "epoch": 0.15086614173228347,
+      "grad_norm": 76.94689178466797,
+      "learning_rate": 9.999251968503938e-06,
+      "loss": 1.0085,
+      "step": 479
+    },
+    {
+      "epoch": 0.15118110236220472,
+      "grad_norm": 133.71861267089844,
+      "learning_rate": 9.999250393700788e-06,
+      "loss": 0.715,
+      "step": 480
+    },
+    {
+      "epoch": 0.15118110236220472,
+      "eval_loss": 0.5550094246864319,
+      "eval_runtime": 306.9821,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 480
+    },
+    {
+      "epoch": 0.15149606299212598,
+      "grad_norm": 65.10472106933594,
+      "learning_rate": 9.999248818897639e-06,
+      "loss": 0.3845,
+      "step": 481
+    },
+    {
+      "epoch": 0.15181102362204724,
+      "grad_norm": 64.05907440185547,
+      "learning_rate": 9.999247244094488e-06,
+      "loss": 0.4831,
+      "step": 482
+    },
+    {
+      "epoch": 0.1521259842519685,
+      "grad_norm": 29.167266845703125,
+      "learning_rate": 9.999245669291339e-06,
+      "loss": 0.1163,
+      "step": 483
+    },
+    {
+      "epoch": 0.15244094488188975,
+      "grad_norm": 81.4149398803711,
+      "learning_rate": 9.99924409448819e-06,
+      "loss": 0.5063,
+      "step": 484
+    },
+    {
+      "epoch": 0.15275590551181104,
+      "grad_norm": 29.339014053344727,
+      "learning_rate": 9.999242519685041e-06,
+      "loss": 0.3148,
+      "step": 485
+    },
+    {
+      "epoch": 0.1530708661417323,
+      "grad_norm": 56.6673698425293,
+      "learning_rate": 9.99924094488189e-06,
+      "loss": 0.5286,
+      "step": 486
+    },
+    {
+      "epoch": 0.15338582677165355,
+      "grad_norm": 52.65031051635742,
+      "learning_rate": 9.999239370078741e-06,
+      "loss": 0.2759,
+      "step": 487
+    },
+    {
+      "epoch": 0.1537007874015748,
+      "grad_norm": 68.89445495605469,
+      "learning_rate": 9.999237795275592e-06,
+      "loss": 0.4206,
+      "step": 488
+    },
+    {
+      "epoch": 0.15401574803149606,
+      "grad_norm": 57.06834030151367,
+      "learning_rate": 9.999236220472442e-06,
+      "loss": 0.3788,
+      "step": 489
+    },
+    {
+      "epoch": 0.15433070866141732,
+      "grad_norm": 30.39971923828125,
+      "learning_rate": 9.999234645669293e-06,
+      "loss": 0.4785,
+      "step": 490
+    },
+    {
+      "epoch": 0.15464566929133858,
+      "grad_norm": 58.35342025756836,
+      "learning_rate": 9.999233070866142e-06,
+      "loss": 0.5897,
+      "step": 491
+    },
+    {
+      "epoch": 0.15496062992125983,
+      "grad_norm": 58.403533935546875,
+      "learning_rate": 9.999231496062993e-06,
+      "loss": 0.8812,
+      "step": 492
+    },
+    {
+      "epoch": 0.1552755905511811,
+      "grad_norm": 71.63230895996094,
+      "learning_rate": 9.999229921259844e-06,
+      "loss": 0.1902,
+      "step": 493
+    },
+    {
+      "epoch": 0.15559055118110235,
+      "grad_norm": 31.834192276000977,
+      "learning_rate": 9.999228346456695e-06,
+      "loss": 0.7004,
+      "step": 494
+    },
+    {
+      "epoch": 0.15590551181102363,
+      "grad_norm": 35.32748794555664,
+      "learning_rate": 9.999226771653544e-06,
+      "loss": 0.5292,
+      "step": 495
+    },
+    {
+      "epoch": 0.1562204724409449,
+      "grad_norm": 63.80234909057617,
+      "learning_rate": 9.999225196850395e-06,
+      "loss": 0.69,
+      "step": 496
+    },
+    {
+      "epoch": 0.15653543307086615,
+      "grad_norm": 47.65721893310547,
+      "learning_rate": 9.999223622047244e-06,
+      "loss": 0.4307,
+      "step": 497
+    },
+    {
+      "epoch": 0.1568503937007874,
+      "grad_norm": 23.857507705688477,
+      "learning_rate": 9.999222047244095e-06,
+      "loss": 0.4278,
+      "step": 498
+    },
+    {
+      "epoch": 0.15716535433070866,
+      "grad_norm": 36.79646301269531,
+      "learning_rate": 9.999220472440946e-06,
+      "loss": 0.3957,
+      "step": 499
+    },
+    {
+      "epoch": 0.15748031496062992,
+      "grad_norm": 37.079471588134766,
+      "learning_rate": 9.999218897637796e-06,
+      "loss": 0.5078,
+      "step": 500
+    },
+    {
+      "epoch": 0.15748031496062992,
+      "eval_loss": 0.5599373579025269,
+      "eval_runtime": 312.5913,
+      "eval_samples_per_second": 0.374,
+      "eval_steps_per_second": 0.374,
+      "step": 500
+    },
+    {
+      "epoch": 0.15779527559055118,
+      "grad_norm": 31.857145309448242,
+      "learning_rate": 9.999217322834647e-06,
+      "loss": 0.3308,
+      "step": 501
+    },
+    {
+      "epoch": 0.15811023622047243,
+      "grad_norm": 31.417692184448242,
+      "learning_rate": 9.999215748031496e-06,
+      "loss": 0.4725,
+      "step": 502
+    },
+    {
+      "epoch": 0.1584251968503937,
+      "grad_norm": 100.06877899169922,
+      "learning_rate": 9.999214173228347e-06,
+      "loss": 0.8381,
+      "step": 503
+    },
+    {
+      "epoch": 0.15874015748031495,
+      "grad_norm": 29.754446029663086,
+      "learning_rate": 9.999212598425198e-06,
+      "loss": 0.422,
+      "step": 504
+    },
+    {
+      "epoch": 0.15905511811023623,
+      "grad_norm": 36.77560043334961,
+      "learning_rate": 9.999211023622049e-06,
+      "loss": 0.2553,
+      "step": 505
+    },
+    {
+      "epoch": 0.1593700787401575,
+      "grad_norm": 45.064300537109375,
+      "learning_rate": 9.999209448818898e-06,
+      "loss": 0.4116,
+      "step": 506
+    },
+    {
+      "epoch": 0.15968503937007875,
+      "grad_norm": 82.82674407958984,
+      "learning_rate": 9.999207874015747e-06,
+      "loss": 1.244,
+      "step": 507
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 30.511226654052734,
+      "learning_rate": 9.999206299212598e-06,
+      "loss": 0.2208,
+      "step": 508
+    },
+    {
+      "epoch": 0.16031496062992126,
+      "grad_norm": 26.348651885986328,
+      "learning_rate": 9.99920472440945e-06,
+      "loss": 0.2132,
+      "step": 509
+    },
+    {
+      "epoch": 0.16062992125984252,
+      "grad_norm": 88.1176986694336,
+      "learning_rate": 9.9992031496063e-06,
+      "loss": 0.5883,
+      "step": 510
+    },
+    {
+      "epoch": 0.16094488188976377,
+      "grad_norm": 36.90149688720703,
+      "learning_rate": 9.99920157480315e-06,
+      "loss": 0.3298,
+      "step": 511
+    },
+    {
+      "epoch": 0.16125984251968503,
+      "grad_norm": 30.67510414123535,
+      "learning_rate": 9.9992e-06,
+      "loss": 0.2843,
+      "step": 512
+    },
+    {
+      "epoch": 0.1615748031496063,
+      "grad_norm": 89.83963012695312,
+      "learning_rate": 9.999198425196852e-06,
+      "loss": 0.4022,
+      "step": 513
+    },
+    {
+      "epoch": 0.16188976377952755,
+      "grad_norm": 46.62565612792969,
+      "learning_rate": 9.999196850393703e-06,
+      "loss": 0.491,
+      "step": 514
+    },
+    {
+      "epoch": 0.16220472440944883,
+      "grad_norm": 27.551525115966797,
+      "learning_rate": 9.999195275590552e-06,
+      "loss": 0.1321,
+      "step": 515
+    },
+    {
+      "epoch": 0.1625196850393701,
+      "grad_norm": 15.754891395568848,
+      "learning_rate": 9.999193700787403e-06,
+      "loss": 0.0748,
+      "step": 516
+    },
+    {
+      "epoch": 0.16283464566929134,
+      "grad_norm": 55.909244537353516,
+      "learning_rate": 9.999192125984252e-06,
+      "loss": 0.2661,
+      "step": 517
+    },
+    {
+      "epoch": 0.1631496062992126,
+      "grad_norm": 37.772239685058594,
+      "learning_rate": 9.999190551181103e-06,
+      "loss": 0.1733,
+      "step": 518
+    },
+    {
+      "epoch": 0.16346456692913386,
+      "grad_norm": 66.4226303100586,
+      "learning_rate": 9.999188976377954e-06,
+      "loss": 0.4846,
+      "step": 519
+    },
+    {
+      "epoch": 0.16377952755905512,
+      "grad_norm": 37.98040771484375,
+      "learning_rate": 9.999187401574804e-06,
+      "loss": 0.1224,
+      "step": 520
+    },
+    {
+      "epoch": 0.16377952755905512,
+      "eval_loss": 0.6723836064338684,
+      "eval_runtime": 307.1583,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 520
+    },
+    {
+      "epoch": 0.16409448818897637,
+      "grad_norm": 53.96063995361328,
+      "learning_rate": 9.999185826771655e-06,
+      "loss": 0.4602,
+      "step": 521
+    },
+    {
+      "epoch": 0.16440944881889763,
+      "grad_norm": 32.147621154785156,
+      "learning_rate": 9.999184251968504e-06,
+      "loss": 0.0799,
+      "step": 522
+    },
+    {
+      "epoch": 0.1647244094488189,
+      "grad_norm": 104.01305389404297,
+      "learning_rate": 9.999182677165355e-06,
+      "loss": 0.2182,
+      "step": 523
+    },
+    {
+      "epoch": 0.16503937007874014,
+      "grad_norm": 106.27403259277344,
+      "learning_rate": 9.999181102362206e-06,
+      "loss": 1.2292,
+      "step": 524
+    },
+    {
+      "epoch": 0.16535433070866143,
+      "grad_norm": 33.72996139526367,
+      "learning_rate": 9.999179527559057e-06,
+      "loss": 0.2182,
+      "step": 525
+    },
+    {
+      "epoch": 0.16566929133858269,
+      "grad_norm": 83.75643920898438,
+      "learning_rate": 9.999177952755906e-06,
+      "loss": 1.4189,
+      "step": 526
+    },
+    {
+      "epoch": 0.16598425196850394,
+      "grad_norm": 81.67135620117188,
+      "learning_rate": 9.999176377952755e-06,
+      "loss": 0.3019,
+      "step": 527
+    },
+    {
+      "epoch": 0.1662992125984252,
+      "grad_norm": 160.24029541015625,
+      "learning_rate": 9.999174803149606e-06,
+      "loss": 0.2741,
+      "step": 528
+    },
+    {
+      "epoch": 0.16661417322834646,
+      "grad_norm": 73.85230255126953,
+      "learning_rate": 9.999173228346457e-06,
+      "loss": 1.037,
+      "step": 529
+    },
+    {
+      "epoch": 0.16692913385826771,
+      "grad_norm": 87.96981811523438,
+      "learning_rate": 9.999171653543308e-06,
+      "loss": 0.3799,
+      "step": 530
+    },
+    {
+      "epoch": 0.16724409448818897,
+      "grad_norm": 62.46072769165039,
+      "learning_rate": 9.999170078740158e-06,
+      "loss": 0.5531,
+      "step": 531
+    },
+    {
+      "epoch": 0.16755905511811023,
+      "grad_norm": 151.0035858154297,
+      "learning_rate": 9.999168503937009e-06,
+      "loss": 1.4022,
+      "step": 532
+    },
+    {
+      "epoch": 0.16787401574803149,
+      "grad_norm": 60.152671813964844,
+      "learning_rate": 9.99916692913386e-06,
+      "loss": 0.6236,
+      "step": 533
+    },
+    {
+      "epoch": 0.16818897637795274,
+      "grad_norm": 102.74165344238281,
+      "learning_rate": 9.99916535433071e-06,
+      "loss": 1.2304,
+      "step": 534
+    },
+    {
+      "epoch": 0.16850393700787403,
+      "grad_norm": 20.071189880371094,
+      "learning_rate": 9.99916377952756e-06,
+      "loss": 0.1347,
+      "step": 535
+    },
+    {
+      "epoch": 0.16881889763779528,
+      "grad_norm": 43.08108139038086,
+      "learning_rate": 9.999162204724411e-06,
+      "loss": 0.4371,
+      "step": 536
+    },
+    {
+      "epoch": 0.16913385826771654,
+      "grad_norm": 64.68496704101562,
+      "learning_rate": 9.99916062992126e-06,
+      "loss": 0.6083,
+      "step": 537
+    },
+    {
+      "epoch": 0.1694488188976378,
+      "grad_norm": 39.30328369140625,
+      "learning_rate": 9.999159055118111e-06,
+      "loss": 0.5559,
+      "step": 538
+    },
+    {
+      "epoch": 0.16976377952755906,
+      "grad_norm": 56.48727035522461,
+      "learning_rate": 9.999157480314962e-06,
+      "loss": 0.7363,
+      "step": 539
+    },
+    {
+      "epoch": 0.1700787401574803,
+      "grad_norm": 26.32372283935547,
+      "learning_rate": 9.999155905511812e-06,
+      "loss": 0.3275,
+      "step": 540
+    },
+    {
+      "epoch": 0.1700787401574803,
+      "eval_loss": 0.6582661867141724,
+      "eval_runtime": 302.7589,
+      "eval_samples_per_second": 0.386,
+      "eval_steps_per_second": 0.386,
+      "step": 540
+    },
+    {
+      "epoch": 0.17039370078740157,
+      "grad_norm": 69.46321868896484,
+      "learning_rate": 9.999154330708663e-06,
+      "loss": 0.3794,
+      "step": 541
+    },
+    {
+      "epoch": 0.17070866141732283,
+      "grad_norm": 36.254520416259766,
+      "learning_rate": 9.999152755905512e-06,
+      "loss": 0.6954,
+      "step": 542
+    },
+    {
+      "epoch": 0.17102362204724408,
+      "grad_norm": 55.22049331665039,
+      "learning_rate": 9.999151181102363e-06,
+      "loss": 0.4438,
+      "step": 543
+    },
+    {
+      "epoch": 0.17133858267716534,
+      "grad_norm": 24.40268325805664,
+      "learning_rate": 9.999149606299214e-06,
+      "loss": 0.3439,
+      "step": 544
+    },
+    {
+      "epoch": 0.17165354330708663,
+      "grad_norm": 29.554643630981445,
+      "learning_rate": 9.999148031496065e-06,
+      "loss": 0.4619,
+      "step": 545
+    },
+    {
+      "epoch": 0.17196850393700788,
+      "grad_norm": 34.69717788696289,
+      "learning_rate": 9.999146456692914e-06,
+      "loss": 0.3959,
+      "step": 546
+    },
+    {
+      "epoch": 0.17228346456692914,
+      "grad_norm": 49.8066520690918,
+      "learning_rate": 9.999144881889763e-06,
+      "loss": 0.6021,
+      "step": 547
+    },
+    {
+      "epoch": 0.1725984251968504,
+      "grad_norm": 29.715484619140625,
+      "learning_rate": 9.999143307086614e-06,
+      "loss": 0.5588,
+      "step": 548
+    },
+    {
+      "epoch": 0.17291338582677165,
+      "grad_norm": 24.99655532836914,
+      "learning_rate": 9.999141732283465e-06,
+      "loss": 0.5414,
+      "step": 549
+    },
+    {
+      "epoch": 0.1732283464566929,
+      "grad_norm": 21.271596908569336,
+      "learning_rate": 9.999140157480316e-06,
+      "loss": 0.4187,
+      "step": 550
+    },
+    {
+      "epoch": 0.17354330708661417,
+      "grad_norm": 51.398712158203125,
+      "learning_rate": 9.999138582677166e-06,
+      "loss": 0.6258,
+      "step": 551
+    },
+    {
+      "epoch": 0.17385826771653543,
+      "grad_norm": 51.94355010986328,
+      "learning_rate": 9.999137007874017e-06,
+      "loss": 0.3318,
+      "step": 552
+    },
+    {
+      "epoch": 0.17417322834645668,
+      "grad_norm": 30.253637313842773,
+      "learning_rate": 9.999135433070866e-06,
+      "loss": 0.2938,
+      "step": 553
+    },
+    {
+      "epoch": 0.17448818897637794,
+      "grad_norm": 44.27308654785156,
+      "learning_rate": 9.999133858267717e-06,
+      "loss": 0.5734,
+      "step": 554
+    },
+    {
+      "epoch": 0.17480314960629922,
+      "grad_norm": 33.18519973754883,
+      "learning_rate": 9.999132283464568e-06,
+      "loss": 0.566,
+      "step": 555
+    },
+    {
+      "epoch": 0.17511811023622048,
+      "grad_norm": 48.19463348388672,
+      "learning_rate": 9.999130708661419e-06,
+      "loss": 0.6241,
+      "step": 556
+    },
+    {
+      "epoch": 0.17543307086614174,
+      "grad_norm": 34.98441696166992,
+      "learning_rate": 9.999129133858268e-06,
+      "loss": 0.4501,
+      "step": 557
+    },
+    {
+      "epoch": 0.175748031496063,
+      "grad_norm": 30.51637077331543,
+      "learning_rate": 9.99912755905512e-06,
+      "loss": 0.2408,
+      "step": 558
+    },
+    {
+      "epoch": 0.17606299212598425,
+      "grad_norm": 65.26117706298828,
+      "learning_rate": 9.99912598425197e-06,
+      "loss": 0.6767,
+      "step": 559
+    },
+    {
+      "epoch": 0.1763779527559055,
+      "grad_norm": 37.751888275146484,
+      "learning_rate": 9.99912440944882e-06,
+      "loss": 0.3929,
+      "step": 560
+    },
+    {
+      "epoch": 0.1763779527559055,
+      "eval_loss": 0.5896673798561096,
+      "eval_runtime": 296.0217,
+      "eval_samples_per_second": 0.395,
+      "eval_steps_per_second": 0.395,
+      "step": 560
+    },
+    {
+      "epoch": 0.17669291338582677,
+      "grad_norm": 47.340492248535156,
+      "learning_rate": 9.99912283464567e-06,
+      "loss": 0.3066,
+      "step": 561
+    },
+    {
+      "epoch": 0.17700787401574802,
+      "grad_norm": 86.48099517822266,
+      "learning_rate": 9.99912125984252e-06,
+      "loss": 0.4933,
+      "step": 562
+    },
+    {
+      "epoch": 0.17732283464566928,
+      "grad_norm": 33.50886917114258,
+      "learning_rate": 9.99911968503937e-06,
+      "loss": 0.1638,
+      "step": 563
+    },
+    {
+      "epoch": 0.17763779527559054,
+      "grad_norm": 48.44023895263672,
+      "learning_rate": 9.999118110236222e-06,
+      "loss": 0.4082,
+      "step": 564
+    },
+    {
+      "epoch": 0.17795275590551182,
+      "grad_norm": 54.50333786010742,
+      "learning_rate": 9.999116535433073e-06,
+      "loss": 0.3993,
+      "step": 565
+    },
+    {
+      "epoch": 0.17826771653543308,
+      "grad_norm": 82.37716674804688,
+      "learning_rate": 9.999114960629922e-06,
+      "loss": 0.9308,
+      "step": 566
+    },
+    {
+      "epoch": 0.17858267716535434,
+      "grad_norm": 53.5648307800293,
+      "learning_rate": 9.999113385826771e-06,
+      "loss": 0.757,
+      "step": 567
+    },
+    {
+      "epoch": 0.1788976377952756,
+      "grad_norm": 51.68220901489258,
+      "learning_rate": 9.999111811023622e-06,
+      "loss": 0.408,
+      "step": 568
+    },
+    {
+      "epoch": 0.17921259842519685,
+      "grad_norm": 32.468666076660156,
+      "learning_rate": 9.999110236220473e-06,
+      "loss": 0.1988,
+      "step": 569
+    },
+    {
+      "epoch": 0.1795275590551181,
+      "grad_norm": 92.7658462524414,
+      "learning_rate": 9.999108661417324e-06,
+      "loss": 0.737,
+      "step": 570
+    },
+    {
+      "epoch": 0.17984251968503936,
+      "grad_norm": 94.98796844482422,
+      "learning_rate": 9.999107086614174e-06,
+      "loss": 0.2773,
+      "step": 571
+    },
+    {
+      "epoch": 0.18015748031496062,
+      "grad_norm": 45.95973587036133,
+      "learning_rate": 9.999105511811025e-06,
+      "loss": 0.417,
+      "step": 572
+    },
+    {
+      "epoch": 0.18047244094488188,
+      "grad_norm": 42.59912872314453,
+      "learning_rate": 9.999103937007874e-06,
+      "loss": 0.437,
+      "step": 573
+    },
+    {
+      "epoch": 0.18078740157480314,
+      "grad_norm": 73.75167083740234,
+      "learning_rate": 9.999102362204725e-06,
+      "loss": 0.2676,
+      "step": 574
+    },
+    {
+      "epoch": 0.18110236220472442,
+      "grad_norm": 46.682533264160156,
+      "learning_rate": 9.999100787401576e-06,
+      "loss": 0.1902,
+      "step": 575
+    },
+    {
+      "epoch": 0.18141732283464568,
+      "grad_norm": 35.30620574951172,
+      "learning_rate": 9.999099212598427e-06,
+      "loss": 0.2953,
+      "step": 576
+    },
+    {
+      "epoch": 0.18173228346456693,
+      "grad_norm": 113.94246673583984,
+      "learning_rate": 9.999097637795276e-06,
+      "loss": 0.7046,
+      "step": 577
+    },
+    {
+      "epoch": 0.1820472440944882,
+      "grad_norm": 207.06141662597656,
+      "learning_rate": 9.999096062992125e-06,
+      "loss": 0.5952,
+      "step": 578
+    },
+    {
+      "epoch": 0.18236220472440945,
+      "grad_norm": 34.89611053466797,
+      "learning_rate": 9.999094488188978e-06,
+      "loss": 0.2947,
+      "step": 579
+    },
+    {
+      "epoch": 0.1826771653543307,
+      "grad_norm": 108.61929321289062,
+      "learning_rate": 9.999092913385827e-06,
+      "loss": 0.2417,
+      "step": 580
+    },
+    {
+      "epoch": 0.1826771653543307,
+      "eval_loss": 0.5409280061721802,
+      "eval_runtime": 307.2823,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 580
+    },
+    {
+      "epoch": 0.18299212598425196,
+      "grad_norm": 15.958565711975098,
+      "learning_rate": 9.999091338582678e-06,
+      "loss": 0.1013,
+      "step": 581
+    },
+    {
+      "epoch": 0.18330708661417322,
+      "grad_norm": 68.7283706665039,
+      "learning_rate": 9.999089763779528e-06,
+      "loss": 0.8978,
+      "step": 582
+    },
+    {
+      "epoch": 0.18362204724409448,
+      "grad_norm": 80.29981994628906,
+      "learning_rate": 9.999088188976379e-06,
+      "loss": 0.5132,
+      "step": 583
+    },
+    {
+      "epoch": 0.18393700787401573,
+      "grad_norm": 25.062374114990234,
+      "learning_rate": 9.99908661417323e-06,
+      "loss": 0.1291,
+      "step": 584
+    },
+    {
+      "epoch": 0.18425196850393702,
+      "grad_norm": 58.20054244995117,
+      "learning_rate": 9.99908503937008e-06,
+      "loss": 0.7141,
+      "step": 585
+    },
+    {
+      "epoch": 0.18456692913385828,
+      "grad_norm": 73.24835968017578,
+      "learning_rate": 9.99908346456693e-06,
+      "loss": 0.5306,
+      "step": 586
+    },
+    {
+      "epoch": 0.18488188976377953,
+      "grad_norm": 145.0093994140625,
+      "learning_rate": 9.99908188976378e-06,
+      "loss": 0.6498,
+      "step": 587
+    },
+    {
+      "epoch": 0.1851968503937008,
+      "grad_norm": 62.22865295410156,
+      "learning_rate": 9.99908031496063e-06,
+      "loss": 0.6765,
+      "step": 588
+    },
+    {
+      "epoch": 0.18551181102362205,
+      "grad_norm": 11.53496265411377,
+      "learning_rate": 9.999078740157481e-06,
+      "loss": 0.0789,
+      "step": 589
+    },
+    {
+      "epoch": 0.1858267716535433,
+      "grad_norm": 104.29961395263672,
+      "learning_rate": 9.999077165354332e-06,
+      "loss": 0.444,
+      "step": 590
+    },
+    {
+      "epoch": 0.18614173228346456,
+      "grad_norm": 26.030893325805664,
+      "learning_rate": 9.999075590551182e-06,
+      "loss": 0.0719,
+      "step": 591
+    },
+    {
+      "epoch": 0.18645669291338582,
+      "grad_norm": 59.57289123535156,
+      "learning_rate": 9.999074015748033e-06,
+      "loss": 0.4676,
+      "step": 592
+    },
+    {
+      "epoch": 0.18677165354330708,
+      "grad_norm": 47.10686111450195,
+      "learning_rate": 9.999072440944882e-06,
+      "loss": 0.6408,
+      "step": 593
+    },
+    {
+      "epoch": 0.18708661417322833,
+      "grad_norm": 97.91781616210938,
+      "learning_rate": 9.999070866141733e-06,
+      "loss": 0.8239,
+      "step": 594
+    },
+    {
+      "epoch": 0.18740157480314962,
+      "grad_norm": 82.05168151855469,
+      "learning_rate": 9.999069291338584e-06,
+      "loss": 0.2311,
+      "step": 595
+    },
+    {
+      "epoch": 0.18771653543307087,
+      "grad_norm": 73.30006408691406,
+      "learning_rate": 9.999067716535435e-06,
+      "loss": 0.6648,
+      "step": 596
+    },
+    {
+      "epoch": 0.18803149606299213,
+      "grad_norm": 36.88441467285156,
+      "learning_rate": 9.999066141732284e-06,
+      "loss": 0.411,
+      "step": 597
+    },
+    {
+      "epoch": 0.1883464566929134,
+      "grad_norm": 21.77279281616211,
+      "learning_rate": 9.999064566929133e-06,
+      "loss": 0.0762,
+      "step": 598
+    },
+    {
+      "epoch": 0.18866141732283465,
+      "grad_norm": 41.469337463378906,
+      "learning_rate": 9.999062992125984e-06,
+      "loss": 0.2768,
+      "step": 599
+    },
+    {
+      "epoch": 0.1889763779527559,
+      "grad_norm": 23.42574119567871,
+      "learning_rate": 9.999061417322835e-06,
+      "loss": 0.3597,
+      "step": 600
+    },
+    {
+      "epoch": 0.1889763779527559,
+      "eval_loss": 0.5386444330215454,
+      "eval_runtime": 309.0194,
+      "eval_samples_per_second": 0.379,
+      "eval_steps_per_second": 0.379,
+      "step": 600
+    },
+    {
+      "epoch": 0.18929133858267716,
+      "grad_norm": 49.9908332824707,
+      "learning_rate": 9.999059842519686e-06,
+      "loss": 0.3259,
+      "step": 601
+    },
+    {
+      "epoch": 0.18960629921259842,
+      "grad_norm": 111.2554702758789,
+      "learning_rate": 9.999058267716536e-06,
+      "loss": 0.451,
+      "step": 602
+    },
+    {
+      "epoch": 0.18992125984251967,
+      "grad_norm": 87.17556762695312,
+      "learning_rate": 9.999056692913387e-06,
+      "loss": 0.2611,
+      "step": 603
+    },
+    {
+      "epoch": 0.19023622047244093,
+      "grad_norm": 24.059478759765625,
+      "learning_rate": 9.999055118110238e-06,
+      "loss": 0.3407,
+      "step": 604
+    },
+    {
+      "epoch": 0.19055118110236222,
+      "grad_norm": 121.00428009033203,
+      "learning_rate": 9.999053543307089e-06,
+      "loss": 1.3334,
+      "step": 605
+    },
+    {
+      "epoch": 0.19086614173228347,
+      "grad_norm": 26.074909210205078,
+      "learning_rate": 9.999051968503938e-06,
+      "loss": 0.2243,
+      "step": 606
+    },
+    {
+      "epoch": 0.19118110236220473,
+      "grad_norm": 60.16206359863281,
+      "learning_rate": 9.999050393700787e-06,
+      "loss": 0.1694,
+      "step": 607
+    },
+    {
+      "epoch": 0.191496062992126,
+      "grad_norm": 94.56045532226562,
+      "learning_rate": 9.999048818897638e-06,
+      "loss": 0.453,
+      "step": 608
+    },
+    {
+      "epoch": 0.19181102362204724,
+      "grad_norm": 106.34618377685547,
+      "learning_rate": 9.99904724409449e-06,
+      "loss": 0.5918,
+      "step": 609
+    },
+    {
+      "epoch": 0.1921259842519685,
+      "grad_norm": 33.97660827636719,
+      "learning_rate": 9.99904566929134e-06,
+      "loss": 0.6824,
+      "step": 610
+    },
+    {
+      "epoch": 0.19244094488188976,
+      "grad_norm": 25.390705108642578,
+      "learning_rate": 9.99904409448819e-06,
+      "loss": 0.4475,
+      "step": 611
+    },
+    {
+      "epoch": 0.19275590551181102,
+      "grad_norm": 77.29833984375,
+      "learning_rate": 9.99904251968504e-06,
+      "loss": 0.5393,
+      "step": 612
+    },
+    {
+      "epoch": 0.19307086614173227,
+      "grad_norm": 56.867801666259766,
+      "learning_rate": 9.99904094488189e-06,
+      "loss": 0.4419,
+      "step": 613
+    },
+    {
+      "epoch": 0.19338582677165353,
+      "grad_norm": 62.77841567993164,
+      "learning_rate": 9.99903937007874e-06,
+      "loss": 0.7734,
+      "step": 614
+    },
+    {
+      "epoch": 0.19370078740157481,
+      "grad_norm": 76.52714538574219,
+      "learning_rate": 9.999037795275592e-06,
+      "loss": 0.7938,
+      "step": 615
+    },
+    {
+      "epoch": 0.19401574803149607,
+      "grad_norm": 26.540964126586914,
+      "learning_rate": 9.999036220472443e-06,
+      "loss": 0.2187,
+      "step": 616
+    },
+    {
+      "epoch": 0.19433070866141733,
+      "grad_norm": 20.736865997314453,
+      "learning_rate": 9.999034645669292e-06,
+      "loss": 0.1165,
+      "step": 617
+    },
+    {
+      "epoch": 0.19464566929133859,
+      "grad_norm": 86.58599853515625,
+      "learning_rate": 9.999033070866141e-06,
+      "loss": 0.4113,
+      "step": 618
+    },
+    {
+      "epoch": 0.19496062992125984,
+      "grad_norm": 17.32464599609375,
+      "learning_rate": 9.999031496062992e-06,
+      "loss": 0.2206,
+      "step": 619
+    },
+    {
+      "epoch": 0.1952755905511811,
+      "grad_norm": 54.347904205322266,
+      "learning_rate": 9.999029921259843e-06,
+      "loss": 0.6016,
+      "step": 620
+    },
+    {
+      "epoch": 0.1952755905511811,
+      "eval_loss": 0.5922896265983582,
+      "eval_runtime": 309.8531,
+      "eval_samples_per_second": 0.378,
+      "eval_steps_per_second": 0.378,
+      "step": 620
+    },
+    {
+      "epoch": 0.19559055118110236,
+      "grad_norm": 62.3914794921875,
+      "learning_rate": 9.999028346456694e-06,
+      "loss": 0.5295,
+      "step": 621
+    },
+    {
+      "epoch": 0.1959055118110236,
+      "grad_norm": 29.178783416748047,
+      "learning_rate": 9.999026771653544e-06,
+      "loss": 0.2871,
+      "step": 622
+    },
+    {
+      "epoch": 0.19622047244094487,
+      "grad_norm": 28.286312103271484,
+      "learning_rate": 9.999025196850395e-06,
+      "loss": 0.3118,
+      "step": 623
+    },
+    {
+      "epoch": 0.19653543307086613,
+      "grad_norm": 25.735105514526367,
+      "learning_rate": 9.999023622047244e-06,
+      "loss": 0.2462,
+      "step": 624
+    },
+    {
+      "epoch": 0.1968503937007874,
+      "grad_norm": 173.3253936767578,
+      "learning_rate": 9.999022047244095e-06,
+      "loss": 0.9632,
+      "step": 625
+    },
+    {
+      "epoch": 0.19716535433070867,
+      "grad_norm": 85.50897979736328,
+      "learning_rate": 9.999020472440946e-06,
+      "loss": 0.643,
+      "step": 626
+    },
+    {
+      "epoch": 0.19748031496062993,
+      "grad_norm": 29.653247833251953,
+      "learning_rate": 9.999018897637795e-06,
+      "loss": 0.4286,
+      "step": 627
+    },
+    {
+      "epoch": 0.19779527559055118,
+      "grad_norm": 37.991817474365234,
+      "learning_rate": 9.999017322834646e-06,
+      "loss": 0.3509,
+      "step": 628
+    },
+    {
+      "epoch": 0.19811023622047244,
+      "grad_norm": 97.16632843017578,
+      "learning_rate": 9.999015748031497e-06,
+      "loss": 0.3752,
+      "step": 629
+    },
+    {
+      "epoch": 0.1984251968503937,
+      "grad_norm": 50.35764694213867,
+      "learning_rate": 9.999014173228348e-06,
+      "loss": 0.5741,
+      "step": 630
+    },
+    {
+      "epoch": 0.19874015748031496,
+      "grad_norm": 28.755027770996094,
+      "learning_rate": 9.999012598425197e-06,
+      "loss": 0.278,
+      "step": 631
+    },
+    {
+      "epoch": 0.1990551181102362,
+      "grad_norm": 74.70952606201172,
+      "learning_rate": 9.999011023622048e-06,
+      "loss": 0.6146,
+      "step": 632
+    },
+    {
+      "epoch": 0.19937007874015747,
+      "grad_norm": 78.7210693359375,
+      "learning_rate": 9.999009448818898e-06,
+      "loss": 0.2827,
+      "step": 633
+    },
+    {
+      "epoch": 0.19968503937007873,
+      "grad_norm": 17.010251998901367,
+      "learning_rate": 9.999007874015749e-06,
+      "loss": 0.239,
+      "step": 634
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 81.048095703125,
+      "learning_rate": 9.9990062992126e-06,
+      "loss": 1.0794,
+      "step": 635
+    },
+    {
+      "epoch": 0.20031496062992127,
+      "grad_norm": 30.376026153564453,
+      "learning_rate": 9.99900472440945e-06,
+      "loss": 0.2238,
+      "step": 636
+    },
+    {
+      "epoch": 0.20062992125984253,
+      "grad_norm": 45.60966873168945,
+      "learning_rate": 9.9990031496063e-06,
+      "loss": 0.7123,
+      "step": 637
+    },
+    {
+      "epoch": 0.20094488188976378,
+      "grad_norm": 47.69063186645508,
+      "learning_rate": 9.99900157480315e-06,
+      "loss": 0.5227,
+      "step": 638
+    },
+    {
+      "epoch": 0.20125984251968504,
+      "grad_norm": 25.39598274230957,
+      "learning_rate": 9.999e-06,
+      "loss": 0.0982,
+      "step": 639
+    },
+    {
+      "epoch": 0.2015748031496063,
+      "grad_norm": 36.95790481567383,
+      "learning_rate": 9.998998425196851e-06,
+      "loss": 0.4809,
+      "step": 640
+    },
+    {
+      "epoch": 0.2015748031496063,
+      "eval_loss": 0.5429102182388306,
+      "eval_runtime": 295.5911,
+      "eval_samples_per_second": 0.396,
+      "eval_steps_per_second": 0.396,
+      "step": 640
+    },
+    {
+      "epoch": 0.20188976377952755,
+      "grad_norm": 47.62435531616211,
+      "learning_rate": 9.998996850393702e-06,
+      "loss": 0.6058,
+      "step": 641
+    },
+    {
+      "epoch": 0.2022047244094488,
+      "grad_norm": 31.27633285522461,
+      "learning_rate": 9.998995275590552e-06,
+      "loss": 0.2615,
+      "step": 642
+    },
+    {
+      "epoch": 0.20251968503937007,
+      "grad_norm": 47.4228630065918,
+      "learning_rate": 9.998993700787403e-06,
+      "loss": 0.6306,
+      "step": 643
+    },
+    {
+      "epoch": 0.20283464566929135,
+      "grad_norm": 67.43081665039062,
+      "learning_rate": 9.998992125984252e-06,
+      "loss": 0.654,
+      "step": 644
+    },
+    {
+      "epoch": 0.2031496062992126,
+      "grad_norm": 57.79238510131836,
+      "learning_rate": 9.998990551181103e-06,
+      "loss": 0.3423,
+      "step": 645
+    },
+    {
+      "epoch": 0.20346456692913387,
+      "grad_norm": 38.009735107421875,
+      "learning_rate": 9.998988976377954e-06,
+      "loss": 0.5406,
+      "step": 646
+    },
+    {
+      "epoch": 0.20377952755905512,
+      "grad_norm": 36.380531311035156,
+      "learning_rate": 9.998987401574805e-06,
+      "loss": 0.3381,
+      "step": 647
+    },
+    {
+      "epoch": 0.20409448818897638,
+      "grad_norm": 33.59734344482422,
+      "learning_rate": 9.998985826771654e-06,
+      "loss": 0.2425,
+      "step": 648
+    },
+    {
+      "epoch": 0.20440944881889764,
+      "grad_norm": 40.288902282714844,
+      "learning_rate": 9.998984251968505e-06,
+      "loss": 0.4959,
+      "step": 649
+    },
+    {
+      "epoch": 0.2047244094488189,
+      "grad_norm": 51.55220413208008,
+      "learning_rate": 9.998982677165356e-06,
+      "loss": 0.4607,
+      "step": 650
+    },
+    {
+      "epoch": 0.20503937007874015,
+      "grad_norm": 51.30996322631836,
+      "learning_rate": 9.998981102362205e-06,
+      "loss": 0.4445,
+      "step": 651
+    },
+    {
+      "epoch": 0.2053543307086614,
+      "grad_norm": 139.98907470703125,
+      "learning_rate": 9.998979527559056e-06,
+      "loss": 1.0222,
+      "step": 652
+    },
+    {
+      "epoch": 0.20566929133858267,
+      "grad_norm": 42.455169677734375,
+      "learning_rate": 9.998977952755906e-06,
+      "loss": 0.3876,
+      "step": 653
+    },
+    {
+      "epoch": 0.20598425196850395,
+      "grad_norm": 22.130889892578125,
+      "learning_rate": 9.998976377952757e-06,
+      "loss": 0.3178,
+      "step": 654
+    },
+    {
+      "epoch": 0.2062992125984252,
+      "grad_norm": 27.28899574279785,
+      "learning_rate": 9.998974803149608e-06,
+      "loss": 0.236,
+      "step": 655
+    },
+    {
+      "epoch": 0.20661417322834646,
+      "grad_norm": 19.113969802856445,
+      "learning_rate": 9.998973228346459e-06,
+      "loss": 0.2199,
+      "step": 656
+    },
+    {
+      "epoch": 0.20692913385826772,
+      "grad_norm": 70.42593383789062,
+      "learning_rate": 9.998971653543308e-06,
+      "loss": 0.5523,
+      "step": 657
+    },
+    {
+      "epoch": 0.20724409448818898,
+      "grad_norm": 51.796207427978516,
+      "learning_rate": 9.998970078740157e-06,
+      "loss": 0.5754,
+      "step": 658
+    },
+    {
+      "epoch": 0.20755905511811024,
+      "grad_norm": 95.93733215332031,
+      "learning_rate": 9.998968503937008e-06,
+      "loss": 0.7566,
+      "step": 659
+    },
+    {
+      "epoch": 0.2078740157480315,
+      "grad_norm": 27.715049743652344,
+      "learning_rate": 9.99896692913386e-06,
+      "loss": 0.2745,
+      "step": 660
+    },
+    {
+      "epoch": 0.2078740157480315,
+      "eval_loss": 0.4993188977241516,
+      "eval_runtime": 309.1118,
+      "eval_samples_per_second": 0.379,
+      "eval_steps_per_second": 0.379,
+      "step": 660
+    },
+    {
+      "epoch": 0.20818897637795275,
+      "grad_norm": 43.9770393371582,
+      "learning_rate": 9.99896535433071e-06,
+      "loss": 0.5356,
+      "step": 661
+    },
+    {
+      "epoch": 0.208503937007874,
+      "grad_norm": 28.040441513061523,
+      "learning_rate": 9.99896377952756e-06,
+      "loss": 0.315,
+      "step": 662
+    },
+    {
+      "epoch": 0.20881889763779526,
+      "grad_norm": 45.72781753540039,
+      "learning_rate": 9.99896220472441e-06,
+      "loss": 0.4161,
+      "step": 663
+    },
+    {
+      "epoch": 0.20913385826771655,
+      "grad_norm": 27.964317321777344,
+      "learning_rate": 9.99896062992126e-06,
+      "loss": 0.2185,
+      "step": 664
+    },
+    {
+      "epoch": 0.2094488188976378,
+      "grad_norm": 32.59132766723633,
+      "learning_rate": 9.99895905511811e-06,
+      "loss": 0.3825,
+      "step": 665
+    },
+    {
+      "epoch": 0.20976377952755906,
+      "grad_norm": 38.677207946777344,
+      "learning_rate": 9.998957480314962e-06,
+      "loss": 0.2456,
+      "step": 666
+    },
+    {
+      "epoch": 0.21007874015748032,
+      "grad_norm": 14.392455101013184,
+      "learning_rate": 9.998955905511813e-06,
+      "loss": 0.0907,
+      "step": 667
+    },
+    {
+      "epoch": 0.21039370078740158,
+      "grad_norm": 50.628910064697266,
+      "learning_rate": 9.998954330708662e-06,
+      "loss": 0.7256,
+      "step": 668
+    },
+    {
+      "epoch": 0.21070866141732283,
+      "grad_norm": 60.51618194580078,
+      "learning_rate": 9.998952755905511e-06,
+      "loss": 0.4601,
+      "step": 669
+    },
+    {
+      "epoch": 0.2110236220472441,
+      "grad_norm": 40.12845230102539,
+      "learning_rate": 9.998951181102362e-06,
+      "loss": 0.5639,
+      "step": 670
+    },
+    {
+      "epoch": 0.21133858267716535,
+      "grad_norm": 45.10561752319336,
+      "learning_rate": 9.998949606299213e-06,
+      "loss": 0.4125,
+      "step": 671
+    },
+    {
+      "epoch": 0.2116535433070866,
+      "grad_norm": 79.58858489990234,
+      "learning_rate": 9.998948031496064e-06,
+      "loss": 0.8415,
+      "step": 672
+    },
+    {
+      "epoch": 0.21196850393700786,
+      "grad_norm": 35.8792724609375,
+      "learning_rate": 9.998946456692914e-06,
+      "loss": 0.3386,
+      "step": 673
+    },
+    {
+      "epoch": 0.21228346456692915,
+      "grad_norm": 13.449446678161621,
+      "learning_rate": 9.998944881889765e-06,
+      "loss": 0.1095,
+      "step": 674
+    },
+    {
+      "epoch": 0.2125984251968504,
+      "grad_norm": 82.7651596069336,
+      "learning_rate": 9.998943307086616e-06,
+      "loss": 0.331,
+      "step": 675
+    },
+    {
+      "epoch": 0.21291338582677166,
+      "grad_norm": 80.85045623779297,
+      "learning_rate": 9.998941732283467e-06,
+      "loss": 0.9251,
+      "step": 676
+    },
+    {
+      "epoch": 0.21322834645669292,
+      "grad_norm": 34.32810974121094,
+      "learning_rate": 9.998940157480316e-06,
+      "loss": 0.2547,
+      "step": 677
+    },
+    {
+      "epoch": 0.21354330708661418,
+      "grad_norm": 39.6412239074707,
+      "learning_rate": 9.998938582677165e-06,
+      "loss": 0.3042,
+      "step": 678
+    },
+    {
+      "epoch": 0.21385826771653543,
+      "grad_norm": 29.662391662597656,
+      "learning_rate": 9.998937007874016e-06,
+      "loss": 0.353,
+      "step": 679
+    },
+    {
+      "epoch": 0.2141732283464567,
+      "grad_norm": 15.640954971313477,
+      "learning_rate": 9.998935433070867e-06,
+      "loss": 0.0869,
+      "step": 680
+    },
+    {
+      "epoch": 0.2141732283464567,
+      "eval_loss": 0.5842440128326416,
+      "eval_runtime": 303.7569,
+      "eval_samples_per_second": 0.385,
+      "eval_steps_per_second": 0.385,
+      "step": 680
+    },
+    {
+      "epoch": 0.21448818897637795,
+      "grad_norm": 47.02978515625,
+      "learning_rate": 9.998933858267718e-06,
+      "loss": 0.6137,
+      "step": 681
+    },
+    {
+      "epoch": 0.2148031496062992,
+      "grad_norm": 23.671756744384766,
+      "learning_rate": 9.998932283464568e-06,
+      "loss": 0.2595,
+      "step": 682
+    },
+    {
+      "epoch": 0.21511811023622046,
+      "grad_norm": 35.9589729309082,
+      "learning_rate": 9.998930708661418e-06,
+      "loss": 0.3242,
+      "step": 683
+    },
+    {
+      "epoch": 0.21543307086614175,
+      "grad_norm": 42.95161819458008,
+      "learning_rate": 9.998929133858268e-06,
+      "loss": 0.2721,
+      "step": 684
+    },
+    {
+      "epoch": 0.215748031496063,
+      "grad_norm": 58.640968322753906,
+      "learning_rate": 9.998927559055119e-06,
+      "loss": 0.3166,
+      "step": 685
+    },
+    {
+      "epoch": 0.21606299212598426,
+      "grad_norm": 35.683467864990234,
+      "learning_rate": 9.99892598425197e-06,
+      "loss": 0.3474,
+      "step": 686
+    },
+    {
+      "epoch": 0.21637795275590552,
+      "grad_norm": 9.069628715515137,
+      "learning_rate": 9.99892440944882e-06,
+      "loss": 0.0522,
+      "step": 687
+    },
+    {
+      "epoch": 0.21669291338582677,
+      "grad_norm": 29.538806915283203,
+      "learning_rate": 9.99892283464567e-06,
+      "loss": 0.3609,
+      "step": 688
+    },
+    {
+      "epoch": 0.21700787401574803,
+      "grad_norm": 40.99892044067383,
+      "learning_rate": 9.99892125984252e-06,
+      "loss": 0.5832,
+      "step": 689
+    },
+    {
+      "epoch": 0.2173228346456693,
+      "grad_norm": 48.680538177490234,
+      "learning_rate": 9.99891968503937e-06,
+      "loss": 0.5598,
+      "step": 690
+    },
+    {
+      "epoch": 0.21763779527559055,
+      "grad_norm": 37.50501251220703,
+      "learning_rate": 9.998918110236221e-06,
+      "loss": 0.211,
+      "step": 691
+    },
+    {
+      "epoch": 0.2179527559055118,
+      "grad_norm": 145.6537322998047,
+      "learning_rate": 9.998916535433072e-06,
+      "loss": 1.0281,
+      "step": 692
+    },
+    {
+      "epoch": 0.21826771653543306,
+      "grad_norm": 82.14835357666016,
+      "learning_rate": 9.998914960629922e-06,
+      "loss": 0.4626,
+      "step": 693
+    },
+    {
+      "epoch": 0.21858267716535434,
+      "grad_norm": 114.24347686767578,
+      "learning_rate": 9.998913385826773e-06,
+      "loss": 0.4935,
+      "step": 694
+    },
+    {
+      "epoch": 0.2188976377952756,
+      "grad_norm": 44.75710678100586,
+      "learning_rate": 9.998911811023622e-06,
+      "loss": 0.2071,
+      "step": 695
+    },
+    {
+      "epoch": 0.21921259842519686,
+      "grad_norm": 35.995880126953125,
+      "learning_rate": 9.998910236220473e-06,
+      "loss": 0.1709,
+      "step": 696
+    },
+    {
+      "epoch": 0.21952755905511812,
+      "grad_norm": 74.69808197021484,
+      "learning_rate": 9.998908661417324e-06,
+      "loss": 0.7127,
+      "step": 697
+    },
+    {
+      "epoch": 0.21984251968503937,
+      "grad_norm": 46.23347854614258,
+      "learning_rate": 9.998907086614173e-06,
+      "loss": 0.2524,
+      "step": 698
+    },
+    {
+      "epoch": 0.22015748031496063,
+      "grad_norm": 23.196392059326172,
+      "learning_rate": 9.998905511811024e-06,
+      "loss": 0.0743,
+      "step": 699
+    },
+    {
+      "epoch": 0.2204724409448819,
+      "grad_norm": 68.11265563964844,
+      "learning_rate": 9.998903937007875e-06,
+      "loss": 0.5743,
+      "step": 700
+    },
+    {
+      "epoch": 0.2204724409448819,
+      "eval_loss": 0.6227251291275024,
+      "eval_runtime": 306.9107,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 700
+    },
+    {
+      "epoch": 0.22078740157480314,
+      "grad_norm": 49.117034912109375,
+      "learning_rate": 9.998902362204726e-06,
+      "loss": 0.5395,
+      "step": 701
+    },
+    {
+      "epoch": 0.2211023622047244,
+      "grad_norm": 72.21612548828125,
+      "learning_rate": 9.998900787401575e-06,
+      "loss": 0.1053,
+      "step": 702
+    },
+    {
+      "epoch": 0.22141732283464566,
+      "grad_norm": 106.6301498413086,
+      "learning_rate": 9.998899212598426e-06,
+      "loss": 0.4932,
+      "step": 703
+    },
+    {
+      "epoch": 0.22173228346456694,
+      "grad_norm": 171.39988708496094,
+      "learning_rate": 9.998897637795276e-06,
+      "loss": 0.9959,
+      "step": 704
+    },
+    {
+      "epoch": 0.2220472440944882,
+      "grad_norm": 33.84798812866211,
+      "learning_rate": 9.998896062992127e-06,
+      "loss": 0.3472,
+      "step": 705
+    },
+    {
+      "epoch": 0.22236220472440946,
+      "grad_norm": 35.71989059448242,
+      "learning_rate": 9.998894488188978e-06,
+      "loss": 0.1522,
+      "step": 706
+    },
+    {
+      "epoch": 0.22267716535433071,
+      "grad_norm": 65.45905303955078,
+      "learning_rate": 9.998892913385829e-06,
+      "loss": 0.9923,
+      "step": 707
+    },
+    {
+      "epoch": 0.22299212598425197,
+      "grad_norm": 88.24420166015625,
+      "learning_rate": 9.998891338582678e-06,
+      "loss": 0.7105,
+      "step": 708
+    },
+    {
+      "epoch": 0.22330708661417323,
+      "grad_norm": 71.33065032958984,
+      "learning_rate": 9.998889763779527e-06,
+      "loss": 1.2078,
+      "step": 709
+    },
+    {
+      "epoch": 0.22362204724409449,
+      "grad_norm": 25.207542419433594,
+      "learning_rate": 9.998888188976378e-06,
+      "loss": 0.1815,
+      "step": 710
+    },
+    {
+      "epoch": 0.22393700787401574,
+      "grad_norm": 23.143169403076172,
+      "learning_rate": 9.99888661417323e-06,
+      "loss": 0.1984,
+      "step": 711
+    },
+    {
+      "epoch": 0.224251968503937,
+      "grad_norm": 32.797752380371094,
+      "learning_rate": 9.99888503937008e-06,
+      "loss": 0.3693,
+      "step": 712
+    },
+    {
+      "epoch": 0.22456692913385826,
+      "grad_norm": 35.14696502685547,
+      "learning_rate": 9.99888346456693e-06,
+      "loss": 0.4894,
+      "step": 713
+    },
+    {
+      "epoch": 0.22488188976377954,
+      "grad_norm": 24.498680114746094,
+      "learning_rate": 9.99888188976378e-06,
+      "loss": 0.3493,
+      "step": 714
+    },
+    {
+      "epoch": 0.2251968503937008,
+      "grad_norm": 57.850730895996094,
+      "learning_rate": 9.99888031496063e-06,
+      "loss": 0.483,
+      "step": 715
+    },
+    {
+      "epoch": 0.22551181102362206,
+      "grad_norm": 47.62955093383789,
+      "learning_rate": 9.998878740157481e-06,
+      "loss": 0.3677,
+      "step": 716
+    },
+    {
+      "epoch": 0.2258267716535433,
+      "grad_norm": 25.785797119140625,
+      "learning_rate": 9.998877165354332e-06,
+      "loss": 0.324,
+      "step": 717
+    },
+    {
+      "epoch": 0.22614173228346457,
+      "grad_norm": 44.619041442871094,
+      "learning_rate": 9.998875590551181e-06,
+      "loss": 0.3471,
+      "step": 718
+    },
+    {
+      "epoch": 0.22645669291338583,
+      "grad_norm": 10.855001449584961,
+      "learning_rate": 9.998874015748032e-06,
+      "loss": 0.205,
+      "step": 719
+    },
+    {
+      "epoch": 0.22677165354330708,
+      "grad_norm": 53.096466064453125,
+      "learning_rate": 9.998872440944883e-06,
+      "loss": 0.4501,
+      "step": 720
+    },
+    {
+      "epoch": 0.22677165354330708,
+      "eval_loss": 0.5146042704582214,
+      "eval_runtime": 316.9221,
+      "eval_samples_per_second": 0.369,
+      "eval_steps_per_second": 0.369,
+      "step": 720
+    },
+    {
+      "epoch": 0.22708661417322834,
+      "grad_norm": 44.9933967590332,
+      "learning_rate": 9.998870866141734e-06,
+      "loss": 0.4339,
+      "step": 721
+    },
+    {
+      "epoch": 0.2274015748031496,
+      "grad_norm": 46.176700592041016,
+      "learning_rate": 9.998869291338583e-06,
+      "loss": 0.4345,
+      "step": 722
+    },
+    {
+      "epoch": 0.22771653543307085,
+      "grad_norm": 21.4019832611084,
+      "learning_rate": 9.998867716535434e-06,
+      "loss": 0.5765,
+      "step": 723
+    },
+    {
+      "epoch": 0.22803149606299214,
+      "grad_norm": 17.84157943725586,
+      "learning_rate": 9.998866141732284e-06,
+      "loss": 0.1562,
+      "step": 724
+    },
+    {
+      "epoch": 0.2283464566929134,
+      "grad_norm": 18.121217727661133,
+      "learning_rate": 9.998864566929135e-06,
+      "loss": 0.1687,
+      "step": 725
+    },
+    {
+      "epoch": 0.22866141732283465,
+      "grad_norm": 32.00659942626953,
+      "learning_rate": 9.998862992125986e-06,
+      "loss": 0.1804,
+      "step": 726
+    },
+    {
+      "epoch": 0.2289763779527559,
+      "grad_norm": 32.684757232666016,
+      "learning_rate": 9.998861417322837e-06,
+      "loss": 0.2921,
+      "step": 727
+    },
+    {
+      "epoch": 0.22929133858267717,
+      "grad_norm": 65.45207214355469,
+      "learning_rate": 9.998859842519686e-06,
+      "loss": 0.237,
+      "step": 728
+    },
+    {
+      "epoch": 0.22960629921259842,
+      "grad_norm": 24.639812469482422,
+      "learning_rate": 9.998858267716535e-06,
+      "loss": 0.1602,
+      "step": 729
+    },
+    {
+      "epoch": 0.22992125984251968,
+      "grad_norm": 105.74681854248047,
+      "learning_rate": 9.998856692913386e-06,
+      "loss": 0.6624,
+      "step": 730
+    },
+    {
+      "epoch": 0.23023622047244094,
+      "grad_norm": 123.29035949707031,
+      "learning_rate": 9.998855118110237e-06,
+      "loss": 0.1592,
+      "step": 731
+    },
+    {
+      "epoch": 0.2305511811023622,
+      "grad_norm": 75.88599395751953,
+      "learning_rate": 9.998853543307088e-06,
+      "loss": 0.8749,
+      "step": 732
+    },
+    {
+      "epoch": 0.23086614173228345,
+      "grad_norm": 93.44769287109375,
+      "learning_rate": 9.998851968503938e-06,
+      "loss": 0.2961,
+      "step": 733
+    },
+    {
+      "epoch": 0.23118110236220474,
+      "grad_norm": 149.9324188232422,
+      "learning_rate": 9.998850393700789e-06,
+      "loss": 1.0564,
+      "step": 734
+    },
+    {
+      "epoch": 0.231496062992126,
+      "grad_norm": 158.12449645996094,
+      "learning_rate": 9.998848818897638e-06,
+      "loss": 1.6293,
+      "step": 735
+    },
+    {
+      "epoch": 0.23181102362204725,
+      "grad_norm": 149.76597595214844,
+      "learning_rate": 9.998847244094489e-06,
+      "loss": 1.6253,
+      "step": 736
+    },
+    {
+      "epoch": 0.2321259842519685,
+      "grad_norm": 40.33038330078125,
+      "learning_rate": 9.99884566929134e-06,
+      "loss": 0.5073,
+      "step": 737
+    },
+    {
+      "epoch": 0.23244094488188977,
+      "grad_norm": 67.39472198486328,
+      "learning_rate": 9.998844094488189e-06,
+      "loss": 0.303,
+      "step": 738
+    },
+    {
+      "epoch": 0.23275590551181102,
+      "grad_norm": 48.84297561645508,
+      "learning_rate": 9.99884251968504e-06,
+      "loss": 0.3308,
+      "step": 739
+    },
+    {
+      "epoch": 0.23307086614173228,
+      "grad_norm": 50.47072219848633,
+      "learning_rate": 9.99884094488189e-06,
+      "loss": 0.5499,
+      "step": 740
+    },
+    {
+      "epoch": 0.23307086614173228,
+      "eval_loss": 0.563422441482544,
+      "eval_runtime": 300.8217,
+      "eval_samples_per_second": 0.389,
+      "eval_steps_per_second": 0.389,
+      "step": 740
+    },
+    {
+      "epoch": 0.23338582677165354,
+      "grad_norm": 40.9046745300293,
+      "learning_rate": 9.99883937007874e-06,
+      "loss": 0.275,
+      "step": 741
+    },
+    {
+      "epoch": 0.2337007874015748,
+      "grad_norm": 68.57015991210938,
+      "learning_rate": 9.998837795275591e-06,
+      "loss": 0.5954,
+      "step": 742
+    },
+    {
+      "epoch": 0.23401574803149605,
+      "grad_norm": 32.3016357421875,
+      "learning_rate": 9.998836220472442e-06,
+      "loss": 0.1714,
+      "step": 743
+    },
+    {
+      "epoch": 0.23433070866141734,
+      "grad_norm": 35.811279296875,
+      "learning_rate": 9.998834645669292e-06,
+      "loss": 0.3879,
+      "step": 744
+    },
+    {
+      "epoch": 0.2346456692913386,
+      "grad_norm": 25.600717544555664,
+      "learning_rate": 9.998833070866143e-06,
+      "loss": 0.0832,
+      "step": 745
+    },
+    {
+      "epoch": 0.23496062992125985,
+      "grad_norm": 38.07957458496094,
+      "learning_rate": 9.998831496062994e-06,
+      "loss": 0.6504,
+      "step": 746
+    },
+    {
+      "epoch": 0.2352755905511811,
+      "grad_norm": 62.875614166259766,
+      "learning_rate": 9.998829921259845e-06,
+      "loss": 0.6823,
+      "step": 747
+    },
+    {
+      "epoch": 0.23559055118110236,
+      "grad_norm": 95.92493438720703,
+      "learning_rate": 9.998828346456694e-06,
+      "loss": 1.5914,
+      "step": 748
+    },
+    {
+      "epoch": 0.23590551181102362,
+      "grad_norm": 40.44503402709961,
+      "learning_rate": 9.998826771653543e-06,
+      "loss": 0.6454,
+      "step": 749
+    },
+    {
+      "epoch": 0.23622047244094488,
+      "grad_norm": 16.39349365234375,
+      "learning_rate": 9.998825196850394e-06,
+      "loss": 0.2515,
+      "step": 750
+    },
+    {
+      "epoch": 0.23653543307086614,
+      "grad_norm": 54.119468688964844,
+      "learning_rate": 9.998823622047245e-06,
+      "loss": 0.497,
+      "step": 751
+    },
+    {
+      "epoch": 0.2368503937007874,
+      "grad_norm": 55.42496871948242,
+      "learning_rate": 9.998822047244096e-06,
+      "loss": 0.2491,
+      "step": 752
+    },
+    {
+      "epoch": 0.23716535433070865,
+      "grad_norm": 55.21287155151367,
+      "learning_rate": 9.998820472440945e-06,
+      "loss": 0.9017,
+      "step": 753
+    },
+    {
+      "epoch": 0.23748031496062993,
+      "grad_norm": 26.37238311767578,
+      "learning_rate": 9.998818897637796e-06,
+      "loss": 0.4861,
+      "step": 754
+    },
+    {
+      "epoch": 0.2377952755905512,
+      "grad_norm": 23.099788665771484,
+      "learning_rate": 9.998817322834646e-06,
+      "loss": 0.4609,
+      "step": 755
+    },
+    {
+      "epoch": 0.23811023622047245,
+      "grad_norm": 35.12017822265625,
+      "learning_rate": 9.998815748031497e-06,
+      "loss": 0.4897,
+      "step": 756
+    },
+    {
+      "epoch": 0.2384251968503937,
+      "grad_norm": 13.813502311706543,
+      "learning_rate": 9.998814173228348e-06,
+      "loss": 0.1271,
+      "step": 757
+    },
+    {
+      "epoch": 0.23874015748031496,
+      "grad_norm": 29.46474266052246,
+      "learning_rate": 9.998812598425197e-06,
+      "loss": 0.505,
+      "step": 758
+    },
+    {
+      "epoch": 0.23905511811023622,
+      "grad_norm": 26.503684997558594,
+      "learning_rate": 9.998811023622048e-06,
+      "loss": 0.424,
+      "step": 759
+    },
+    {
+      "epoch": 0.23937007874015748,
+      "grad_norm": 36.68299102783203,
+      "learning_rate": 9.998809448818897e-06,
+      "loss": 0.5153,
+      "step": 760
+    },
+    {
+      "epoch": 0.23937007874015748,
+      "eval_loss": 0.467477947473526,
+      "eval_runtime": 299.9771,
+      "eval_samples_per_second": 0.39,
+      "eval_steps_per_second": 0.39,
+      "step": 760
+    },
+    {
+      "epoch": 0.23968503937007873,
+      "grad_norm": 27.404645919799805,
+      "learning_rate": 9.998807874015748e-06,
+      "loss": 0.4891,
+      "step": 761
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 35.5542106628418,
+      "learning_rate": 9.9988062992126e-06,
+      "loss": 0.6146,
+      "step": 762
+    },
+    {
+      "epoch": 0.24031496062992125,
+      "grad_norm": 64.38270568847656,
+      "learning_rate": 9.99880472440945e-06,
+      "loss": 0.7001,
+      "step": 763
+    },
+    {
+      "epoch": 0.24062992125984253,
+      "grad_norm": 11.979158401489258,
+      "learning_rate": 9.9988031496063e-06,
+      "loss": 0.1519,
+      "step": 764
+    },
+    {
+      "epoch": 0.2409448818897638,
+      "grad_norm": 45.081756591796875,
+      "learning_rate": 9.99880157480315e-06,
+      "loss": 0.5738,
+      "step": 765
+    },
+    {
+      "epoch": 0.24125984251968505,
+      "grad_norm": 16.82332992553711,
+      "learning_rate": 9.998800000000002e-06,
+      "loss": 0.2367,
+      "step": 766
+    },
+    {
+      "epoch": 0.2415748031496063,
+      "grad_norm": 58.26005554199219,
+      "learning_rate": 9.998798425196853e-06,
+      "loss": 0.6255,
+      "step": 767
+    },
+    {
+      "epoch": 0.24188976377952756,
+      "grad_norm": 33.454200744628906,
+      "learning_rate": 9.998796850393702e-06,
+      "loss": 0.282,
+      "step": 768
+    },
+    {
+      "epoch": 0.24220472440944882,
+      "grad_norm": 26.192066192626953,
+      "learning_rate": 9.998795275590551e-06,
+      "loss": 0.2512,
+      "step": 769
+    },
+    {
+      "epoch": 0.24251968503937008,
+      "grad_norm": 24.451866149902344,
+      "learning_rate": 9.998793700787402e-06,
+      "loss": 0.1288,
+      "step": 770
+    },
+    {
+      "epoch": 0.24283464566929133,
+      "grad_norm": 17.908388137817383,
+      "learning_rate": 9.998792125984253e-06,
+      "loss": 0.1228,
+      "step": 771
+    },
+    {
+      "epoch": 0.2431496062992126,
+      "grad_norm": 30.55709457397461,
+      "learning_rate": 9.998790551181104e-06,
+      "loss": 0.4937,
+      "step": 772
+    },
+    {
+      "epoch": 0.24346456692913385,
+      "grad_norm": 47.04617691040039,
+      "learning_rate": 9.998788976377953e-06,
+      "loss": 0.7825,
+      "step": 773
+    },
+    {
+      "epoch": 0.24377952755905513,
+      "grad_norm": 66.30603790283203,
+      "learning_rate": 9.998787401574804e-06,
+      "loss": 0.3853,
+      "step": 774
+    },
+    {
+      "epoch": 0.2440944881889764,
+      "grad_norm": 28.97539520263672,
+      "learning_rate": 9.998785826771654e-06,
+      "loss": 0.2909,
+      "step": 775
+    },
+    {
+      "epoch": 0.24440944881889765,
+      "grad_norm": 71.36101531982422,
+      "learning_rate": 9.998784251968505e-06,
+      "loss": 0.6988,
+      "step": 776
+    },
+    {
+      "epoch": 0.2447244094488189,
+      "grad_norm": 35.699249267578125,
+      "learning_rate": 9.998782677165356e-06,
+      "loss": 0.4579,
+      "step": 777
+    },
+    {
+      "epoch": 0.24503937007874016,
+      "grad_norm": 4.760364532470703,
+      "learning_rate": 9.998781102362205e-06,
+      "loss": 0.0287,
+      "step": 778
+    },
+    {
+      "epoch": 0.24535433070866142,
+      "grad_norm": 30.33928108215332,
+      "learning_rate": 9.998779527559056e-06,
+      "loss": 0.5643,
+      "step": 779
+    },
+    {
+      "epoch": 0.24566929133858267,
+      "grad_norm": 43.770694732666016,
+      "learning_rate": 9.998777952755905e-06,
+      "loss": 0.4347,
+      "step": 780
+    },
+    {
+      "epoch": 0.24566929133858267,
+      "eval_loss": 0.5863191485404968,
+      "eval_runtime": 301.5247,
+      "eval_samples_per_second": 0.388,
+      "eval_steps_per_second": 0.388,
+      "step": 780
+    },
+    {
+      "epoch": 0.24598425196850393,
+      "grad_norm": 38.59556198120117,
+      "learning_rate": 9.998776377952756e-06,
+      "loss": 0.2601,
+      "step": 781
+    },
+    {
+      "epoch": 0.2462992125984252,
+      "grad_norm": 42.7963981628418,
+      "learning_rate": 9.998774803149607e-06,
+      "loss": 0.329,
+      "step": 782
+    },
+    {
+      "epoch": 0.24661417322834644,
+      "grad_norm": 48.59577941894531,
+      "learning_rate": 9.998773228346458e-06,
+      "loss": 0.5232,
+      "step": 783
+    },
+    {
+      "epoch": 0.24692913385826773,
+      "grad_norm": 51.12166976928711,
+      "learning_rate": 9.998771653543308e-06,
+      "loss": 0.955,
+      "step": 784
+    },
+    {
+      "epoch": 0.247244094488189,
+      "grad_norm": 29.71060562133789,
+      "learning_rate": 9.998770078740159e-06,
+      "loss": 0.1399,
+      "step": 785
+    },
+    {
+      "epoch": 0.24755905511811024,
+      "grad_norm": 73.006103515625,
+      "learning_rate": 9.998768503937008e-06,
+      "loss": 0.8551,
+      "step": 786
+    },
+    {
+      "epoch": 0.2478740157480315,
+      "grad_norm": 24.96092414855957,
+      "learning_rate": 9.998766929133859e-06,
+      "loss": 0.379,
+      "step": 787
+    },
+    {
+      "epoch": 0.24818897637795276,
+      "grad_norm": 23.48893165588379,
+      "learning_rate": 9.99876535433071e-06,
+      "loss": 0.1811,
+      "step": 788
+    },
+    {
+      "epoch": 0.24850393700787402,
+      "grad_norm": 38.28635025024414,
+      "learning_rate": 9.998763779527559e-06,
+      "loss": 0.4841,
+      "step": 789
+    },
+    {
+      "epoch": 0.24881889763779527,
+      "grad_norm": 53.374549865722656,
+      "learning_rate": 9.99876220472441e-06,
+      "loss": 0.5892,
+      "step": 790
+    },
+    {
+      "epoch": 0.24913385826771653,
+      "grad_norm": 32.456485748291016,
+      "learning_rate": 9.998760629921261e-06,
+      "loss": 0.4001,
+      "step": 791
+    },
+    {
+      "epoch": 0.2494488188976378,
+      "grad_norm": 75.20841217041016,
+      "learning_rate": 9.998759055118112e-06,
+      "loss": 0.5947,
+      "step": 792
+    },
+    {
+      "epoch": 0.24976377952755904,
+      "grad_norm": 43.24211502075195,
+      "learning_rate": 9.998757480314961e-06,
+      "loss": 0.4858,
+      "step": 793
+    },
+    {
+      "epoch": 0.25007874015748033,
+      "grad_norm": 50.684852600097656,
+      "learning_rate": 9.998755905511812e-06,
+      "loss": 0.3602,
+      "step": 794
+    },
+    {
+      "epoch": 0.2503937007874016,
+      "grad_norm": 24.13330078125,
+      "learning_rate": 9.998754330708662e-06,
+      "loss": 0.3898,
+      "step": 795
+    },
+    {
+      "epoch": 0.25070866141732284,
+      "grad_norm": 32.628719329833984,
+      "learning_rate": 9.998752755905513e-06,
+      "loss": 0.547,
+      "step": 796
+    },
+    {
+      "epoch": 0.2510236220472441,
+      "grad_norm": 24.97728157043457,
+      "learning_rate": 9.998751181102364e-06,
+      "loss": 0.536,
+      "step": 797
+    },
+    {
+      "epoch": 0.25133858267716536,
+      "grad_norm": 21.705215454101562,
+      "learning_rate": 9.998749606299213e-06,
+      "loss": 0.315,
+      "step": 798
+    },
+    {
+      "epoch": 0.2516535433070866,
+      "grad_norm": 27.06643295288086,
+      "learning_rate": 9.998748031496064e-06,
+      "loss": 0.4722,
+      "step": 799
+    },
+    {
+      "epoch": 0.25196850393700787,
+      "grad_norm": 23.71688461303711,
+      "learning_rate": 9.998746456692913e-06,
+      "loss": 0.5445,
+      "step": 800
+    },
+    {
+      "epoch": 0.25196850393700787,
+      "eval_loss": 0.51589435338974,
+      "eval_runtime": 316.4111,
+      "eval_samples_per_second": 0.37,
+      "eval_steps_per_second": 0.37,
+      "step": 800
+    },
+    {
+      "epoch": 0.2522834645669291,
+      "grad_norm": 35.45967483520508,
+      "learning_rate": 9.998744881889764e-06,
+      "loss": 0.5146,
+      "step": 801
+    },
+    {
+      "epoch": 0.2525984251968504,
+      "grad_norm": 24.311609268188477,
+      "learning_rate": 9.998743307086615e-06,
+      "loss": 0.596,
+      "step": 802
+    },
+    {
+      "epoch": 0.25291338582677164,
+      "grad_norm": 37.100257873535156,
+      "learning_rate": 9.998741732283466e-06,
+      "loss": 0.4307,
+      "step": 803
+    },
+    {
+      "epoch": 0.2532283464566929,
+      "grad_norm": 21.675411224365234,
+      "learning_rate": 9.998740157480315e-06,
+      "loss": 0.3512,
+      "step": 804
+    },
+    {
+      "epoch": 0.25354330708661416,
+      "grad_norm": 21.883447647094727,
+      "learning_rate": 9.998738582677166e-06,
+      "loss": 0.4789,
+      "step": 805
+    },
+    {
+      "epoch": 0.2538582677165354,
+      "grad_norm": 26.242074966430664,
+      "learning_rate": 9.998737007874016e-06,
+      "loss": 0.6633,
+      "step": 806
+    },
+    {
+      "epoch": 0.25417322834645667,
+      "grad_norm": 36.36134338378906,
+      "learning_rate": 9.998735433070867e-06,
+      "loss": 0.2947,
+      "step": 807
+    },
+    {
+      "epoch": 0.2544881889763779,
+      "grad_norm": 58.036354064941406,
+      "learning_rate": 9.998733858267718e-06,
+      "loss": 0.5375,
+      "step": 808
+    },
+    {
+      "epoch": 0.25480314960629924,
+      "grad_norm": 77.08882141113281,
+      "learning_rate": 9.998732283464567e-06,
+      "loss": 0.2146,
+      "step": 809
+    },
+    {
+      "epoch": 0.2551181102362205,
+      "grad_norm": 25.003931045532227,
+      "learning_rate": 9.998730708661418e-06,
+      "loss": 0.3593,
+      "step": 810
+    },
+    {
+      "epoch": 0.25543307086614175,
+      "grad_norm": 35.63140869140625,
+      "learning_rate": 9.998729133858267e-06,
+      "loss": 0.3855,
+      "step": 811
+    },
+    {
+      "epoch": 0.255748031496063,
+      "grad_norm": 13.581232070922852,
+      "learning_rate": 9.998727559055118e-06,
+      "loss": 0.1482,
+      "step": 812
+    },
+    {
+      "epoch": 0.25606299212598427,
+      "grad_norm": 46.440670013427734,
+      "learning_rate": 9.99872598425197e-06,
+      "loss": 0.5954,
+      "step": 813
+    },
+    {
+      "epoch": 0.2563779527559055,
+      "grad_norm": 24.534271240234375,
+      "learning_rate": 9.99872440944882e-06,
+      "loss": 0.2221,
+      "step": 814
+    },
+    {
+      "epoch": 0.2566929133858268,
+      "grad_norm": 28.46855926513672,
+      "learning_rate": 9.99872283464567e-06,
+      "loss": 0.1972,
+      "step": 815
+    },
+    {
+      "epoch": 0.25700787401574804,
+      "grad_norm": 27.41106605529785,
+      "learning_rate": 9.99872125984252e-06,
+      "loss": 0.2933,
+      "step": 816
+    },
+    {
+      "epoch": 0.2573228346456693,
+      "grad_norm": 77.73954772949219,
+      "learning_rate": 9.998719685039372e-06,
+      "loss": 0.5044,
+      "step": 817
+    },
+    {
+      "epoch": 0.25763779527559055,
+      "grad_norm": 48.21875,
+      "learning_rate": 9.998718110236221e-06,
+      "loss": 0.5613,
+      "step": 818
+    },
+    {
+      "epoch": 0.2579527559055118,
+      "grad_norm": 33.06459045410156,
+      "learning_rate": 9.998716535433072e-06,
+      "loss": 0.4511,
+      "step": 819
+    },
+    {
+      "epoch": 0.25826771653543307,
+      "grad_norm": 91.82710266113281,
+      "learning_rate": 9.998714960629921e-06,
+      "loss": 1.3236,
+      "step": 820
+    },
+    {
+      "epoch": 0.25826771653543307,
+      "eval_loss": 0.4647013545036316,
+      "eval_runtime": 297.9126,
+      "eval_samples_per_second": 0.393,
+      "eval_steps_per_second": 0.393,
+      "step": 820
+    },
+    {
+      "epoch": 0.2585826771653543,
+      "grad_norm": 80.5277328491211,
+      "learning_rate": 9.998713385826772e-06,
+      "loss": 0.7502,
+      "step": 821
+    },
+    {
+      "epoch": 0.2588976377952756,
+      "grad_norm": 200.29324340820312,
+      "learning_rate": 9.998711811023623e-06,
+      "loss": 1.2537,
+      "step": 822
+    },
+    {
+      "epoch": 0.25921259842519684,
+      "grad_norm": 22.48586654663086,
+      "learning_rate": 9.998710236220474e-06,
+      "loss": 0.1774,
+      "step": 823
+    },
+    {
+      "epoch": 0.2595275590551181,
+      "grad_norm": 43.362831115722656,
+      "learning_rate": 9.998708661417323e-06,
+      "loss": 0.336,
+      "step": 824
+    },
+    {
+      "epoch": 0.25984251968503935,
+      "grad_norm": 27.631332397460938,
+      "learning_rate": 9.998707086614174e-06,
+      "loss": 0.2185,
+      "step": 825
+    },
+    {
+      "epoch": 0.2601574803149606,
+      "grad_norm": 44.31364440917969,
+      "learning_rate": 9.998705511811024e-06,
+      "loss": 0.3823,
+      "step": 826
+    },
+    {
+      "epoch": 0.26047244094488187,
+      "grad_norm": 78.68717193603516,
+      "learning_rate": 9.998703937007875e-06,
+      "loss": 0.5907,
+      "step": 827
+    },
+    {
+      "epoch": 0.2607874015748031,
+      "grad_norm": 86.36324310302734,
+      "learning_rate": 9.998702362204726e-06,
+      "loss": 0.7028,
+      "step": 828
+    },
+    {
+      "epoch": 0.26110236220472444,
+      "grad_norm": 108.75439453125,
+      "learning_rate": 9.998700787401575e-06,
+      "loss": 0.9105,
+      "step": 829
+    },
+    {
+      "epoch": 0.2614173228346457,
+      "grad_norm": 33.248592376708984,
+      "learning_rate": 9.998699212598426e-06,
+      "loss": 0.2809,
+      "step": 830
+    },
+    {
+      "epoch": 0.26173228346456695,
+      "grad_norm": 61.382259368896484,
+      "learning_rate": 9.998697637795275e-06,
+      "loss": 0.5749,
+      "step": 831
+    },
+    {
+      "epoch": 0.2620472440944882,
+      "grad_norm": 34.68425750732422,
+      "learning_rate": 9.998696062992126e-06,
+      "loss": 0.5513,
+      "step": 832
+    },
+    {
+      "epoch": 0.26236220472440946,
+      "grad_norm": 25.614322662353516,
+      "learning_rate": 9.998694488188977e-06,
+      "loss": 0.0668,
+      "step": 833
+    },
+    {
+      "epoch": 0.2626771653543307,
+      "grad_norm": 42.22956848144531,
+      "learning_rate": 9.998692913385828e-06,
+      "loss": 0.3998,
+      "step": 834
+    },
+    {
+      "epoch": 0.262992125984252,
+      "grad_norm": 34.24924087524414,
+      "learning_rate": 9.998691338582678e-06,
+      "loss": 0.2085,
+      "step": 835
+    },
+    {
+      "epoch": 0.26330708661417324,
+      "grad_norm": 72.84844970703125,
+      "learning_rate": 9.998689763779529e-06,
+      "loss": 0.7099,
+      "step": 836
+    },
+    {
+      "epoch": 0.2636220472440945,
+      "grad_norm": 16.073625564575195,
+      "learning_rate": 9.99868818897638e-06,
+      "loss": 0.1643,
+      "step": 837
+    },
+    {
+      "epoch": 0.26393700787401575,
+      "grad_norm": 12.518115997314453,
+      "learning_rate": 9.998686614173229e-06,
+      "loss": 0.1489,
+      "step": 838
+    },
+    {
+      "epoch": 0.264251968503937,
+      "grad_norm": 19.30543327331543,
+      "learning_rate": 9.99868503937008e-06,
+      "loss": 0.2271,
+      "step": 839
+    },
+    {
+      "epoch": 0.26456692913385826,
+      "grad_norm": 80.19608306884766,
+      "learning_rate": 9.998683464566929e-06,
+      "loss": 0.3996,
+      "step": 840
+    },
+    {
+      "epoch": 0.26456692913385826,
+      "eval_loss": 0.46735909581184387,
+      "eval_runtime": 308.0881,
+      "eval_samples_per_second": 0.38,
+      "eval_steps_per_second": 0.38,
+      "step": 840
+    },
+    {
+      "epoch": 0.2648818897637795,
+      "grad_norm": 47.69651412963867,
+      "learning_rate": 9.99868188976378e-06,
+      "loss": 0.4229,
+      "step": 841
+    },
+    {
+      "epoch": 0.2651968503937008,
+      "grad_norm": 40.88669204711914,
+      "learning_rate": 9.998680314960631e-06,
+      "loss": 0.594,
+      "step": 842
+    },
+    {
+      "epoch": 0.26551181102362204,
+      "grad_norm": 44.67588806152344,
+      "learning_rate": 9.998678740157482e-06,
+      "loss": 0.1496,
+      "step": 843
+    },
+    {
+      "epoch": 0.2658267716535433,
+      "grad_norm": 24.644241333007812,
+      "learning_rate": 9.998677165354331e-06,
+      "loss": 0.3401,
+      "step": 844
+    },
+    {
+      "epoch": 0.26614173228346455,
+      "grad_norm": 18.624984741210938,
+      "learning_rate": 9.998675590551182e-06,
+      "loss": 0.2221,
+      "step": 845
+    },
+    {
+      "epoch": 0.2664566929133858,
+      "grad_norm": 17.709203720092773,
+      "learning_rate": 9.998674015748032e-06,
+      "loss": 0.1178,
+      "step": 846
+    },
+    {
+      "epoch": 0.26677165354330706,
+      "grad_norm": 24.67478370666504,
+      "learning_rate": 9.998672440944883e-06,
+      "loss": 0.1389,
+      "step": 847
+    },
+    {
+      "epoch": 0.2670866141732283,
+      "grad_norm": 31.281604766845703,
+      "learning_rate": 9.998670866141734e-06,
+      "loss": 0.2201,
+      "step": 848
+    },
+    {
+      "epoch": 0.26740157480314963,
+      "grad_norm": 37.749542236328125,
+      "learning_rate": 9.998669291338583e-06,
+      "loss": 0.2498,
+      "step": 849
+    },
+    {
+      "epoch": 0.2677165354330709,
+      "grad_norm": 77.09476470947266,
+      "learning_rate": 9.998667716535434e-06,
+      "loss": 1.0283,
+      "step": 850
+    },
+    {
+      "epoch": 0.26803149606299215,
+      "grad_norm": 63.68134307861328,
+      "learning_rate": 9.998666141732283e-06,
+      "loss": 0.4823,
+      "step": 851
+    },
+    {
+      "epoch": 0.2683464566929134,
+      "grad_norm": 78.47075653076172,
+      "learning_rate": 9.998664566929134e-06,
+      "loss": 0.42,
+      "step": 852
+    },
+    {
+      "epoch": 0.26866141732283466,
+      "grad_norm": 116.93570709228516,
+      "learning_rate": 9.998662992125985e-06,
+      "loss": 0.2312,
+      "step": 853
+    },
+    {
+      "epoch": 0.2689763779527559,
+      "grad_norm": 20.844566345214844,
+      "learning_rate": 9.998661417322836e-06,
+      "loss": 0.1281,
+      "step": 854
+    },
+    {
+      "epoch": 0.2692913385826772,
+      "grad_norm": 26.171772003173828,
+      "learning_rate": 9.998659842519686e-06,
+      "loss": 0.2235,
+      "step": 855
+    },
+    {
+      "epoch": 0.26960629921259843,
+      "grad_norm": 63.36984634399414,
+      "learning_rate": 9.998658267716537e-06,
+      "loss": 0.6844,
+      "step": 856
+    },
+    {
+      "epoch": 0.2699212598425197,
+      "grad_norm": 76.0230941772461,
+      "learning_rate": 9.998656692913386e-06,
+      "loss": 0.5884,
+      "step": 857
+    },
+    {
+      "epoch": 0.27023622047244095,
+      "grad_norm": 57.21022033691406,
+      "learning_rate": 9.998655118110237e-06,
+      "loss": 0.6529,
+      "step": 858
+    },
+    {
+      "epoch": 0.2705511811023622,
+      "grad_norm": 17.419769287109375,
+      "learning_rate": 9.998653543307088e-06,
+      "loss": 0.0747,
+      "step": 859
+    },
+    {
+      "epoch": 0.27086614173228346,
+      "grad_norm": 87.31539154052734,
+      "learning_rate": 9.998651968503937e-06,
+      "loss": 0.6404,
+      "step": 860
+    },
+    {
+      "epoch": 0.27086614173228346,
+      "eval_loss": 0.5078207850456238,
+      "eval_runtime": 304.5599,
+      "eval_samples_per_second": 0.384,
+      "eval_steps_per_second": 0.384,
+      "step": 860
+    },
+    {
+      "epoch": 0.2711811023622047,
+      "grad_norm": 39.05491638183594,
+      "learning_rate": 9.998650393700788e-06,
+      "loss": 0.3182,
+      "step": 861
+    },
+    {
+      "epoch": 0.271496062992126,
+      "grad_norm": 43.85835647583008,
+      "learning_rate": 9.998648818897639e-06,
+      "loss": 0.2367,
+      "step": 862
+    },
+    {
+      "epoch": 0.27181102362204723,
+      "grad_norm": 61.60994338989258,
+      "learning_rate": 9.99864724409449e-06,
+      "loss": 0.7357,
+      "step": 863
+    },
+    {
+      "epoch": 0.2721259842519685,
+      "grad_norm": 61.263484954833984,
+      "learning_rate": 9.99864566929134e-06,
+      "loss": 0.1954,
+      "step": 864
+    },
+    {
+      "epoch": 0.27244094488188975,
+      "grad_norm": 41.59515380859375,
+      "learning_rate": 9.99864409448819e-06,
+      "loss": 0.4512,
+      "step": 865
+    },
+    {
+      "epoch": 0.272755905511811,
+      "grad_norm": 24.524341583251953,
+      "learning_rate": 9.99864251968504e-06,
+      "loss": 0.1307,
+      "step": 866
+    },
+    {
+      "epoch": 0.27307086614173226,
+      "grad_norm": 58.72618865966797,
+      "learning_rate": 9.99864094488189e-06,
+      "loss": 0.707,
+      "step": 867
+    },
+    {
+      "epoch": 0.2733858267716535,
+      "grad_norm": 46.32933044433594,
+      "learning_rate": 9.998639370078742e-06,
+      "loss": 0.218,
+      "step": 868
+    },
+    {
+      "epoch": 0.27370078740157483,
+      "grad_norm": 36.803565979003906,
+      "learning_rate": 9.998637795275591e-06,
+      "loss": 0.1957,
+      "step": 869
+    },
+    {
+      "epoch": 0.2740157480314961,
+      "grad_norm": 29.207927703857422,
+      "learning_rate": 9.998636220472442e-06,
+      "loss": 0.3488,
+      "step": 870
+    },
+    {
+      "epoch": 0.27433070866141734,
+      "grad_norm": 26.461669921875,
+      "learning_rate": 9.998634645669291e-06,
+      "loss": 0.1342,
+      "step": 871
+    },
+    {
+      "epoch": 0.2746456692913386,
+      "grad_norm": 98.34436798095703,
+      "learning_rate": 9.998633070866142e-06,
+      "loss": 0.2521,
+      "step": 872
+    },
+    {
+      "epoch": 0.27496062992125986,
+      "grad_norm": 40.949153900146484,
+      "learning_rate": 9.998631496062993e-06,
+      "loss": 0.3699,
+      "step": 873
+    },
+    {
+      "epoch": 0.2752755905511811,
+      "grad_norm": 26.012012481689453,
+      "learning_rate": 9.998629921259844e-06,
+      "loss": 0.2719,
+      "step": 874
+    },
+    {
+      "epoch": 0.2755905511811024,
+      "grad_norm": 28.45779800415039,
+      "learning_rate": 9.998628346456693e-06,
+      "loss": 0.1321,
+      "step": 875
+    },
+    {
+      "epoch": 0.27590551181102363,
+      "grad_norm": 63.54460144042969,
+      "learning_rate": 9.998626771653544e-06,
+      "loss": 0.5929,
+      "step": 876
+    },
+    {
+      "epoch": 0.2762204724409449,
+      "grad_norm": 101.21806335449219,
+      "learning_rate": 9.998625196850394e-06,
+      "loss": 0.6633,
+      "step": 877
+    },
+    {
+      "epoch": 0.27653543307086614,
+      "grad_norm": 8.460577964782715,
+      "learning_rate": 9.998623622047245e-06,
+      "loss": 0.0187,
+      "step": 878
+    },
+    {
+      "epoch": 0.2768503937007874,
+      "grad_norm": 101.49215698242188,
+      "learning_rate": 9.998622047244096e-06,
+      "loss": 0.7255,
+      "step": 879
+    },
+    {
+      "epoch": 0.27716535433070866,
+      "grad_norm": 110.3086929321289,
+      "learning_rate": 9.998620472440945e-06,
+      "loss": 0.9578,
+      "step": 880
+    },
+    {
+      "epoch": 0.27716535433070866,
+      "eval_loss": 0.49943187832832336,
+      "eval_runtime": 306.9283,
+      "eval_samples_per_second": 0.381,
+      "eval_steps_per_second": 0.381,
+      "step": 880
+    },
+    {
+      "epoch": 0.2774803149606299,
+      "grad_norm": 6.745794773101807,
+      "learning_rate": 9.998618897637796e-06,
+      "loss": 0.0379,
+      "step": 881
+    },
+    {
+      "epoch": 0.27779527559055117,
+      "grad_norm": 51.96881866455078,
+      "learning_rate": 9.998617322834645e-06,
+      "loss": 0.7563,
+      "step": 882
+    },
+    {
+      "epoch": 0.27811023622047243,
+      "grad_norm": 52.106834411621094,
+      "learning_rate": 9.998615748031496e-06,
+      "loss": 0.4385,
+      "step": 883
+    },
+    {
+      "epoch": 0.2784251968503937,
+      "grad_norm": 69.72996520996094,
+      "learning_rate": 9.998614173228347e-06,
+      "loss": 0.8885,
+      "step": 884
+    },
+    {
+      "epoch": 0.27874015748031494,
+      "grad_norm": 67.1977310180664,
+      "learning_rate": 9.998612598425198e-06,
+      "loss": 0.814,
+      "step": 885
+    },
+    {
+      "epoch": 0.2790551181102362,
+      "grad_norm": 111.88276672363281,
+      "learning_rate": 9.998611023622048e-06,
+      "loss": 0.6336,
+      "step": 886
+    },
+    {
+      "epoch": 0.27937007874015746,
+      "grad_norm": 40.74708557128906,
+      "learning_rate": 9.998609448818899e-06,
+      "loss": 0.2043,
+      "step": 887
+    },
+    {
+      "epoch": 0.2796850393700787,
+      "grad_norm": 42.77908706665039,
+      "learning_rate": 9.99860787401575e-06,
+      "loss": 0.2083,
+      "step": 888
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 57.296024322509766,
+      "learning_rate": 9.998606299212599e-06,
+      "loss": 0.1104,
+      "step": 889
+    },
+    {
+      "epoch": 0.2803149606299213,
+      "grad_norm": 32.91524124145508,
+      "learning_rate": 9.99860472440945e-06,
+      "loss": 0.498,
+      "step": 890
+    },
+    {
+      "epoch": 0.28062992125984254,
+      "grad_norm": 71.90412139892578,
+      "learning_rate": 9.9986031496063e-06,
+      "loss": 0.8651,
+      "step": 891
+    },
+    {
+      "epoch": 0.2809448818897638,
+      "grad_norm": 11.306217193603516,
+      "learning_rate": 9.99860157480315e-06,
+      "loss": 0.0642,
+      "step": 892
+    },
+    {
+      "epoch": 0.28125984251968505,
+      "grad_norm": 15.339298248291016,
+      "learning_rate": 9.998600000000001e-06,
+      "loss": 0.0829,
+      "step": 893
+    },
+    {
+      "epoch": 0.2815748031496063,
+      "grad_norm": 59.414466857910156,
+      "learning_rate": 9.998598425196852e-06,
+      "loss": 0.5543,
+      "step": 894
+    },
+    {
+      "epoch": 0.28188976377952757,
+      "grad_norm": 66.70774841308594,
+      "learning_rate": 9.998596850393701e-06,
+      "loss": 0.3054,
+      "step": 895
+    },
+    {
+      "epoch": 0.2822047244094488,
+      "grad_norm": 20.609098434448242,
+      "learning_rate": 9.998595275590552e-06,
+      "loss": 0.1729,
+      "step": 896
+    },
+    {
+      "epoch": 0.2825196850393701,
+      "grad_norm": 7.707085609436035,
+      "learning_rate": 9.998593700787402e-06,
+      "loss": 0.0473,
+      "step": 897
+    },
+    {
+      "epoch": 0.28283464566929134,
+      "grad_norm": 41.48631286621094,
+      "learning_rate": 9.998592125984253e-06,
+      "loss": 0.4578,
+      "step": 898
+    },
+    {
+      "epoch": 0.2831496062992126,
+      "grad_norm": 23.614355087280273,
+      "learning_rate": 9.998590551181104e-06,
+      "loss": 0.193,
+      "step": 899
+    },
+    {
+      "epoch": 0.28346456692913385,
+      "grad_norm": 15.626468658447266,
+      "learning_rate": 9.998588976377953e-06,
+      "loss": 0.0845,
+      "step": 900
+    },
+    {
+      "epoch": 0.28346456692913385,
+      "eval_loss": 0.5890966653823853,
+      "eval_runtime": 302.1316,
+      "eval_samples_per_second": 0.387,
+      "eval_steps_per_second": 0.387,
+      "step": 900
+    },
+    {
+      "epoch": 0.2837795275590551,
+      "grad_norm": 60.57646942138672,
+      "learning_rate": 9.998587401574804e-06,
+      "loss": 0.3963,
+      "step": 901
+    },
+    {
+      "epoch": 0.28409448818897637,
+      "grad_norm": 47.83597946166992,
+      "learning_rate": 9.998585826771653e-06,
+      "loss": 0.6909,
+      "step": 902
+    },
+    {
+      "epoch": 0.2844094488188976,
+      "grad_norm": 26.267818450927734,
+      "learning_rate": 9.998584251968504e-06,
+      "loss": 0.0956,
+      "step": 903
+    },
+    {
+      "epoch": 0.2847244094488189,
+      "grad_norm": 63.959110260009766,
+      "learning_rate": 9.998582677165355e-06,
+      "loss": 0.4272,
+      "step": 904
+    },
+    {
+      "epoch": 0.28503937007874014,
+      "grad_norm": 59.13768005371094,
+      "learning_rate": 9.998581102362206e-06,
+      "loss": 0.3178,
+      "step": 905
+    },
+    {
+      "epoch": 0.2853543307086614,
+      "grad_norm": 39.657814025878906,
+      "learning_rate": 9.998579527559056e-06,
+      "loss": 0.6433,
+      "step": 906
+    },
+    {
+      "epoch": 0.28566929133858265,
+      "grad_norm": 90.1864013671875,
+      "learning_rate": 9.998577952755907e-06,
+      "loss": 0.6963,
+      "step": 907
+    },
+    {
+      "epoch": 0.2859842519685039,
+      "grad_norm": 121.48163604736328,
+      "learning_rate": 9.998576377952758e-06,
+      "loss": 1.8743,
+      "step": 908
+    },
+    {
+      "epoch": 0.2862992125984252,
+      "grad_norm": 37.84361267089844,
+      "learning_rate": 9.998574803149607e-06,
+      "loss": 0.3433,
+      "step": 909
+    },
+    {
+      "epoch": 0.2866141732283465,
+      "grad_norm": 42.82717514038086,
+      "learning_rate": 9.998573228346458e-06,
+      "loss": 0.3181,
+      "step": 910
+    },
+    {
+      "epoch": 0.28692913385826774,
+      "grad_norm": 55.58892822265625,
+      "learning_rate": 9.998571653543307e-06,
+      "loss": 1.2413,
+      "step": 911
+    },
+    {
+      "epoch": 0.287244094488189,
+      "grad_norm": 56.79817581176758,
+      "learning_rate": 9.998570078740158e-06,
+      "loss": 0.3707,
+      "step": 912
+    },
+    {
+      "epoch": 0.28755905511811025,
+      "grad_norm": 54.66569900512695,
+      "learning_rate": 9.998568503937009e-06,
+      "loss": 0.7417,
+      "step": 913
+    },
+    {
+      "epoch": 0.2878740157480315,
+      "grad_norm": 22.454544067382812,
+      "learning_rate": 9.99856692913386e-06,
+      "loss": 0.1303,
+      "step": 914
+    },
+    {
+      "epoch": 0.28818897637795277,
+      "grad_norm": 72.16681671142578,
+      "learning_rate": 9.99856535433071e-06,
+      "loss": 0.4007,
+      "step": 915
+    },
+    {
+      "epoch": 0.288503937007874,
+      "grad_norm": 52.15703201293945,
+      "learning_rate": 9.99856377952756e-06,
+      "loss": 0.7637,
+      "step": 916
+    },
+    {
+      "epoch": 0.2888188976377953,
+      "grad_norm": 8.107488632202148,
+      "learning_rate": 9.99856220472441e-06,
+      "loss": 0.0643,
+      "step": 917
+    },
+    {
+      "epoch": 0.28913385826771654,
+      "grad_norm": 47.568267822265625,
+      "learning_rate": 9.99856062992126e-06,
+      "loss": 0.4449,
+      "step": 918
+    },
+    {
+      "epoch": 0.2894488188976378,
+      "grad_norm": 21.596525192260742,
+      "learning_rate": 9.998559055118112e-06,
+      "loss": 0.1228,
+      "step": 919
+    },
+    {
+      "epoch": 0.28976377952755905,
+      "grad_norm": 40.52389144897461,
+      "learning_rate": 9.998557480314961e-06,
+      "loss": 0.446,
+      "step": 920
+    },
+    {
+      "epoch": 0.28976377952755905,
+      "eval_loss": 0.5098508596420288,
+      "eval_runtime": 307.5399,
+      "eval_samples_per_second": 0.38,
+      "eval_steps_per_second": 0.38,
+      "step": 920
+    },
+    {
+      "epoch": 0.2900787401574803,
+      "grad_norm": 43.565303802490234,
+      "learning_rate": 9.998555905511812e-06,
+      "loss": 0.5184,
+      "step": 921
+    },
+    {
+      "epoch": 0.29039370078740157,
+      "grad_norm": 54.92490768432617,
+      "learning_rate": 9.998554330708661e-06,
+      "loss": 0.7516,
+      "step": 922
+    },
+    {
+      "epoch": 0.2907086614173228,
+      "grad_norm": 47.38011169433594,
+      "learning_rate": 9.998552755905512e-06,
+      "loss": 0.4989,
+      "step": 923
+    },
+    {
+      "epoch": 0.2910236220472441,
+      "grad_norm": 63.52509689331055,
+      "learning_rate": 9.998551181102363e-06,
+      "loss": 0.4689,
+      "step": 924
+    },
+    {
+      "epoch": 0.29133858267716534,
+      "grad_norm": 38.14700698852539,
+      "learning_rate": 9.998549606299214e-06,
+      "loss": 0.5137,
+      "step": 925
+    },
+    {
+      "epoch": 0.2916535433070866,
+      "grad_norm": 63.95713806152344,
+      "learning_rate": 9.998548031496063e-06,
+      "loss": 0.3511,
+      "step": 926
+    },
+    {
+      "epoch": 0.29196850393700785,
+      "grad_norm": 38.79820251464844,
+      "learning_rate": 9.998546456692914e-06,
+      "loss": 0.3497,
+      "step": 927
+    },
+    {
+      "epoch": 0.2922834645669291,
+      "grad_norm": 76.02424621582031,
+      "learning_rate": 9.998544881889764e-06,
+      "loss": 0.416,
+      "step": 928
+    },
+    {
+      "epoch": 0.2925984251968504,
+      "grad_norm": 45.44684982299805,
+      "learning_rate": 9.998543307086615e-06,
+      "loss": 0.4091,
+      "step": 929
+    },
+    {
+      "epoch": 0.2929133858267717,
+      "grad_norm": 32.81157684326172,
+      "learning_rate": 9.998541732283466e-06,
+      "loss": 0.4614,
+      "step": 930
+    },
+    {
+      "epoch": 0.29322834645669293,
+      "grad_norm": 45.81043243408203,
+      "learning_rate": 9.998540157480315e-06,
+      "loss": 0.4832,
+      "step": 931
+    },
+    {
+      "epoch": 0.2935433070866142,
+      "grad_norm": 42.5070915222168,
+      "learning_rate": 9.998538582677166e-06,
+      "loss": 0.352,
+      "step": 932
+    },
+    {
+      "epoch": 0.29385826771653545,
+      "grad_norm": 40.78940200805664,
+      "learning_rate": 9.998537007874017e-06,
+      "loss": 0.9412,
+      "step": 933
+    },
+    {
+      "epoch": 0.2941732283464567,
+      "grad_norm": 44.96437454223633,
+      "learning_rate": 9.998535433070868e-06,
+      "loss": 0.9758,
+      "step": 934
+    },
+    {
+      "epoch": 0.29448818897637796,
+      "grad_norm": 8.247536659240723,
+      "learning_rate": 9.998533858267717e-06,
+      "loss": 0.0863,
+      "step": 935
+    },
+    {
+      "epoch": 0.2948031496062992,
+      "grad_norm": 72.65038299560547,
+      "learning_rate": 9.998532283464568e-06,
+      "loss": 0.6317,
+      "step": 936
+    },
+    {
+      "epoch": 0.2951181102362205,
+      "grad_norm": 28.256349563598633,
+      "learning_rate": 9.998530708661418e-06,
+      "loss": 0.3451,
+      "step": 937
+    },
+    {
+      "epoch": 0.29543307086614173,
+      "grad_norm": 17.122854232788086,
+      "learning_rate": 9.998529133858269e-06,
+      "loss": 0.2172,
+      "step": 938
+    },
+    {
+      "epoch": 0.295748031496063,
+      "grad_norm": 25.158649444580078,
+      "learning_rate": 9.99852755905512e-06,
+      "loss": 0.3431,
+      "step": 939
+    },
+    {
+      "epoch": 0.29606299212598425,
+      "grad_norm": 48.18063735961914,
+      "learning_rate": 9.998525984251969e-06,
+      "loss": 0.6472,
+      "step": 940
+    },
+    {
+      "epoch": 0.29606299212598425,
+      "eval_loss": 0.4980691373348236,
+      "eval_runtime": 301.7364,
+      "eval_samples_per_second": 0.388,
+      "eval_steps_per_second": 0.388,
+      "step": 940
+    },
+    {
+      "epoch": 0.2963779527559055,
+      "grad_norm": 93.8964614868164,
+      "learning_rate": 9.99852440944882e-06,
+      "loss": 0.8136,
+      "step": 941
+    },
+    {
+      "epoch": 0.29669291338582676,
+      "grad_norm": 19.963516235351562,
+      "learning_rate": 9.99852283464567e-06,
+      "loss": 0.194,
+      "step": 942
+    },
+    {
+      "epoch": 0.297007874015748,
+      "grad_norm": 20.99871826171875,
+      "learning_rate": 9.99852125984252e-06,
+      "loss": 0.294,
+      "step": 943
+    },
+    {
+      "epoch": 0.2973228346456693,
+      "grad_norm": 43.991703033447266,
+      "learning_rate": 9.998519685039371e-06,
+      "loss": 0.4679,
+      "step": 944
+    },
+    {
+      "epoch": 0.29763779527559053,
+      "grad_norm": 48.08595275878906,
+      "learning_rate": 9.998518110236222e-06,
+      "loss": 0.6872,
+      "step": 945
+    },
+    {
+      "epoch": 0.2979527559055118,
+      "grad_norm": 14.398991584777832,
+      "learning_rate": 9.998516535433071e-06,
+      "loss": 0.1213,
+      "step": 946
+    },
+    {
+      "epoch": 0.29826771653543305,
+      "grad_norm": 60.385154724121094,
+      "learning_rate": 9.998514960629922e-06,
+      "loss": 0.9587,
+      "step": 947
+    },
+    {
+      "epoch": 0.2985826771653543,
+      "grad_norm": 23.951004028320312,
+      "learning_rate": 9.998513385826772e-06,
+      "loss": 0.1824,
+      "step": 948
+    },
+    {
+      "epoch": 0.2988976377952756,
+      "grad_norm": 24.64979362487793,
+      "learning_rate": 9.998511811023623e-06,
+      "loss": 0.2494,
+      "step": 949
+    },
+    {
+      "epoch": 0.2992125984251969,
+      "grad_norm": 33.05177307128906,
+      "learning_rate": 9.998510236220474e-06,
+      "loss": 0.8105,
+      "step": 950
+    },
+    {
+      "epoch": 0.29952755905511813,
+      "grad_norm": 59.83413314819336,
+      "learning_rate": 9.998508661417323e-06,
+      "loss": 0.6087,
+      "step": 951
+    },
+    {
+      "epoch": 0.2998425196850394,
+      "grad_norm": 79.51978302001953,
+      "learning_rate": 9.998507086614174e-06,
+      "loss": 0.8573,
+      "step": 952
+    },
+    {
+      "epoch": 0.30015748031496065,
+      "grad_norm": 59.22591018676758,
+      "learning_rate": 9.998505511811023e-06,
+      "loss": 0.7464,
+      "step": 953
+    },
+    {
+      "epoch": 0.3004724409448819,
+      "grad_norm": 53.090614318847656,
+      "learning_rate": 9.998503937007876e-06,
+      "loss": 0.6472,
+      "step": 954
+    },
+    {
+      "epoch": 0.30078740157480316,
+      "grad_norm": 35.13800048828125,
+      "learning_rate": 9.998502362204725e-06,
+      "loss": 0.2764,
+      "step": 955
+    },
+    {
+      "epoch": 0.3011023622047244,
+      "grad_norm": 18.853023529052734,
+      "learning_rate": 9.998500787401576e-06,
+      "loss": 0.1602,
+      "step": 956
+    },
+    {
+      "epoch": 0.3014173228346457,
+      "grad_norm": 13.263671875,
+      "learning_rate": 9.998499212598426e-06,
+      "loss": 0.1145,
+      "step": 957
+    },
+    {
+      "epoch": 0.30173228346456693,
+      "grad_norm": 38.00738525390625,
+      "learning_rate": 9.998497637795277e-06,
+      "loss": 0.1779,
+      "step": 958
+    },
+    {
+      "epoch": 0.3020472440944882,
+      "grad_norm": 29.51807403564453,
+      "learning_rate": 9.998496062992128e-06,
+      "loss": 0.3998,
+      "step": 959
+    },
+    {
+      "epoch": 0.30236220472440944,
+      "grad_norm": 49.63967514038086,
+      "learning_rate": 9.998494488188977e-06,
+      "loss": 1.2491,
+      "step": 960
+    },
+    {
+      "epoch": 0.30236220472440944,
+      "eval_loss": 0.505746066570282,
+      "eval_runtime": 303.3875,
+      "eval_samples_per_second": 0.386,
+      "eval_steps_per_second": 0.386,
+      "step": 960
+    },
+    {
+      "epoch": 0.3026771653543307,
+      "grad_norm": 22.166236877441406,
+      "learning_rate": 9.998492913385828e-06,
+      "loss": 0.2884,
+      "step": 961
+    },
+    {
+      "epoch": 0.30299212598425196,
+      "grad_norm": 9.977280616760254,
+      "learning_rate": 9.998491338582677e-06,
+      "loss": 0.0985,
+      "step": 962
+    },
+    {
+      "epoch": 0.3033070866141732,
+      "grad_norm": 51.85643005371094,
+      "learning_rate": 9.998489763779528e-06,
+      "loss": 0.4792,
+      "step": 963
+    },
+    {
+      "epoch": 0.3036220472440945,
+      "grad_norm": 11.42037582397461,
+      "learning_rate": 9.998488188976379e-06,
+      "loss": 0.0609,
+      "step": 964
+    },
+    {
+      "epoch": 0.30393700787401573,
+      "grad_norm": 29.179784774780273,
+      "learning_rate": 9.99848661417323e-06,
+      "loss": 0.2065,
+      "step": 965
+    },
+    {
+      "epoch": 0.304251968503937,
+      "grad_norm": 99.96176147460938,
+      "learning_rate": 9.99848503937008e-06,
+      "loss": 0.1855,
+      "step": 966
+    },
+    {
+      "epoch": 0.30456692913385824,
+      "grad_norm": 67.8636245727539,
+      "learning_rate": 9.99848346456693e-06,
+      "loss": 0.4098,
+      "step": 967
+    },
+    {
+      "epoch": 0.3048818897637795,
+      "grad_norm": 67.14287567138672,
+      "learning_rate": 9.99848188976378e-06,
+      "loss": 0.8962,
+      "step": 968
+    },
+    {
+      "epoch": 0.3051968503937008,
+      "grad_norm": 52.694175720214844,
+      "learning_rate": 9.99848031496063e-06,
+      "loss": 0.3033,
+      "step": 969
+    },
+    {
+      "epoch": 0.30551181102362207,
+      "grad_norm": 124.31986236572266,
+      "learning_rate": 9.998478740157482e-06,
+      "loss": 0.6216,
+      "step": 970
+    },
+    {
+      "epoch": 0.30582677165354333,
+      "grad_norm": 59.26445770263672,
+      "learning_rate": 9.998477165354331e-06,
+      "loss": 0.4527,
+      "step": 971
+    },
+    {
+      "epoch": 0.3061417322834646,
+      "grad_norm": 28.591516494750977,
+      "learning_rate": 9.998475590551182e-06,
+      "loss": 0.1072,
+      "step": 972
+    },
+    {
+      "epoch": 0.30645669291338584,
+      "grad_norm": 117.83760833740234,
+      "learning_rate": 9.998474015748031e-06,
+      "loss": 0.622,
+      "step": 973
+    },
+    {
+      "epoch": 0.3067716535433071,
+      "grad_norm": 34.50752639770508,
+      "learning_rate": 9.998472440944882e-06,
+      "loss": 0.4129,
+      "step": 974
+    },
+    {
+      "epoch": 0.30708661417322836,
+      "grad_norm": 8.890520095825195,
+      "learning_rate": 9.998470866141733e-06,
+      "loss": 0.0409,
+      "step": 975
+    },
+    {
+      "epoch": 0.3074015748031496,
+      "grad_norm": 52.015785217285156,
+      "learning_rate": 9.998469291338584e-06,
+      "loss": 0.4855,
+      "step": 976
+    },
+    {
+      "epoch": 0.30771653543307087,
+      "grad_norm": 23.1546688079834,
+      "learning_rate": 9.998467716535434e-06,
+      "loss": 0.1683,
+      "step": 977
+    },
+    {
+      "epoch": 0.3080314960629921,
+      "grad_norm": 26.71653938293457,
+      "learning_rate": 9.998466141732285e-06,
+      "loss": 0.1988,
+      "step": 978
+    },
+    {
+      "epoch": 0.3083464566929134,
+      "grad_norm": 55.756309509277344,
+      "learning_rate": 9.998464566929135e-06,
+      "loss": 0.6107,
+      "step": 979
+    },
+    {
+      "epoch": 0.30866141732283464,
+      "grad_norm": 38.31888198852539,
+      "learning_rate": 9.998462992125985e-06,
+      "loss": 0.3716,
+      "step": 980
+    },
+    {
+      "epoch": 0.30866141732283464,
+      "eval_loss": 0.4945707321166992,
+      "eval_runtime": 303.1038,
+      "eval_samples_per_second": 0.386,
+      "eval_steps_per_second": 0.386,
+      "step": 980
+    },
+    {
+      "epoch": 0.3089763779527559,
+      "grad_norm": 136.65208435058594,
+      "learning_rate": 9.998461417322836e-06,
+      "loss": 1.04,
+      "step": 981
+    },
+    {
+      "epoch": 0.30929133858267716,
+      "grad_norm": 70.81889343261719,
+      "learning_rate": 9.998459842519685e-06,
+      "loss": 0.7749,
+      "step": 982
+    },
+    {
+      "epoch": 0.3096062992125984,
+      "grad_norm": 81.47750854492188,
+      "learning_rate": 9.998458267716536e-06,
+      "loss": 0.4565,
+      "step": 983
+    },
+    {
+      "epoch": 0.30992125984251967,
+      "grad_norm": 84.3187255859375,
+      "learning_rate": 9.998456692913387e-06,
+      "loss": 1.1115,
+      "step": 984
+    },
+    {
+      "epoch": 0.3102362204724409,
+      "grad_norm": 121.83627319335938,
+      "learning_rate": 9.998455118110238e-06,
+      "loss": 0.4089,
+      "step": 985
+    },
+    {
+      "epoch": 0.3105511811023622,
+      "grad_norm": 46.44225311279297,
+      "learning_rate": 9.998453543307087e-06,
+      "loss": 0.6732,
+      "step": 986
+    },
+    {
+      "epoch": 0.31086614173228344,
+      "grad_norm": 60.36497497558594,
+      "learning_rate": 9.998451968503938e-06,
+      "loss": 0.3421,
+      "step": 987
+    },
+    {
+      "epoch": 0.3111811023622047,
+      "grad_norm": 34.300819396972656,
+      "learning_rate": 9.998450393700788e-06,
+      "loss": 0.1633,
+      "step": 988
+    },
+    {
+      "epoch": 0.311496062992126,
+      "grad_norm": 29.893108367919922,
+      "learning_rate": 9.998448818897639e-06,
+      "loss": 0.2435,
+      "step": 989
+    },
+    {
+      "epoch": 0.31181102362204727,
+      "grad_norm": 56.01438522338867,
+      "learning_rate": 9.99844724409449e-06,
+      "loss": 0.2676,
+      "step": 990
+    },
+    {
+      "epoch": 0.3121259842519685,
+      "grad_norm": 58.90812301635742,
+      "learning_rate": 9.998445669291339e-06,
+      "loss": 0.362,
+      "step": 991
+    },
+    {
+      "epoch": 0.3124409448818898,
+      "grad_norm": 54.015743255615234,
+      "learning_rate": 9.99844409448819e-06,
+      "loss": 0.3922,
+      "step": 992
+    },
+    {
+      "epoch": 0.31275590551181104,
+      "grad_norm": 43.66928482055664,
+      "learning_rate": 9.99844251968504e-06,
+      "loss": 0.3873,
+      "step": 993
+    },
+    {
+      "epoch": 0.3130708661417323,
+      "grad_norm": 37.3400764465332,
+      "learning_rate": 9.99844094488189e-06,
+      "loss": 0.1752,
+      "step": 994
+    },
+    {
+      "epoch": 0.31338582677165355,
+      "grad_norm": 22.93543243408203,
+      "learning_rate": 9.998439370078741e-06,
+      "loss": 0.2166,
+      "step": 995
+    },
+    {
+      "epoch": 0.3137007874015748,
+      "grad_norm": 54.434444427490234,
+      "learning_rate": 9.998437795275592e-06,
+      "loss": 0.6486,
+      "step": 996
+    },
+    {
+      "epoch": 0.31401574803149607,
+      "grad_norm": 29.223251342773438,
+      "learning_rate": 9.998436220472441e-06,
+      "loss": 0.2945,
+      "step": 997
+    },
+    {
+      "epoch": 0.3143307086614173,
+      "grad_norm": 40.362152099609375,
+      "learning_rate": 9.998434645669292e-06,
+      "loss": 0.6047,
+      "step": 998
+    },
+    {
+      "epoch": 0.3146456692913386,
+      "grad_norm": 17.493072509765625,
+      "learning_rate": 9.998433070866142e-06,
+      "loss": 0.3557,
+      "step": 999
+    },
+    {
+      "epoch": 0.31496062992125984,
+      "grad_norm": 32.33248519897461,
+      "learning_rate": 9.998431496062993e-06,
+      "loss": 0.1763,
+      "step": 1000
+    },
+    {
+      "epoch": 0.31496062992125984,
+      "eval_loss": 0.4581963121891022,
+      "eval_runtime": 306.4597,
+      "eval_samples_per_second": 0.382,
+      "eval_steps_per_second": 0.382,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3152755905511811,
+      "grad_norm": 41.288719177246094,
+      "learning_rate": 9.998429921259844e-06,
+      "loss": 0.3175,
+      "step": 1001
+    },
+    {
+      "epoch": 0.31559055118110235,
+      "grad_norm": 21.213180541992188,
+      "learning_rate": 9.998428346456693e-06,
+      "loss": 0.1175,
+      "step": 1002
+    },
+    {
+      "epoch": 0.3159055118110236,
+      "grad_norm": 35.53864669799805,
+      "learning_rate": 9.998426771653544e-06,
+      "loss": 0.2783,
+      "step": 1003
+    },
+    {
+      "epoch": 0.31622047244094487,
+      "grad_norm": 46.64375305175781,
+      "learning_rate": 9.998425196850395e-06,
+      "loss": 0.4337,
+      "step": 1004
+    },
+    {
+      "epoch": 0.3165354330708661,
+      "grad_norm": 25.14255142211914,
+      "learning_rate": 9.998423622047246e-06,
+      "loss": 0.3534,
+      "step": 1005
+    },
+    {
+      "epoch": 0.3168503937007874,
+      "grad_norm": 64.11309051513672,
+      "learning_rate": 9.998422047244095e-06,
+      "loss": 0.1756,
+      "step": 1006
+    },
+    {
+      "epoch": 0.31716535433070864,
+      "grad_norm": 54.7104377746582,
+      "learning_rate": 9.998420472440946e-06,
+      "loss": 0.9319,
+      "step": 1007
+    },
+    {
+      "epoch": 0.3174803149606299,
+      "grad_norm": 84.07182312011719,
+      "learning_rate": 9.998418897637796e-06,
+      "loss": 0.9422,
+      "step": 1008
+    },
+    {
+      "epoch": 0.3177952755905512,
+      "grad_norm": 46.56437683105469,
+      "learning_rate": 9.998417322834647e-06,
+      "loss": 0.2973,
+      "step": 1009
+    },
+    {
+      "epoch": 0.31811023622047246,
+      "grad_norm": 39.493682861328125,
+      "learning_rate": 9.998415748031498e-06,
+      "loss": 0.4972,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3184251968503937,
+      "grad_norm": 51.46127700805664,
+      "learning_rate": 9.998414173228347e-06,
+      "loss": 0.6015,
+      "step": 1011
+    },
+    {
+      "epoch": 0.318740157480315,
+      "grad_norm": 27.754695892333984,
+      "learning_rate": 9.998412598425198e-06,
+      "loss": 0.2744,
+      "step": 1012
+    },
+    {
+      "epoch": 0.31905511811023624,
+      "grad_norm": 26.403295516967773,
+      "learning_rate": 9.998411023622047e-06,
+      "loss": 0.2464,
+      "step": 1013
+    },
+    {
+      "epoch": 0.3193700787401575,
+      "grad_norm": 66.5213623046875,
+      "learning_rate": 9.998409448818898e-06,
+      "loss": 0.3692,
+      "step": 1014
+    },
+    {
+      "epoch": 0.31968503937007875,
+      "grad_norm": 35.4731559753418,
+      "learning_rate": 9.998407874015749e-06,
+      "loss": 0.3412,
+      "step": 1015
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 23.23394203186035,
+      "learning_rate": 9.9984062992126e-06,
+      "loss": 0.4104,
+      "step": 1016
+    },
+    {
+      "epoch": 0.32031496062992126,
+      "grad_norm": 44.30449676513672,
+      "learning_rate": 9.99840472440945e-06,
+      "loss": 0.4564,
+      "step": 1017
+    },
+    {
+      "epoch": 0.3206299212598425,
+      "grad_norm": 18.734786987304688,
+      "learning_rate": 9.9984031496063e-06,
+      "loss": 0.2802,
+      "step": 1018
+    },
+    {
+      "epoch": 0.3209448818897638,
+      "grad_norm": 40.950653076171875,
+      "learning_rate": 9.99840157480315e-06,
+      "loss": 0.3802,
+      "step": 1019
+    },
+    {
+      "epoch": 0.32125984251968503,
+      "grad_norm": 97.77069091796875,
+      "learning_rate": 9.9984e-06,
+      "loss": 0.467,
+      "step": 1020
+    },
+    {
+      "epoch": 0.32125984251968503,
+      "eval_loss": 0.4887102544307709,
+      "eval_runtime": 304.7183,
+      "eval_samples_per_second": 0.384,
+      "eval_steps_per_second": 0.384,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3215748031496063,
+      "grad_norm": 131.0202178955078,
+      "learning_rate": 9.998398425196852e-06,
+      "loss": 0.9191,
+      "step": 1021
+    },
+    {
+      "epoch": 0.32188976377952755,
+      "grad_norm": 72.55668640136719,
+      "learning_rate": 9.998396850393701e-06,
+      "loss": 0.9462,
+      "step": 1022
+    },
+    {
+      "epoch": 0.3222047244094488,
+      "grad_norm": 47.58323669433594,
+      "learning_rate": 9.998395275590552e-06,
+      "loss": 0.7648,
+      "step": 1023
+    },
+    {
+      "epoch": 0.32251968503937006,
+      "grad_norm": 15.815386772155762,
+      "learning_rate": 9.998393700787403e-06,
+      "loss": 0.1747,
+      "step": 1024
+    },
+    {
+      "epoch": 0.3228346456692913,
+      "grad_norm": 23.02329444885254,
+      "learning_rate": 9.998392125984254e-06,
+      "loss": 0.2737,
+      "step": 1025
+    },
+    {
+      "epoch": 0.3231496062992126,
+      "grad_norm": 69.93936157226562,
+      "learning_rate": 9.998390551181103e-06,
+      "loss": 0.6328,
+      "step": 1026
+    },
+    {
+      "epoch": 0.32346456692913383,
+      "grad_norm": 44.680335998535156,
+      "learning_rate": 9.998388976377954e-06,
+      "loss": 0.2808,
+      "step": 1027
+    },
+    {
+      "epoch": 0.3237795275590551,
+      "grad_norm": 61.480506896972656,
+      "learning_rate": 9.998387401574804e-06,
+      "loss": 0.3987,
+      "step": 1028
+    },
+    {
+      "epoch": 0.3240944881889764,
+      "grad_norm": 49.2515754699707,
+      "learning_rate": 9.998385826771655e-06,
+      "loss": 0.2713,
+      "step": 1029
+    },
+    {
+      "epoch": 0.32440944881889766,
+      "grad_norm": 39.322608947753906,
+      "learning_rate": 9.998384251968506e-06,
+      "loss": 0.5236,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3247244094488189,
+      "grad_norm": 50.91118621826172,
+      "learning_rate": 9.998382677165355e-06,
+      "loss": 0.3365,
+      "step": 1031
+    },
+    {
+      "epoch": 0.3250393700787402,
+      "grad_norm": 57.56412887573242,
+      "learning_rate": 9.998381102362206e-06,
+      "loss": 0.5609,
+      "step": 1032
+    },
+    {
+      "epoch": 0.32535433070866143,
+      "grad_norm": 106.12908935546875,
+      "learning_rate": 9.998379527559055e-06,
+      "loss": 0.6946,
+      "step": 1033
+    },
+    {
+      "epoch": 0.3256692913385827,
+      "grad_norm": 28.783655166625977,
+      "learning_rate": 9.998377952755906e-06,
+      "loss": 0.3571,
+      "step": 1034
+    },
+    {
+      "epoch": 0.32598425196850395,
+      "grad_norm": 35.49094772338867,
+      "learning_rate": 9.998376377952757e-06,
+      "loss": 0.2201,
+      "step": 1035
+    },
+    {
+      "epoch": 0.3262992125984252,
+      "grad_norm": 41.67929458618164,
+      "learning_rate": 9.998374803149608e-06,
+      "loss": 0.2486,
+      "step": 1036
+    },
+    {
+      "epoch": 0.32661417322834646,
+      "grad_norm": 55.53398132324219,
+      "learning_rate": 9.998373228346457e-06,
+      "loss": 0.6653,
+      "step": 1037
+    },
+    {
+      "epoch": 0.3269291338582677,
+      "grad_norm": 63.74702835083008,
+      "learning_rate": 9.998371653543308e-06,
+      "loss": 0.3952,
+      "step": 1038
+    },
+    {
+      "epoch": 0.327244094488189,
+      "grad_norm": 43.730770111083984,
+      "learning_rate": 9.998370078740158e-06,
+      "loss": 0.2713,
+      "step": 1039
+    },
+    {
+      "epoch": 0.32755905511811023,
+      "grad_norm": 20.90167808532715,
+      "learning_rate": 9.998368503937009e-06,
+      "loss": 0.3298,
+      "step": 1040
+    },
+    {
+      "epoch": 0.32755905511811023,
+      "eval_loss": 0.553530216217041,
+      "eval_runtime": 338.2916,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3278740157480315,
+      "grad_norm": 53.132686614990234,
+      "learning_rate": 9.99836692913386e-06,
+      "loss": 0.2454,
+      "step": 1041
+    },
+    {
+      "epoch": 0.32818897637795275,
+      "grad_norm": 62.30380630493164,
+      "learning_rate": 9.998365354330709e-06,
+      "loss": 0.6391,
+      "step": 1042
+    },
+    {
+      "epoch": 0.328503937007874,
+      "grad_norm": 34.83534622192383,
+      "learning_rate": 9.99836377952756e-06,
+      "loss": 0.2441,
+      "step": 1043
+    },
+    {
+      "epoch": 0.32881889763779526,
+      "grad_norm": 37.7114372253418,
+      "learning_rate": 9.99836220472441e-06,
+      "loss": 0.1986,
+      "step": 1044
+    },
+    {
+      "epoch": 0.3291338582677165,
+      "grad_norm": 26.971843719482422,
+      "learning_rate": 9.99836062992126e-06,
+      "loss": 0.1753,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3294488188976378,
+      "grad_norm": 64.88057708740234,
+      "learning_rate": 9.998359055118111e-06,
+      "loss": 0.3672,
+      "step": 1046
+    },
+    {
+      "epoch": 0.32976377952755903,
+      "grad_norm": 16.45389747619629,
+      "learning_rate": 9.998357480314962e-06,
+      "loss": 0.0662,
+      "step": 1047
+    },
+    {
+      "epoch": 0.3300787401574803,
+      "grad_norm": 53.416934967041016,
+      "learning_rate": 9.998355905511811e-06,
+      "loss": 0.6081,
+      "step": 1048
+    },
+    {
+      "epoch": 0.3303937007874016,
+      "grad_norm": 44.44949722290039,
+      "learning_rate": 9.998354330708662e-06,
+      "loss": 0.2946,
+      "step": 1049
+    },
+    {
+      "epoch": 0.33070866141732286,
+      "grad_norm": 85.5646743774414,
+      "learning_rate": 9.998352755905513e-06,
+      "loss": 0.7022,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3310236220472441,
+      "grad_norm": 158.97926330566406,
+      "learning_rate": 9.998351181102363e-06,
+      "loss": 0.3364,
+      "step": 1051
+    },
+    {
+      "epoch": 0.33133858267716537,
+      "grad_norm": 53.93890380859375,
+      "learning_rate": 9.998349606299214e-06,
+      "loss": 0.0856,
+      "step": 1052
+    },
+    {
+      "epoch": 0.33165354330708663,
+      "grad_norm": 58.113338470458984,
+      "learning_rate": 9.998348031496063e-06,
+      "loss": 0.4051,
+      "step": 1053
+    },
+    {
+      "epoch": 0.3319685039370079,
+      "grad_norm": 113.30328369140625,
+      "learning_rate": 9.998346456692914e-06,
+      "loss": 1.3079,
+      "step": 1054
+    },
+    {
+      "epoch": 0.33228346456692914,
+      "grad_norm": 52.5933952331543,
+      "learning_rate": 9.998344881889765e-06,
+      "loss": 0.2001,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3325984251968504,
+      "grad_norm": 111.05465698242188,
+      "learning_rate": 9.998343307086616e-06,
+      "loss": 0.8016,
+      "step": 1056
+    },
+    {
+      "epoch": 0.33291338582677166,
+      "grad_norm": 80.2440185546875,
+      "learning_rate": 9.998341732283465e-06,
+      "loss": 0.6905,
+      "step": 1057
+    },
+    {
+      "epoch": 0.3332283464566929,
+      "grad_norm": 46.98655319213867,
+      "learning_rate": 9.998340157480316e-06,
+      "loss": 0.645,
+      "step": 1058
+    },
+    {
+      "epoch": 0.33354330708661417,
+      "grad_norm": 88.55839538574219,
+      "learning_rate": 9.998338582677166e-06,
+      "loss": 0.632,
+      "step": 1059
+    },
+    {
+      "epoch": 0.33385826771653543,
+      "grad_norm": 45.15827178955078,
+      "learning_rate": 9.998337007874017e-06,
+      "loss": 0.3994,
+      "step": 1060
+    },
+    {
+      "epoch": 0.33385826771653543,
+      "eval_loss": 0.49371591210365295,
+      "eval_runtime": 338.6729,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3341732283464567,
+      "grad_norm": 89.61175537109375,
+      "learning_rate": 9.998335433070868e-06,
+      "loss": 0.7211,
+      "step": 1061
+    },
+    {
+      "epoch": 0.33448818897637794,
+      "grad_norm": 43.96299362182617,
+      "learning_rate": 9.998333858267717e-06,
+      "loss": 0.2502,
+      "step": 1062
+    },
+    {
+      "epoch": 0.3348031496062992,
+      "grad_norm": 48.51971435546875,
+      "learning_rate": 9.998332283464568e-06,
+      "loss": 0.2617,
+      "step": 1063
+    },
+    {
+      "epoch": 0.33511811023622046,
+      "grad_norm": 56.80341720581055,
+      "learning_rate": 9.998330708661417e-06,
+      "loss": 0.315,
+      "step": 1064
+    },
+    {
+      "epoch": 0.3354330708661417,
+      "grad_norm": 24.417354583740234,
+      "learning_rate": 9.998329133858268e-06,
+      "loss": 0.2498,
+      "step": 1065
+    },
+    {
+      "epoch": 0.33574803149606297,
+      "grad_norm": 21.887855529785156,
+      "learning_rate": 9.99832755905512e-06,
+      "loss": 0.1483,
+      "step": 1066
+    },
+    {
+      "epoch": 0.33606299212598423,
+      "grad_norm": 52.2086181640625,
+      "learning_rate": 9.99832598425197e-06,
+      "loss": 0.3647,
+      "step": 1067
+    },
+    {
+      "epoch": 0.3363779527559055,
+      "grad_norm": 50.21038818359375,
+      "learning_rate": 9.99832440944882e-06,
+      "loss": 0.5818,
+      "step": 1068
+    },
+    {
+      "epoch": 0.3366929133858268,
+      "grad_norm": 25.27126693725586,
+      "learning_rate": 9.99832283464567e-06,
+      "loss": 0.1273,
+      "step": 1069
+    },
+    {
+      "epoch": 0.33700787401574805,
+      "grad_norm": 36.34380340576172,
+      "learning_rate": 9.99832125984252e-06,
+      "loss": 0.3509,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3373228346456693,
+      "grad_norm": 35.567527770996094,
+      "learning_rate": 9.99831968503937e-06,
+      "loss": 0.2472,
+      "step": 1071
+    },
+    {
+      "epoch": 0.33763779527559057,
+      "grad_norm": 39.77647399902344,
+      "learning_rate": 9.998318110236222e-06,
+      "loss": 0.5746,
+      "step": 1072
+    },
+    {
+      "epoch": 0.3379527559055118,
+      "grad_norm": 10.02043628692627,
+      "learning_rate": 9.998316535433071e-06,
+      "loss": 0.0749,
+      "step": 1073
+    },
+    {
+      "epoch": 0.3382677165354331,
+      "grad_norm": 31.038677215576172,
+      "learning_rate": 9.998314960629922e-06,
+      "loss": 0.2673,
+      "step": 1074
+    },
+    {
+      "epoch": 0.33858267716535434,
+      "grad_norm": 34.27322769165039,
+      "learning_rate": 9.998313385826773e-06,
+      "loss": 0.1064,
+      "step": 1075
+    },
+    {
+      "epoch": 0.3388976377952756,
+      "grad_norm": 91.75303649902344,
+      "learning_rate": 9.998311811023624e-06,
+      "loss": 0.7232,
+      "step": 1076
+    },
+    {
+      "epoch": 0.33921259842519685,
+      "grad_norm": 68.7430419921875,
+      "learning_rate": 9.998310236220473e-06,
+      "loss": 0.5586,
+      "step": 1077
+    },
+    {
+      "epoch": 0.3395275590551181,
+      "grad_norm": 94.78008270263672,
+      "learning_rate": 9.998308661417324e-06,
+      "loss": 0.5504,
+      "step": 1078
+    },
+    {
+      "epoch": 0.33984251968503937,
+      "grad_norm": 54.0759162902832,
+      "learning_rate": 9.998307086614174e-06,
+      "loss": 0.1873,
+      "step": 1079
+    },
+    {
+      "epoch": 0.3401574803149606,
+      "grad_norm": 65.1077651977539,
+      "learning_rate": 9.998305511811025e-06,
+      "loss": 1.0425,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3401574803149606,
+      "eval_loss": 0.6948055028915405,
+      "eval_runtime": 327.3153,
+      "eval_samples_per_second": 0.357,
+      "eval_steps_per_second": 0.357,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3404724409448819,
+      "grad_norm": 79.78382873535156,
+      "learning_rate": 9.998303937007876e-06,
+      "loss": 0.6999,
+      "step": 1081
+    },
+    {
+      "epoch": 0.34078740157480314,
+      "grad_norm": 97.7957534790039,
+      "learning_rate": 9.998302362204725e-06,
+      "loss": 0.4083,
+      "step": 1082
+    },
+    {
+      "epoch": 0.3411023622047244,
+      "grad_norm": 90.36141967773438,
+      "learning_rate": 9.998300787401576e-06,
+      "loss": 0.7225,
+      "step": 1083
+    },
+    {
+      "epoch": 0.34141732283464565,
+      "grad_norm": 14.357733726501465,
+      "learning_rate": 9.998299212598425e-06,
+      "loss": 0.0595,
+      "step": 1084
+    },
+    {
+      "epoch": 0.3417322834645669,
+      "grad_norm": 47.068233489990234,
+      "learning_rate": 9.998297637795276e-06,
+      "loss": 0.7842,
+      "step": 1085
+    },
+    {
+      "epoch": 0.34204724409448817,
+      "grad_norm": 22.276060104370117,
+      "learning_rate": 9.998296062992127e-06,
+      "loss": 0.1631,
+      "step": 1086
+    },
+    {
+      "epoch": 0.3423622047244094,
+      "grad_norm": 38.76866149902344,
+      "learning_rate": 9.998294488188978e-06,
+      "loss": 0.2738,
+      "step": 1087
+    },
+    {
+      "epoch": 0.3426771653543307,
+      "grad_norm": 45.607505798339844,
+      "learning_rate": 9.998292913385827e-06,
+      "loss": 0.4995,
+      "step": 1088
+    },
+    {
+      "epoch": 0.342992125984252,
+      "grad_norm": 34.9421272277832,
+      "learning_rate": 9.998291338582678e-06,
+      "loss": 0.614,
+      "step": 1089
+    },
+    {
+      "epoch": 0.34330708661417325,
+      "grad_norm": 36.95371627807617,
+      "learning_rate": 9.998289763779528e-06,
+      "loss": 0.6949,
+      "step": 1090
+    },
+    {
+      "epoch": 0.3436220472440945,
+      "grad_norm": 32.992279052734375,
+      "learning_rate": 9.998288188976379e-06,
+      "loss": 0.2076,
+      "step": 1091
+    },
+    {
+      "epoch": 0.34393700787401577,
+      "grad_norm": 15.995903015136719,
+      "learning_rate": 9.99828661417323e-06,
+      "loss": 0.1538,
+      "step": 1092
+    },
+    {
+      "epoch": 0.344251968503937,
+      "grad_norm": 38.209495544433594,
+      "learning_rate": 9.998285039370079e-06,
+      "loss": 0.5808,
+      "step": 1093
+    },
+    {
+      "epoch": 0.3445669291338583,
+      "grad_norm": 52.266441345214844,
+      "learning_rate": 9.99828346456693e-06,
+      "loss": 0.3555,
+      "step": 1094
+    },
+    {
+      "epoch": 0.34488188976377954,
+      "grad_norm": 11.709747314453125,
+      "learning_rate": 9.998281889763781e-06,
+      "loss": 0.0999,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3451968503937008,
+      "grad_norm": 37.43159484863281,
+      "learning_rate": 9.998280314960632e-06,
+      "loss": 0.5477,
+      "step": 1096
+    },
+    {
+      "epoch": 0.34551181102362205,
+      "grad_norm": 31.790834426879883,
+      "learning_rate": 9.998278740157481e-06,
+      "loss": 0.4586,
+      "step": 1097
+    },
+    {
+      "epoch": 0.3458267716535433,
+      "grad_norm": 15.95163631439209,
+      "learning_rate": 9.998277165354332e-06,
+      "loss": 0.1477,
+      "step": 1098
+    },
+    {
+      "epoch": 0.34614173228346456,
+      "grad_norm": 57.9958610534668,
+      "learning_rate": 9.998275590551182e-06,
+      "loss": 0.2226,
+      "step": 1099
+    },
+    {
+      "epoch": 0.3464566929133858,
+      "grad_norm": 31.550888061523438,
+      "learning_rate": 9.998274015748033e-06,
+      "loss": 0.3224,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3464566929133858,
+      "eval_loss": 0.5266521573066711,
+      "eval_runtime": 337.1873,
+      "eval_samples_per_second": 0.347,
+      "eval_steps_per_second": 0.347,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3467716535433071,
+      "grad_norm": 27.716707229614258,
+      "learning_rate": 9.998272440944883e-06,
+      "loss": 0.2458,
+      "step": 1101
+    },
+    {
+      "epoch": 0.34708661417322834,
+      "grad_norm": 36.678863525390625,
+      "learning_rate": 9.998270866141733e-06,
+      "loss": 0.315,
+      "step": 1102
+    },
+    {
+      "epoch": 0.3474015748031496,
+      "grad_norm": 21.499208450317383,
+      "learning_rate": 9.998269291338584e-06,
+      "loss": 0.2061,
+      "step": 1103
+    },
+    {
+      "epoch": 0.34771653543307085,
+      "grad_norm": 63.6679801940918,
+      "learning_rate": 9.998267716535433e-06,
+      "loss": 1.0103,
+      "step": 1104
+    },
+    {
+      "epoch": 0.3480314960629921,
+      "grad_norm": 34.54896545410156,
+      "learning_rate": 9.998266141732284e-06,
+      "loss": 0.5376,
+      "step": 1105
+    },
+    {
+      "epoch": 0.34834645669291336,
+      "grad_norm": 16.613237380981445,
+      "learning_rate": 9.998264566929135e-06,
+      "loss": 0.0988,
+      "step": 1106
+    },
+    {
+      "epoch": 0.3486614173228346,
+      "grad_norm": 73.5743179321289,
+      "learning_rate": 9.998262992125986e-06,
+      "loss": 0.8173,
+      "step": 1107
+    },
+    {
+      "epoch": 0.3489763779527559,
+      "grad_norm": 6.472119331359863,
+      "learning_rate": 9.998261417322835e-06,
+      "loss": 0.0331,
+      "step": 1108
+    },
+    {
+      "epoch": 0.3492913385826772,
+      "grad_norm": 24.923892974853516,
+      "learning_rate": 9.998259842519686e-06,
+      "loss": 0.1636,
+      "step": 1109
+    },
+    {
+      "epoch": 0.34960629921259845,
+      "grad_norm": 76.55589294433594,
+      "learning_rate": 9.998258267716536e-06,
+      "loss": 0.5014,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3499212598425197,
+      "grad_norm": 15.523338317871094,
+      "learning_rate": 9.998256692913387e-06,
+      "loss": 0.0701,
+      "step": 1111
+    },
+    {
+      "epoch": 0.35023622047244096,
+      "grad_norm": 72.51299285888672,
+      "learning_rate": 9.998255118110238e-06,
+      "loss": 0.5609,
+      "step": 1112
+    },
+    {
+      "epoch": 0.3505511811023622,
+      "grad_norm": 41.60987854003906,
+      "learning_rate": 9.998253543307087e-06,
+      "loss": 0.2,
+      "step": 1113
+    },
+    {
+      "epoch": 0.3508661417322835,
+      "grad_norm": 4.41101598739624,
+      "learning_rate": 9.998251968503938e-06,
+      "loss": 0.0192,
+      "step": 1114
+    },
+    {
+      "epoch": 0.35118110236220473,
+      "grad_norm": 69.58025360107422,
+      "learning_rate": 9.998250393700787e-06,
+      "loss": 0.8551,
+      "step": 1115
+    },
+    {
+      "epoch": 0.351496062992126,
+      "grad_norm": 34.081336975097656,
+      "learning_rate": 9.998248818897638e-06,
+      "loss": 0.6141,
+      "step": 1116
+    },
+    {
+      "epoch": 0.35181102362204725,
+      "grad_norm": 46.96076965332031,
+      "learning_rate": 9.99824724409449e-06,
+      "loss": 0.8069,
+      "step": 1117
+    },
+    {
+      "epoch": 0.3521259842519685,
+      "grad_norm": 88.97361755371094,
+      "learning_rate": 9.99824566929134e-06,
+      "loss": 0.3247,
+      "step": 1118
+    },
+    {
+      "epoch": 0.35244094488188976,
+      "grad_norm": 16.12812042236328,
+      "learning_rate": 9.99824409448819e-06,
+      "loss": 0.0924,
+      "step": 1119
+    },
+    {
+      "epoch": 0.352755905511811,
+      "grad_norm": 3.950244188308716,
+      "learning_rate": 9.99824251968504e-06,
+      "loss": 0.0298,
+      "step": 1120
+    },
+    {
+      "epoch": 0.352755905511811,
+      "eval_loss": 0.5020915865898132,
+      "eval_runtime": 340.8492,
+      "eval_samples_per_second": 0.343,
+      "eval_steps_per_second": 0.343,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3530708661417323,
+      "grad_norm": 86.11933898925781,
+      "learning_rate": 9.998240944881891e-06,
+      "loss": 0.9179,
+      "step": 1121
+    },
+    {
+      "epoch": 0.35338582677165353,
+      "grad_norm": 41.664955139160156,
+      "learning_rate": 9.99823937007874e-06,
+      "loss": 0.1796,
+      "step": 1122
+    },
+    {
+      "epoch": 0.3537007874015748,
+      "grad_norm": 22.196773529052734,
+      "learning_rate": 9.998237795275592e-06,
+      "loss": 0.143,
+      "step": 1123
+    },
+    {
+      "epoch": 0.35401574803149605,
+      "grad_norm": 57.799415588378906,
+      "learning_rate": 9.998236220472441e-06,
+      "loss": 0.441,
+      "step": 1124
+    },
+    {
+      "epoch": 0.3543307086614173,
+      "grad_norm": 23.602643966674805,
+      "learning_rate": 9.998234645669292e-06,
+      "loss": 0.325,
+      "step": 1125
+    },
+    {
+      "epoch": 0.35464566929133856,
+      "grad_norm": 49.98581314086914,
+      "learning_rate": 9.998233070866143e-06,
+      "loss": 0.5312,
+      "step": 1126
+    },
+    {
+      "epoch": 0.3549606299212598,
+      "grad_norm": 32.001861572265625,
+      "learning_rate": 9.998231496062994e-06,
+      "loss": 0.2678,
+      "step": 1127
+    },
+    {
+      "epoch": 0.3552755905511811,
+      "grad_norm": 22.768354415893555,
+      "learning_rate": 9.998229921259843e-06,
+      "loss": 0.2655,
+      "step": 1128
+    },
+    {
+      "epoch": 0.3555905511811024,
+      "grad_norm": 10.575422286987305,
+      "learning_rate": 9.998228346456694e-06,
+      "loss": 0.067,
+      "step": 1129
+    },
+    {
+      "epoch": 0.35590551181102364,
+      "grad_norm": 23.114152908325195,
+      "learning_rate": 9.998226771653544e-06,
+      "loss": 0.1502,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3562204724409449,
+      "grad_norm": 55.854827880859375,
+      "learning_rate": 9.998225196850395e-06,
+      "loss": 0.7007,
+      "step": 1131
+    },
+    {
+      "epoch": 0.35653543307086616,
+      "grad_norm": 43.69165802001953,
+      "learning_rate": 9.998223622047246e-06,
+      "loss": 0.2763,
+      "step": 1132
+    },
+    {
+      "epoch": 0.3568503937007874,
+      "grad_norm": 10.879396438598633,
+      "learning_rate": 9.998222047244095e-06,
+      "loss": 0.0361,
+      "step": 1133
+    },
+    {
+      "epoch": 0.3571653543307087,
+      "grad_norm": 78.35888671875,
+      "learning_rate": 9.998220472440946e-06,
+      "loss": 0.089,
+      "step": 1134
+    },
+    {
+      "epoch": 0.35748031496062993,
+      "grad_norm": 24.78093147277832,
+      "learning_rate": 9.998218897637795e-06,
+      "loss": 0.1069,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3577952755905512,
+      "grad_norm": 46.83030700683594,
+      "learning_rate": 9.998217322834646e-06,
+      "loss": 0.472,
+      "step": 1136
+    },
+    {
+      "epoch": 0.35811023622047244,
+      "grad_norm": 15.252365112304688,
+      "learning_rate": 9.998215748031497e-06,
+      "loss": 0.0778,
+      "step": 1137
+    },
+    {
+      "epoch": 0.3584251968503937,
+      "grad_norm": 15.580936431884766,
+      "learning_rate": 9.998214173228348e-06,
+      "loss": 0.1253,
+      "step": 1138
+    },
+    {
+      "epoch": 0.35874015748031496,
+      "grad_norm": 2.145813226699829,
+      "learning_rate": 9.998212598425197e-06,
+      "loss": 0.0093,
+      "step": 1139
+    },
+    {
+      "epoch": 0.3590551181102362,
+      "grad_norm": 204.69932556152344,
+      "learning_rate": 9.998211023622047e-06,
+      "loss": 1.3589,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3590551181102362,
+      "eval_loss": 0.5789304971694946,
+      "eval_runtime": 339.4483,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3593700787401575,
+      "grad_norm": 134.10121154785156,
+      "learning_rate": 9.998209448818898e-06,
+      "loss": 0.3836,
+      "step": 1141
+    },
+    {
+      "epoch": 0.35968503937007873,
+      "grad_norm": 49.37288284301758,
+      "learning_rate": 9.998207874015749e-06,
+      "loss": 0.5611,
+      "step": 1142
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 42.33864212036133,
+      "learning_rate": 9.9982062992126e-06,
+      "loss": 0.1906,
+      "step": 1143
+    },
+    {
+      "epoch": 0.36031496062992124,
+      "grad_norm": 75.55062866210938,
+      "learning_rate": 9.998204724409449e-06,
+      "loss": 0.1209,
+      "step": 1144
+    },
+    {
+      "epoch": 0.3606299212598425,
+      "grad_norm": 30.669452667236328,
+      "learning_rate": 9.9982031496063e-06,
+      "loss": 0.097,
+      "step": 1145
+    },
+    {
+      "epoch": 0.36094488188976376,
+      "grad_norm": 96.1650390625,
+      "learning_rate": 9.998201574803151e-06,
+      "loss": 0.9096,
+      "step": 1146
+    },
+    {
+      "epoch": 0.361259842519685,
+      "grad_norm": 108.84801483154297,
+      "learning_rate": 9.998200000000002e-06,
+      "loss": 0.4399,
+      "step": 1147
+    },
+    {
+      "epoch": 0.36157480314960627,
+      "grad_norm": 101.94995880126953,
+      "learning_rate": 9.998198425196851e-06,
+      "loss": 0.5331,
+      "step": 1148
+    },
+    {
+      "epoch": 0.3618897637795276,
+      "grad_norm": 42.10065841674805,
+      "learning_rate": 9.998196850393702e-06,
+      "loss": 0.2717,
+      "step": 1149
+    },
+    {
+      "epoch": 0.36220472440944884,
+      "grad_norm": 34.27854919433594,
+      "learning_rate": 9.998195275590552e-06,
+      "loss": 0.0578,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3625196850393701,
+      "grad_norm": 80.90559387207031,
+      "learning_rate": 9.998193700787403e-06,
+      "loss": 0.2224,
+      "step": 1151
+    },
+    {
+      "epoch": 0.36283464566929136,
+      "grad_norm": 89.94649505615234,
+      "learning_rate": 9.998192125984254e-06,
+      "loss": 0.2526,
+      "step": 1152
+    },
+    {
+      "epoch": 0.3631496062992126,
+      "grad_norm": 96.36505126953125,
+      "learning_rate": 9.998190551181103e-06,
+      "loss": 0.521,
+      "step": 1153
+    },
+    {
+      "epoch": 0.36346456692913387,
+      "grad_norm": 177.08819580078125,
+      "learning_rate": 9.998188976377954e-06,
+      "loss": 0.6533,
+      "step": 1154
+    },
+    {
+      "epoch": 0.3637795275590551,
+      "grad_norm": 60.227481842041016,
+      "learning_rate": 9.998187401574803e-06,
+      "loss": 0.8652,
+      "step": 1155
+    },
+    {
+      "epoch": 0.3640944881889764,
+      "grad_norm": 69.29286193847656,
+      "learning_rate": 9.998185826771654e-06,
+      "loss": 1.2497,
+      "step": 1156
+    },
+    {
+      "epoch": 0.36440944881889764,
+      "grad_norm": 59.95965576171875,
+      "learning_rate": 9.998184251968505e-06,
+      "loss": 0.4519,
+      "step": 1157
+    },
+    {
+      "epoch": 0.3647244094488189,
+      "grad_norm": 61.35934066772461,
+      "learning_rate": 9.998182677165356e-06,
+      "loss": 0.2029,
+      "step": 1158
+    },
+    {
+      "epoch": 0.36503937007874016,
+      "grad_norm": 32.42390060424805,
+      "learning_rate": 9.998181102362205e-06,
+      "loss": 0.5222,
+      "step": 1159
+    },
+    {
+      "epoch": 0.3653543307086614,
+      "grad_norm": 58.62075424194336,
+      "learning_rate": 9.998179527559055e-06,
+      "loss": 0.6079,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3653543307086614,
+      "eval_loss": 0.5354205965995789,
+      "eval_runtime": 339.1978,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1160
+    },
+    {
+      "epoch": 0.36566929133858267,
+      "grad_norm": 61.64841079711914,
+      "learning_rate": 9.998177952755906e-06,
+      "loss": 0.603,
+      "step": 1161
+    },
+    {
+      "epoch": 0.3659842519685039,
+      "grad_norm": 34.8085823059082,
+      "learning_rate": 9.998176377952757e-06,
+      "loss": 0.3863,
+      "step": 1162
+    },
+    {
+      "epoch": 0.3662992125984252,
+      "grad_norm": 25.715442657470703,
+      "learning_rate": 9.998174803149608e-06,
+      "loss": 0.2857,
+      "step": 1163
+    },
+    {
+      "epoch": 0.36661417322834644,
+      "grad_norm": 33.884483337402344,
+      "learning_rate": 9.998173228346457e-06,
+      "loss": 0.549,
+      "step": 1164
+    },
+    {
+      "epoch": 0.3669291338582677,
+      "grad_norm": 24.125484466552734,
+      "learning_rate": 9.998171653543308e-06,
+      "loss": 0.2794,
+      "step": 1165
+    },
+    {
+      "epoch": 0.36724409448818895,
+      "grad_norm": 67.11617279052734,
+      "learning_rate": 9.998170078740159e-06,
+      "loss": 0.6092,
+      "step": 1166
+    },
+    {
+      "epoch": 0.3675590551181102,
+      "grad_norm": 23.704805374145508,
+      "learning_rate": 9.99816850393701e-06,
+      "loss": 0.4884,
+      "step": 1167
+    },
+    {
+      "epoch": 0.36787401574803147,
+      "grad_norm": 25.822975158691406,
+      "learning_rate": 9.99816692913386e-06,
+      "loss": 0.5046,
+      "step": 1168
+    },
+    {
+      "epoch": 0.3681889763779528,
+      "grad_norm": 31.311058044433594,
+      "learning_rate": 9.99816535433071e-06,
+      "loss": 0.3914,
+      "step": 1169
+    },
+    {
+      "epoch": 0.36850393700787404,
+      "grad_norm": 18.38756561279297,
+      "learning_rate": 9.99816377952756e-06,
+      "loss": 0.4273,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3688188976377953,
+      "grad_norm": 20.738372802734375,
+      "learning_rate": 9.99816220472441e-06,
+      "loss": 0.2612,
+      "step": 1171
+    },
+    {
+      "epoch": 0.36913385826771655,
+      "grad_norm": 23.450641632080078,
+      "learning_rate": 9.998160629921261e-06,
+      "loss": 0.3474,
+      "step": 1172
+    },
+    {
+      "epoch": 0.3694488188976378,
+      "grad_norm": 38.92578125,
+      "learning_rate": 9.99815905511811e-06,
+      "loss": 0.5643,
+      "step": 1173
+    },
+    {
+      "epoch": 0.36976377952755907,
+      "grad_norm": 59.437496185302734,
+      "learning_rate": 9.998157480314962e-06,
+      "loss": 0.7971,
+      "step": 1174
+    },
+    {
+      "epoch": 0.3700787401574803,
+      "grad_norm": 22.928701400756836,
+      "learning_rate": 9.998155905511811e-06,
+      "loss": 0.2843,
+      "step": 1175
+    },
+    {
+      "epoch": 0.3703937007874016,
+      "grad_norm": 38.7431526184082,
+      "learning_rate": 9.998154330708662e-06,
+      "loss": 0.3979,
+      "step": 1176
+    },
+    {
+      "epoch": 0.37070866141732284,
+      "grad_norm": 5.290953636169434,
+      "learning_rate": 9.998152755905513e-06,
+      "loss": 0.0298,
+      "step": 1177
+    },
+    {
+      "epoch": 0.3710236220472441,
+      "grad_norm": 23.754051208496094,
+      "learning_rate": 9.998151181102364e-06,
+      "loss": 0.4325,
+      "step": 1178
+    },
+    {
+      "epoch": 0.37133858267716535,
+      "grad_norm": 27.173952102661133,
+      "learning_rate": 9.998149606299213e-06,
+      "loss": 0.5674,
+      "step": 1179
+    },
+    {
+      "epoch": 0.3716535433070866,
+      "grad_norm": 42.20391845703125,
+      "learning_rate": 9.998148031496063e-06,
+      "loss": 0.5973,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3716535433070866,
+      "eval_loss": 0.5375993251800537,
+      "eval_runtime": 339.4527,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1180
+    },
+    {
+      "epoch": 0.37196850393700787,
+      "grad_norm": 11.690238952636719,
+      "learning_rate": 9.998146456692914e-06,
+      "loss": 0.0917,
+      "step": 1181
+    },
+    {
+      "epoch": 0.3722834645669291,
+      "grad_norm": 10.012290000915527,
+      "learning_rate": 9.998144881889765e-06,
+      "loss": 0.0842,
+      "step": 1182
+    },
+    {
+      "epoch": 0.3725984251968504,
+      "grad_norm": 21.645814895629883,
+      "learning_rate": 9.998143307086616e-06,
+      "loss": 0.152,
+      "step": 1183
+    },
+    {
+      "epoch": 0.37291338582677164,
+      "grad_norm": 31.5441837310791,
+      "learning_rate": 9.998141732283465e-06,
+      "loss": 0.5621,
+      "step": 1184
+    },
+    {
+      "epoch": 0.3732283464566929,
+      "grad_norm": 41.74612808227539,
+      "learning_rate": 9.998140157480316e-06,
+      "loss": 0.7019,
+      "step": 1185
+    },
+    {
+      "epoch": 0.37354330708661415,
+      "grad_norm": 35.047794342041016,
+      "learning_rate": 9.998138582677165e-06,
+      "loss": 0.1264,
+      "step": 1186
+    },
+    {
+      "epoch": 0.3738582677165354,
+      "grad_norm": 77.93087005615234,
+      "learning_rate": 9.998137007874016e-06,
+      "loss": 0.8623,
+      "step": 1187
+    },
+    {
+      "epoch": 0.37417322834645667,
+      "grad_norm": 32.53571701049805,
+      "learning_rate": 9.998135433070867e-06,
+      "loss": 0.2572,
+      "step": 1188
+    },
+    {
+      "epoch": 0.374488188976378,
+      "grad_norm": 57.9747428894043,
+      "learning_rate": 9.998133858267718e-06,
+      "loss": 0.5365,
+      "step": 1189
+    },
+    {
+      "epoch": 0.37480314960629924,
+      "grad_norm": 46.87409210205078,
+      "learning_rate": 9.998132283464567e-06,
+      "loss": 1.2091,
+      "step": 1190
+    },
+    {
+      "epoch": 0.3751181102362205,
+      "grad_norm": 29.17926788330078,
+      "learning_rate": 9.998130708661418e-06,
+      "loss": 0.3407,
+      "step": 1191
+    },
+    {
+      "epoch": 0.37543307086614175,
+      "grad_norm": 43.51498794555664,
+      "learning_rate": 9.99812913385827e-06,
+      "loss": 0.747,
+      "step": 1192
+    },
+    {
+      "epoch": 0.375748031496063,
+      "grad_norm": 63.52394104003906,
+      "learning_rate": 9.998127559055119e-06,
+      "loss": 0.5045,
+      "step": 1193
+    },
+    {
+      "epoch": 0.37606299212598426,
+      "grad_norm": 82.31373596191406,
+      "learning_rate": 9.99812598425197e-06,
+      "loss": 0.9014,
+      "step": 1194
+    },
+    {
+      "epoch": 0.3763779527559055,
+      "grad_norm": 70.40677642822266,
+      "learning_rate": 9.998124409448819e-06,
+      "loss": 0.5663,
+      "step": 1195
+    },
+    {
+      "epoch": 0.3766929133858268,
+      "grad_norm": 16.95841407775879,
+      "learning_rate": 9.99812283464567e-06,
+      "loss": 0.3155,
+      "step": 1196
+    },
+    {
+      "epoch": 0.37700787401574803,
+      "grad_norm": 46.58156967163086,
+      "learning_rate": 9.998121259842521e-06,
+      "loss": 0.6919,
+      "step": 1197
+    },
+    {
+      "epoch": 0.3773228346456693,
+      "grad_norm": 37.353492736816406,
+      "learning_rate": 9.998119685039372e-06,
+      "loss": 0.669,
+      "step": 1198
+    },
+    {
+      "epoch": 0.37763779527559055,
+      "grad_norm": 23.72784996032715,
+      "learning_rate": 9.998118110236221e-06,
+      "loss": 0.3445,
+      "step": 1199
+    },
+    {
+      "epoch": 0.3779527559055118,
+      "grad_norm": 39.98880386352539,
+      "learning_rate": 9.99811653543307e-06,
+      "loss": 0.4265,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3779527559055118,
+      "eval_loss": 0.5579959154129028,
+      "eval_runtime": 337.5471,
+      "eval_samples_per_second": 0.347,
+      "eval_steps_per_second": 0.347,
+      "step": 1200
+    },
+    {
+      "epoch": 0.37826771653543306,
+      "grad_norm": 13.343225479125977,
+      "learning_rate": 9.998114960629922e-06,
+      "loss": 0.346,
+      "step": 1201
+    },
+    {
+      "epoch": 0.3785826771653543,
+      "grad_norm": 31.93812370300293,
+      "learning_rate": 9.998113385826773e-06,
+      "loss": 0.7538,
+      "step": 1202
+    },
+    {
+      "epoch": 0.3788976377952756,
+      "grad_norm": 22.874921798706055,
+      "learning_rate": 9.998111811023624e-06,
+      "loss": 0.2865,
+      "step": 1203
+    },
+    {
+      "epoch": 0.37921259842519683,
+      "grad_norm": 50.595577239990234,
+      "learning_rate": 9.998110236220473e-06,
+      "loss": 0.7486,
+      "step": 1204
+    },
+    {
+      "epoch": 0.3795275590551181,
+      "grad_norm": 20.3195858001709,
+      "learning_rate": 9.998108661417324e-06,
+      "loss": 0.2986,
+      "step": 1205
+    },
+    {
+      "epoch": 0.37984251968503935,
+      "grad_norm": 18.59178352355957,
+      "learning_rate": 9.998107086614173e-06,
+      "loss": 0.3259,
+      "step": 1206
+    },
+    {
+      "epoch": 0.3801574803149606,
+      "grad_norm": 23.734966278076172,
+      "learning_rate": 9.998105511811024e-06,
+      "loss": 0.4835,
+      "step": 1207
+    },
+    {
+      "epoch": 0.38047244094488186,
+      "grad_norm": 47.375789642333984,
+      "learning_rate": 9.998103937007875e-06,
+      "loss": 0.4708,
+      "step": 1208
+    },
+    {
+      "epoch": 0.3807874015748032,
+      "grad_norm": 12.326082229614258,
+      "learning_rate": 9.998102362204726e-06,
+      "loss": 0.3698,
+      "step": 1209
+    },
+    {
+      "epoch": 0.38110236220472443,
+      "grad_norm": 30.175519943237305,
+      "learning_rate": 9.998100787401575e-06,
+      "loss": 0.4141,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3814173228346457,
+      "grad_norm": 27.1715087890625,
+      "learning_rate": 9.998099212598425e-06,
+      "loss": 0.4674,
+      "step": 1211
+    },
+    {
+      "epoch": 0.38173228346456695,
+      "grad_norm": 31.19744300842285,
+      "learning_rate": 9.998097637795277e-06,
+      "loss": 0.3239,
+      "step": 1212
+    },
+    {
+      "epoch": 0.3820472440944882,
+      "grad_norm": 44.07891845703125,
+      "learning_rate": 9.998096062992127e-06,
+      "loss": 0.7672,
+      "step": 1213
+    },
+    {
+      "epoch": 0.38236220472440946,
+      "grad_norm": 15.165576934814453,
+      "learning_rate": 9.998094488188978e-06,
+      "loss": 0.2029,
+      "step": 1214
+    },
+    {
+      "epoch": 0.3826771653543307,
+      "grad_norm": 41.70854187011719,
+      "learning_rate": 9.998092913385827e-06,
+      "loss": 0.3442,
+      "step": 1215
+    },
+    {
+      "epoch": 0.382992125984252,
+      "grad_norm": 25.64063835144043,
+      "learning_rate": 9.998091338582678e-06,
+      "loss": 0.3051,
+      "step": 1216
+    },
+    {
+      "epoch": 0.38330708661417323,
+      "grad_norm": 19.23823356628418,
+      "learning_rate": 9.998089763779529e-06,
+      "loss": 0.1983,
+      "step": 1217
+    },
+    {
+      "epoch": 0.3836220472440945,
+      "grad_norm": 65.09869384765625,
+      "learning_rate": 9.99808818897638e-06,
+      "loss": 0.4903,
+      "step": 1218
+    },
+    {
+      "epoch": 0.38393700787401575,
+      "grad_norm": 14.515801429748535,
+      "learning_rate": 9.99808661417323e-06,
+      "loss": 0.1511,
+      "step": 1219
+    },
+    {
+      "epoch": 0.384251968503937,
+      "grad_norm": 34.14856719970703,
+      "learning_rate": 9.998085039370079e-06,
+      "loss": 0.4351,
+      "step": 1220
+    },
+    {
+      "epoch": 0.384251968503937,
+      "eval_loss": 0.49825048446655273,
+      "eval_runtime": 340.3122,
+      "eval_samples_per_second": 0.344,
+      "eval_steps_per_second": 0.344,
+      "step": 1220
+    },
+    {
+      "epoch": 0.38456692913385826,
+      "grad_norm": 42.81884765625,
+      "learning_rate": 9.99808346456693e-06,
+      "loss": 0.4317,
+      "step": 1221
+    },
+    {
+      "epoch": 0.3848818897637795,
+      "grad_norm": 40.3559455871582,
+      "learning_rate": 9.99808188976378e-06,
+      "loss": 0.5042,
+      "step": 1222
+    },
+    {
+      "epoch": 0.3851968503937008,
+      "grad_norm": 28.505815505981445,
+      "learning_rate": 9.998080314960631e-06,
+      "loss": 0.316,
+      "step": 1223
+    },
+    {
+      "epoch": 0.38551181102362203,
+      "grad_norm": 20.508024215698242,
+      "learning_rate": 9.99807874015748e-06,
+      "loss": 0.15,
+      "step": 1224
+    },
+    {
+      "epoch": 0.3858267716535433,
+      "grad_norm": 73.8309555053711,
+      "learning_rate": 9.998077165354332e-06,
+      "loss": 0.4734,
+      "step": 1225
+    },
+    {
+      "epoch": 0.38614173228346454,
+      "grad_norm": 24.401338577270508,
+      "learning_rate": 9.998075590551181e-06,
+      "loss": 0.2553,
+      "step": 1226
+    },
+    {
+      "epoch": 0.3864566929133858,
+      "grad_norm": 28.52256202697754,
+      "learning_rate": 9.998074015748032e-06,
+      "loss": 0.0865,
+      "step": 1227
+    },
+    {
+      "epoch": 0.38677165354330706,
+      "grad_norm": 45.155696868896484,
+      "learning_rate": 9.998072440944883e-06,
+      "loss": 0.413,
+      "step": 1228
+    },
+    {
+      "epoch": 0.38708661417322837,
+      "grad_norm": 23.677753448486328,
+      "learning_rate": 9.998070866141734e-06,
+      "loss": 0.1919,
+      "step": 1229
+    },
+    {
+      "epoch": 0.38740157480314963,
+      "grad_norm": 27.503589630126953,
+      "learning_rate": 9.998069291338583e-06,
+      "loss": 0.6631,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3877165354330709,
+      "grad_norm": 33.99694061279297,
+      "learning_rate": 9.998067716535433e-06,
+      "loss": 0.5537,
+      "step": 1231
+    },
+    {
+      "epoch": 0.38803149606299214,
+      "grad_norm": 73.67473602294922,
+      "learning_rate": 9.998066141732284e-06,
+      "loss": 0.2761,
+      "step": 1232
+    },
+    {
+      "epoch": 0.3883464566929134,
+      "grad_norm": 47.667327880859375,
+      "learning_rate": 9.998064566929135e-06,
+      "loss": 0.1336,
+      "step": 1233
+    },
+    {
+      "epoch": 0.38866141732283466,
+      "grad_norm": 32.623802185058594,
+      "learning_rate": 9.998062992125986e-06,
+      "loss": 0.1881,
+      "step": 1234
+    },
+    {
+      "epoch": 0.3889763779527559,
+      "grad_norm": 92.79457092285156,
+      "learning_rate": 9.998061417322835e-06,
+      "loss": 0.9026,
+      "step": 1235
+    },
+    {
+      "epoch": 0.38929133858267717,
+      "grad_norm": 47.63346862792969,
+      "learning_rate": 9.998059842519686e-06,
+      "loss": 0.2402,
+      "step": 1236
+    },
+    {
+      "epoch": 0.38960629921259843,
+      "grad_norm": 113.52471923828125,
+      "learning_rate": 9.998058267716537e-06,
+      "loss": 0.7758,
+      "step": 1237
+    },
+    {
+      "epoch": 0.3899212598425197,
+      "grad_norm": 44.212303161621094,
+      "learning_rate": 9.998056692913388e-06,
+      "loss": 0.1154,
+      "step": 1238
+    },
+    {
+      "epoch": 0.39023622047244094,
+      "grad_norm": 113.48955535888672,
+      "learning_rate": 9.998055118110237e-06,
+      "loss": 0.7735,
+      "step": 1239
+    },
+    {
+      "epoch": 0.3905511811023622,
+      "grad_norm": 95.3028564453125,
+      "learning_rate": 9.998053543307086e-06,
+      "loss": 0.5174,
+      "step": 1240
+    },
+    {
+      "epoch": 0.3905511811023622,
+      "eval_loss": 0.680210292339325,
+      "eval_runtime": 337.0826,
+      "eval_samples_per_second": 0.347,
+      "eval_steps_per_second": 0.347,
+      "step": 1240
+    },
+    {
+      "epoch": 0.39086614173228346,
+      "grad_norm": 75.17251586914062,
+      "learning_rate": 9.998051968503937e-06,
+      "loss": 0.6919,
+      "step": 1241
+    },
+    {
+      "epoch": 0.3911811023622047,
+      "grad_norm": 94.41243743896484,
+      "learning_rate": 9.998050393700788e-06,
+      "loss": 1.1939,
+      "step": 1242
+    },
+    {
+      "epoch": 0.39149606299212597,
+      "grad_norm": 17.574474334716797,
+      "learning_rate": 9.99804881889764e-06,
+      "loss": 0.0841,
+      "step": 1243
+    },
+    {
+      "epoch": 0.3918110236220472,
+      "grad_norm": 22.532442092895508,
+      "learning_rate": 9.998047244094489e-06,
+      "loss": 0.0756,
+      "step": 1244
+    },
+    {
+      "epoch": 0.3921259842519685,
+      "grad_norm": 48.4405632019043,
+      "learning_rate": 9.99804566929134e-06,
+      "loss": 0.225,
+      "step": 1245
+    },
+    {
+      "epoch": 0.39244094488188974,
+      "grad_norm": 76.005859375,
+      "learning_rate": 9.998044094488189e-06,
+      "loss": 0.1798,
+      "step": 1246
+    },
+    {
+      "epoch": 0.392755905511811,
+      "grad_norm": 90.26568603515625,
+      "learning_rate": 9.99804251968504e-06,
+      "loss": 1.1972,
+      "step": 1247
+    },
+    {
+      "epoch": 0.39307086614173226,
+      "grad_norm": 69.64557647705078,
+      "learning_rate": 9.998040944881891e-06,
+      "loss": 0.1896,
+      "step": 1248
+    },
+    {
+      "epoch": 0.39338582677165357,
+      "grad_norm": 68.00067901611328,
+      "learning_rate": 9.998039370078742e-06,
+      "loss": 0.7109,
+      "step": 1249
+    },
+    {
+      "epoch": 0.3937007874015748,
+      "grad_norm": 57.22831344604492,
+      "learning_rate": 9.998037795275591e-06,
+      "loss": 0.8236,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3940157480314961,
+      "grad_norm": 47.59756851196289,
+      "learning_rate": 9.99803622047244e-06,
+      "loss": 0.2775,
+      "step": 1251
+    },
+    {
+      "epoch": 0.39433070866141734,
+      "grad_norm": 23.361492156982422,
+      "learning_rate": 9.998034645669292e-06,
+      "loss": 0.0931,
+      "step": 1252
+    },
+    {
+      "epoch": 0.3946456692913386,
+      "grad_norm": 40.62185287475586,
+      "learning_rate": 9.998033070866143e-06,
+      "loss": 0.3612,
+      "step": 1253
+    },
+    {
+      "epoch": 0.39496062992125985,
+      "grad_norm": 31.514081954956055,
+      "learning_rate": 9.998031496062994e-06,
+      "loss": 0.0952,
+      "step": 1254
+    },
+    {
+      "epoch": 0.3952755905511811,
+      "grad_norm": 39.17756652832031,
+      "learning_rate": 9.998029921259843e-06,
+      "loss": 0.2162,
+      "step": 1255
+    },
+    {
+      "epoch": 0.39559055118110237,
+      "grad_norm": 19.34990882873535,
+      "learning_rate": 9.998028346456694e-06,
+      "loss": 0.1018,
+      "step": 1256
+    },
+    {
+      "epoch": 0.3959055118110236,
+      "grad_norm": 40.027671813964844,
+      "learning_rate": 9.998026771653543e-06,
+      "loss": 0.217,
+      "step": 1257
+    },
+    {
+      "epoch": 0.3962204724409449,
+      "grad_norm": 97.05489349365234,
+      "learning_rate": 9.998025196850394e-06,
+      "loss": 0.3828,
+      "step": 1258
+    },
+    {
+      "epoch": 0.39653543307086614,
+      "grad_norm": 20.452852249145508,
+      "learning_rate": 9.998023622047245e-06,
+      "loss": 0.1665,
+      "step": 1259
+    },
+    {
+      "epoch": 0.3968503937007874,
+      "grad_norm": 76.07334899902344,
+      "learning_rate": 9.998022047244094e-06,
+      "loss": 0.9711,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3968503937007874,
+      "eval_loss": 0.508359432220459,
+      "eval_runtime": 337.8333,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1260
+    },
+    {
+      "epoch": 0.39716535433070865,
+      "grad_norm": 44.56214904785156,
+      "learning_rate": 9.998020472440945e-06,
+      "loss": 0.1862,
+      "step": 1261
+    },
+    {
+      "epoch": 0.3974803149606299,
+      "grad_norm": 105.54949188232422,
+      "learning_rate": 9.998018897637796e-06,
+      "loss": 1.0023,
+      "step": 1262
+    },
+    {
+      "epoch": 0.39779527559055117,
+      "grad_norm": 64.92313385009766,
+      "learning_rate": 9.998017322834647e-06,
+      "loss": 0.9656,
+      "step": 1263
+    },
+    {
+      "epoch": 0.3981102362204724,
+      "grad_norm": 60.504730224609375,
+      "learning_rate": 9.998015748031497e-06,
+      "loss": 0.3303,
+      "step": 1264
+    },
+    {
+      "epoch": 0.3984251968503937,
+      "grad_norm": 57.36290740966797,
+      "learning_rate": 9.998014173228348e-06,
+      "loss": 0.2705,
+      "step": 1265
+    },
+    {
+      "epoch": 0.39874015748031494,
+      "grad_norm": 49.48656463623047,
+      "learning_rate": 9.998012598425197e-06,
+      "loss": 0.5267,
+      "step": 1266
+    },
+    {
+      "epoch": 0.3990551181102362,
+      "grad_norm": 73.73528289794922,
+      "learning_rate": 9.998011023622048e-06,
+      "loss": 0.2752,
+      "step": 1267
+    },
+    {
+      "epoch": 0.39937007874015745,
+      "grad_norm": 77.58939361572266,
+      "learning_rate": 9.998009448818899e-06,
+      "loss": 0.782,
+      "step": 1268
+    },
+    {
+      "epoch": 0.39968503937007877,
+      "grad_norm": 13.655567169189453,
+      "learning_rate": 9.99800787401575e-06,
+      "loss": 0.0997,
+      "step": 1269
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 31.09360122680664,
+      "learning_rate": 9.9980062992126e-06,
+      "loss": 0.275,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4003149606299213,
+      "grad_norm": 57.30973815917969,
+      "learning_rate": 9.998004724409449e-06,
+      "loss": 0.5763,
+      "step": 1271
+    },
+    {
+      "epoch": 0.40062992125984254,
+      "grad_norm": 55.74612808227539,
+      "learning_rate": 9.9980031496063e-06,
+      "loss": 0.8855,
+      "step": 1272
+    },
+    {
+      "epoch": 0.4009448818897638,
+      "grad_norm": 45.396583557128906,
+      "learning_rate": 9.99800157480315e-06,
+      "loss": 0.4024,
+      "step": 1273
+    },
+    {
+      "epoch": 0.40125984251968505,
+      "grad_norm": 17.555898666381836,
+      "learning_rate": 9.998000000000002e-06,
+      "loss": 0.128,
+      "step": 1274
+    },
+    {
+      "epoch": 0.4015748031496063,
+      "grad_norm": 54.747310638427734,
+      "learning_rate": 9.99799842519685e-06,
+      "loss": 0.8548,
+      "step": 1275
+    },
+    {
+      "epoch": 0.40188976377952756,
+      "grad_norm": 36.224857330322266,
+      "learning_rate": 9.997996850393702e-06,
+      "loss": 0.2832,
+      "step": 1276
+    },
+    {
+      "epoch": 0.4022047244094488,
+      "grad_norm": 20.606124877929688,
+      "learning_rate": 9.997995275590551e-06,
+      "loss": 0.2242,
+      "step": 1277
+    },
+    {
+      "epoch": 0.4025196850393701,
+      "grad_norm": 37.51258087158203,
+      "learning_rate": 9.997993700787402e-06,
+      "loss": 0.3867,
+      "step": 1278
+    },
+    {
+      "epoch": 0.40283464566929134,
+      "grad_norm": 14.421310424804688,
+      "learning_rate": 9.997992125984253e-06,
+      "loss": 0.1174,
+      "step": 1279
+    },
+    {
+      "epoch": 0.4031496062992126,
+      "grad_norm": 41.61854553222656,
+      "learning_rate": 9.997990551181104e-06,
+      "loss": 0.3572,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4031496062992126,
+      "eval_loss": 0.48692360520362854,
+      "eval_runtime": 338.2236,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1280
+    },
+    {
+      "epoch": 0.40346456692913385,
+      "grad_norm": 23.846805572509766,
+      "learning_rate": 9.997988976377953e-06,
+      "loss": 0.419,
+      "step": 1281
+    },
+    {
+      "epoch": 0.4037795275590551,
+      "grad_norm": 48.06615447998047,
+      "learning_rate": 9.997987401574804e-06,
+      "loss": 0.8735,
+      "step": 1282
+    },
+    {
+      "epoch": 0.40409448818897636,
+      "grad_norm": 15.782271385192871,
+      "learning_rate": 9.997985826771655e-06,
+      "loss": 0.0723,
+      "step": 1283
+    },
+    {
+      "epoch": 0.4044094488188976,
+      "grad_norm": 19.87238883972168,
+      "learning_rate": 9.997984251968505e-06,
+      "loss": 0.1359,
+      "step": 1284
+    },
+    {
+      "epoch": 0.4047244094488189,
+      "grad_norm": 38.17216110229492,
+      "learning_rate": 9.997982677165356e-06,
+      "loss": 0.3926,
+      "step": 1285
+    },
+    {
+      "epoch": 0.40503937007874014,
+      "grad_norm": 45.54020309448242,
+      "learning_rate": 9.997981102362205e-06,
+      "loss": 0.3831,
+      "step": 1286
+    },
+    {
+      "epoch": 0.4053543307086614,
+      "grad_norm": 26.464305877685547,
+      "learning_rate": 9.997979527559056e-06,
+      "loss": 0.4216,
+      "step": 1287
+    },
+    {
+      "epoch": 0.4056692913385827,
+      "grad_norm": 64.78831481933594,
+      "learning_rate": 9.997977952755907e-06,
+      "loss": 0.861,
+      "step": 1288
+    },
+    {
+      "epoch": 0.40598425196850396,
+      "grad_norm": 38.6380615234375,
+      "learning_rate": 9.997976377952758e-06,
+      "loss": 0.3388,
+      "step": 1289
+    },
+    {
+      "epoch": 0.4062992125984252,
+      "grad_norm": 44.91901779174805,
+      "learning_rate": 9.997974803149607e-06,
+      "loss": 0.2208,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4066141732283465,
+      "grad_norm": 59.079185485839844,
+      "learning_rate": 9.997973228346456e-06,
+      "loss": 0.6019,
+      "step": 1291
+    },
+    {
+      "epoch": 0.40692913385826773,
+      "grad_norm": 65.11566925048828,
+      "learning_rate": 9.997971653543307e-06,
+      "loss": 0.64,
+      "step": 1292
+    },
+    {
+      "epoch": 0.407244094488189,
+      "grad_norm": 45.68169021606445,
+      "learning_rate": 9.997970078740158e-06,
+      "loss": 0.6369,
+      "step": 1293
+    },
+    {
+      "epoch": 0.40755905511811025,
+      "grad_norm": 6.096194744110107,
+      "learning_rate": 9.99796850393701e-06,
+      "loss": 0.0484,
+      "step": 1294
+    },
+    {
+      "epoch": 0.4078740157480315,
+      "grad_norm": 31.807212829589844,
+      "learning_rate": 9.997966929133859e-06,
+      "loss": 0.3238,
+      "step": 1295
+    },
+    {
+      "epoch": 0.40818897637795276,
+      "grad_norm": 66.85626983642578,
+      "learning_rate": 9.99796535433071e-06,
+      "loss": 0.6099,
+      "step": 1296
+    },
+    {
+      "epoch": 0.408503937007874,
+      "grad_norm": 15.900289535522461,
+      "learning_rate": 9.997963779527559e-06,
+      "loss": 0.3017,
+      "step": 1297
+    },
+    {
+      "epoch": 0.4088188976377953,
+      "grad_norm": 58.991703033447266,
+      "learning_rate": 9.99796220472441e-06,
+      "loss": 0.441,
+      "step": 1298
+    },
+    {
+      "epoch": 0.40913385826771653,
+      "grad_norm": 64.84235382080078,
+      "learning_rate": 9.997960629921261e-06,
+      "loss": 0.5636,
+      "step": 1299
+    },
+    {
+      "epoch": 0.4094488188976378,
+      "grad_norm": 48.843505859375,
+      "learning_rate": 9.997959055118112e-06,
+      "loss": 0.5989,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4094488188976378,
+      "eval_loss": 0.5216355323791504,
+      "eval_runtime": 339.9169,
+      "eval_samples_per_second": 0.344,
+      "eval_steps_per_second": 0.344,
+      "step": 1300
+    },
+    {
+      "epoch": 0.40976377952755905,
+      "grad_norm": 22.199951171875,
+      "learning_rate": 9.997957480314961e-06,
+      "loss": 0.331,
+      "step": 1301
+    },
+    {
+      "epoch": 0.4100787401574803,
+      "grad_norm": 56.680816650390625,
+      "learning_rate": 9.99795590551181e-06,
+      "loss": 0.4215,
+      "step": 1302
+    },
+    {
+      "epoch": 0.41039370078740156,
+      "grad_norm": 33.23557662963867,
+      "learning_rate": 9.997954330708662e-06,
+      "loss": 0.444,
+      "step": 1303
+    },
+    {
+      "epoch": 0.4107086614173228,
+      "grad_norm": 31.659833908081055,
+      "learning_rate": 9.997952755905513e-06,
+      "loss": 0.4401,
+      "step": 1304
+    },
+    {
+      "epoch": 0.4110236220472441,
+      "grad_norm": 14.66598129272461,
+      "learning_rate": 9.997951181102364e-06,
+      "loss": 0.1447,
+      "step": 1305
+    },
+    {
+      "epoch": 0.41133858267716533,
+      "grad_norm": 20.910083770751953,
+      "learning_rate": 9.997949606299213e-06,
+      "loss": 0.2887,
+      "step": 1306
+    },
+    {
+      "epoch": 0.4116535433070866,
+      "grad_norm": 39.282596588134766,
+      "learning_rate": 9.997948031496064e-06,
+      "loss": 0.5081,
+      "step": 1307
+    },
+    {
+      "epoch": 0.4119685039370079,
+      "grad_norm": 37.22988510131836,
+      "learning_rate": 9.997946456692915e-06,
+      "loss": 0.6354,
+      "step": 1308
+    },
+    {
+      "epoch": 0.41228346456692916,
+      "grad_norm": 51.220314025878906,
+      "learning_rate": 9.997944881889766e-06,
+      "loss": 0.4205,
+      "step": 1309
+    },
+    {
+      "epoch": 0.4125984251968504,
+      "grad_norm": 31.497386932373047,
+      "learning_rate": 9.997943307086615e-06,
+      "loss": 0.4645,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4129133858267717,
+      "grad_norm": 48.64023971557617,
+      "learning_rate": 9.997941732283464e-06,
+      "loss": 0.5369,
+      "step": 1311
+    },
+    {
+      "epoch": 0.41322834645669293,
+      "grad_norm": 44.41730499267578,
+      "learning_rate": 9.997940157480315e-06,
+      "loss": 0.4616,
+      "step": 1312
+    },
+    {
+      "epoch": 0.4135433070866142,
+      "grad_norm": 32.24418640136719,
+      "learning_rate": 9.997938582677166e-06,
+      "loss": 0.4085,
+      "step": 1313
+    },
+    {
+      "epoch": 0.41385826771653544,
+      "grad_norm": 25.68863296508789,
+      "learning_rate": 9.997937007874017e-06,
+      "loss": 0.1576,
+      "step": 1314
+    },
+    {
+      "epoch": 0.4141732283464567,
+      "grad_norm": 57.67222595214844,
+      "learning_rate": 9.997935433070867e-06,
+      "loss": 0.6927,
+      "step": 1315
+    },
+    {
+      "epoch": 0.41448818897637796,
+      "grad_norm": 46.1195182800293,
+      "learning_rate": 9.997933858267718e-06,
+      "loss": 0.3948,
+      "step": 1316
+    },
+    {
+      "epoch": 0.4148031496062992,
+      "grad_norm": 16.534828186035156,
+      "learning_rate": 9.997932283464567e-06,
+      "loss": 0.1382,
+      "step": 1317
+    },
+    {
+      "epoch": 0.41511811023622047,
+      "grad_norm": 40.07267379760742,
+      "learning_rate": 9.997930708661418e-06,
+      "loss": 0.3056,
+      "step": 1318
+    },
+    {
+      "epoch": 0.41543307086614173,
+      "grad_norm": 29.960514068603516,
+      "learning_rate": 9.997929133858269e-06,
+      "loss": 0.1982,
+      "step": 1319
+    },
+    {
+      "epoch": 0.415748031496063,
+      "grad_norm": 26.702951431274414,
+      "learning_rate": 9.99792755905512e-06,
+      "loss": 0.4233,
+      "step": 1320
+    },
+    {
+      "epoch": 0.415748031496063,
+      "eval_loss": 0.4609772861003876,
+      "eval_runtime": 338.1687,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1320
+    },
+    {
+      "epoch": 0.41606299212598424,
+      "grad_norm": 31.864824295043945,
+      "learning_rate": 9.99792598425197e-06,
+      "loss": 0.5666,
+      "step": 1321
+    },
+    {
+      "epoch": 0.4163779527559055,
+      "grad_norm": 32.256771087646484,
+      "learning_rate": 9.997924409448819e-06,
+      "loss": 0.2308,
+      "step": 1322
+    },
+    {
+      "epoch": 0.41669291338582676,
+      "grad_norm": 20.752614974975586,
+      "learning_rate": 9.99792283464567e-06,
+      "loss": 0.3462,
+      "step": 1323
+    },
+    {
+      "epoch": 0.417007874015748,
+      "grad_norm": 86.646484375,
+      "learning_rate": 9.99792125984252e-06,
+      "loss": 0.7229,
+      "step": 1324
+    },
+    {
+      "epoch": 0.41732283464566927,
+      "grad_norm": 36.791202545166016,
+      "learning_rate": 9.997919685039372e-06,
+      "loss": 0.6036,
+      "step": 1325
+    },
+    {
+      "epoch": 0.41763779527559053,
+      "grad_norm": 60.902095794677734,
+      "learning_rate": 9.99791811023622e-06,
+      "loss": 0.7423,
+      "step": 1326
+    },
+    {
+      "epoch": 0.4179527559055118,
+      "grad_norm": 15.44100284576416,
+      "learning_rate": 9.997916535433072e-06,
+      "loss": 0.0577,
+      "step": 1327
+    },
+    {
+      "epoch": 0.4182677165354331,
+      "grad_norm": 39.82502365112305,
+      "learning_rate": 9.997914960629921e-06,
+      "loss": 0.351,
+      "step": 1328
+    },
+    {
+      "epoch": 0.41858267716535436,
+      "grad_norm": 19.225820541381836,
+      "learning_rate": 9.997913385826772e-06,
+      "loss": 0.1259,
+      "step": 1329
+    },
+    {
+      "epoch": 0.4188976377952756,
+      "grad_norm": 20.358154296875,
+      "learning_rate": 9.997911811023623e-06,
+      "loss": 0.1947,
+      "step": 1330
+    },
+    {
+      "epoch": 0.41921259842519687,
+      "grad_norm": 15.202303886413574,
+      "learning_rate": 9.997910236220472e-06,
+      "loss": 0.0907,
+      "step": 1331
+    },
+    {
+      "epoch": 0.4195275590551181,
+      "grad_norm": 16.259374618530273,
+      "learning_rate": 9.997908661417323e-06,
+      "loss": 0.0561,
+      "step": 1332
+    },
+    {
+      "epoch": 0.4198425196850394,
+      "grad_norm": 40.858028411865234,
+      "learning_rate": 9.997907086614174e-06,
+      "loss": 0.3551,
+      "step": 1333
+    },
+    {
+      "epoch": 0.42015748031496064,
+      "grad_norm": 21.958782196044922,
+      "learning_rate": 9.997905511811025e-06,
+      "loss": 0.1549,
+      "step": 1334
+    },
+    {
+      "epoch": 0.4204724409448819,
+      "grad_norm": 60.448734283447266,
+      "learning_rate": 9.997903937007875e-06,
+      "loss": 0.4945,
+      "step": 1335
+    },
+    {
+      "epoch": 0.42078740157480315,
+      "grad_norm": 9.495431900024414,
+      "learning_rate": 9.997902362204726e-06,
+      "loss": 0.0391,
+      "step": 1336
+    },
+    {
+      "epoch": 0.4211023622047244,
+      "grad_norm": 96.30876922607422,
+      "learning_rate": 9.997900787401575e-06,
+      "loss": 0.5236,
+      "step": 1337
+    },
+    {
+      "epoch": 0.42141732283464567,
+      "grad_norm": 100.25637817382812,
+      "learning_rate": 9.997899212598426e-06,
+      "loss": 0.5044,
+      "step": 1338
+    },
+    {
+      "epoch": 0.4217322834645669,
+      "grad_norm": 10.620447158813477,
+      "learning_rate": 9.997897637795277e-06,
+      "loss": 0.0333,
+      "step": 1339
+    },
+    {
+      "epoch": 0.4220472440944882,
+      "grad_norm": 99.30320739746094,
+      "learning_rate": 9.997896062992128e-06,
+      "loss": 0.4552,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4220472440944882,
+      "eval_loss": 0.6429303884506226,
+      "eval_runtime": 339.7084,
+      "eval_samples_per_second": 0.344,
+      "eval_steps_per_second": 0.344,
+      "step": 1340
+    },
+    {
+      "epoch": 0.42236220472440944,
+      "grad_norm": 90.87741088867188,
+      "learning_rate": 9.997894488188977e-06,
+      "loss": 0.8964,
+      "step": 1341
+    },
+    {
+      "epoch": 0.4226771653543307,
+      "grad_norm": 53.61817169189453,
+      "learning_rate": 9.997892913385827e-06,
+      "loss": 0.2336,
+      "step": 1342
+    },
+    {
+      "epoch": 0.42299212598425195,
+      "grad_norm": 69.91616821289062,
+      "learning_rate": 9.997891338582678e-06,
+      "loss": 0.3807,
+      "step": 1343
+    },
+    {
+      "epoch": 0.4233070866141732,
+      "grad_norm": 64.52349853515625,
+      "learning_rate": 9.997889763779528e-06,
+      "loss": 0.3968,
+      "step": 1344
+    },
+    {
+      "epoch": 0.42362204724409447,
+      "grad_norm": 101.8323745727539,
+      "learning_rate": 9.99788818897638e-06,
+      "loss": 0.6942,
+      "step": 1345
+    },
+    {
+      "epoch": 0.4239370078740157,
+      "grad_norm": 137.70166015625,
+      "learning_rate": 9.997886614173229e-06,
+      "loss": 1.0158,
+      "step": 1346
+    },
+    {
+      "epoch": 0.424251968503937,
+      "grad_norm": 95.1364974975586,
+      "learning_rate": 9.99788503937008e-06,
+      "loss": 0.4745,
+      "step": 1347
+    },
+    {
+      "epoch": 0.4245669291338583,
+      "grad_norm": 42.36204147338867,
+      "learning_rate": 9.997883464566929e-06,
+      "loss": 0.265,
+      "step": 1348
+    },
+    {
+      "epoch": 0.42488188976377955,
+      "grad_norm": 37.66621017456055,
+      "learning_rate": 9.99788188976378e-06,
+      "loss": 0.5518,
+      "step": 1349
+    },
+    {
+      "epoch": 0.4251968503937008,
+      "grad_norm": 54.68177032470703,
+      "learning_rate": 9.997880314960631e-06,
+      "loss": 0.2369,
+      "step": 1350
+    },
+    {
+      "epoch": 0.42551181102362207,
+      "grad_norm": 88.8768310546875,
+      "learning_rate": 9.99787874015748e-06,
+      "loss": 1.004,
+      "step": 1351
+    },
+    {
+      "epoch": 0.4258267716535433,
+      "grad_norm": 9.298436164855957,
+      "learning_rate": 9.997877165354331e-06,
+      "loss": 0.0451,
+      "step": 1352
+    },
+    {
+      "epoch": 0.4261417322834646,
+      "grad_norm": 68.624755859375,
+      "learning_rate": 9.997875590551182e-06,
+      "loss": 0.9096,
+      "step": 1353
+    },
+    {
+      "epoch": 0.42645669291338584,
+      "grad_norm": 42.33887481689453,
+      "learning_rate": 9.997874015748033e-06,
+      "loss": 0.5008,
+      "step": 1354
+    },
+    {
+      "epoch": 0.4267716535433071,
+      "grad_norm": 84.9489517211914,
+      "learning_rate": 9.997872440944883e-06,
+      "loss": 0.4279,
+      "step": 1355
+    },
+    {
+      "epoch": 0.42708661417322835,
+      "grad_norm": 26.046403884887695,
+      "learning_rate": 9.997870866141734e-06,
+      "loss": 0.2294,
+      "step": 1356
+    },
+    {
+      "epoch": 0.4274015748031496,
+      "grad_norm": 24.110301971435547,
+      "learning_rate": 9.997869291338583e-06,
+      "loss": 0.1951,
+      "step": 1357
+    },
+    {
+      "epoch": 0.42771653543307087,
+      "grad_norm": 31.500816345214844,
+      "learning_rate": 9.997867716535434e-06,
+      "loss": 0.1093,
+      "step": 1358
+    },
+    {
+      "epoch": 0.4280314960629921,
+      "grad_norm": 37.86301040649414,
+      "learning_rate": 9.997866141732285e-06,
+      "loss": 0.5599,
+      "step": 1359
+    },
+    {
+      "epoch": 0.4283464566929134,
+      "grad_norm": 68.08171081542969,
+      "learning_rate": 9.997864566929136e-06,
+      "loss": 1.32,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4283464566929134,
+      "eval_loss": 0.47423291206359863,
+      "eval_runtime": 339.2488,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1360
+    },
+    {
+      "epoch": 0.42866141732283464,
+      "grad_norm": 32.4645881652832,
+      "learning_rate": 9.997862992125985e-06,
+      "loss": 0.2244,
+      "step": 1361
+    },
+    {
+      "epoch": 0.4289763779527559,
+      "grad_norm": 21.44221305847168,
+      "learning_rate": 9.997861417322834e-06,
+      "loss": 0.1674,
+      "step": 1362
+    },
+    {
+      "epoch": 0.42929133858267715,
+      "grad_norm": 67.8936538696289,
+      "learning_rate": 9.997859842519685e-06,
+      "loss": 1.0609,
+      "step": 1363
+    },
+    {
+      "epoch": 0.4296062992125984,
+      "grad_norm": 44.46934127807617,
+      "learning_rate": 9.997858267716536e-06,
+      "loss": 0.4285,
+      "step": 1364
+    },
+    {
+      "epoch": 0.42992125984251967,
+      "grad_norm": 25.02652359008789,
+      "learning_rate": 9.997856692913387e-06,
+      "loss": 0.5787,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4302362204724409,
+      "grad_norm": 53.81482696533203,
+      "learning_rate": 9.997855118110237e-06,
+      "loss": 0.3341,
+      "step": 1366
+    },
+    {
+      "epoch": 0.4305511811023622,
+      "grad_norm": 47.348201751708984,
+      "learning_rate": 9.997853543307088e-06,
+      "loss": 0.6322,
+      "step": 1367
+    },
+    {
+      "epoch": 0.4308661417322835,
+      "grad_norm": 22.753206253051758,
+      "learning_rate": 9.997851968503937e-06,
+      "loss": 0.4667,
+      "step": 1368
+    },
+    {
+      "epoch": 0.43118110236220475,
+      "grad_norm": 13.805994033813477,
+      "learning_rate": 9.997850393700788e-06,
+      "loss": 0.2587,
+      "step": 1369
+    },
+    {
+      "epoch": 0.431496062992126,
+      "grad_norm": 39.555076599121094,
+      "learning_rate": 9.997848818897639e-06,
+      "loss": 0.4924,
+      "step": 1370
+    },
+    {
+      "epoch": 0.43181102362204726,
+      "grad_norm": 34.64474105834961,
+      "learning_rate": 9.997847244094488e-06,
+      "loss": 0.5824,
+      "step": 1371
+    },
+    {
+      "epoch": 0.4321259842519685,
+      "grad_norm": 19.8635311126709,
+      "learning_rate": 9.99784566929134e-06,
+      "loss": 0.3249,
+      "step": 1372
+    },
+    {
+      "epoch": 0.4324409448818898,
+      "grad_norm": 45.133663177490234,
+      "learning_rate": 9.997844094488189e-06,
+      "loss": 0.4831,
+      "step": 1373
+    },
+    {
+      "epoch": 0.43275590551181103,
+      "grad_norm": 39.303409576416016,
+      "learning_rate": 9.99784251968504e-06,
+      "loss": 0.3898,
+      "step": 1374
+    },
+    {
+      "epoch": 0.4330708661417323,
+      "grad_norm": 34.55439758300781,
+      "learning_rate": 9.99784094488189e-06,
+      "loss": 0.4709,
+      "step": 1375
+    },
+    {
+      "epoch": 0.43338582677165355,
+      "grad_norm": 23.586498260498047,
+      "learning_rate": 9.997839370078742e-06,
+      "loss": 0.2142,
+      "step": 1376
+    },
+    {
+      "epoch": 0.4337007874015748,
+      "grad_norm": 25.976821899414062,
+      "learning_rate": 9.997837795275591e-06,
+      "loss": 0.4925,
+      "step": 1377
+    },
+    {
+      "epoch": 0.43401574803149606,
+      "grad_norm": 42.66455841064453,
+      "learning_rate": 9.997836220472442e-06,
+      "loss": 0.7777,
+      "step": 1378
+    },
+    {
+      "epoch": 0.4343307086614173,
+      "grad_norm": 24.16051483154297,
+      "learning_rate": 9.997834645669293e-06,
+      "loss": 0.3197,
+      "step": 1379
+    },
+    {
+      "epoch": 0.4346456692913386,
+      "grad_norm": 62.70100021362305,
+      "learning_rate": 9.997833070866144e-06,
+      "loss": 0.2733,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4346456692913386,
+      "eval_loss": 0.4844158887863159,
+      "eval_runtime": 338.7196,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1380
+    },
+    {
+      "epoch": 0.43496062992125983,
+      "grad_norm": 43.02194595336914,
+      "learning_rate": 9.997831496062993e-06,
+      "loss": 0.3504,
+      "step": 1381
+    },
+    {
+      "epoch": 0.4352755905511811,
+      "grad_norm": 38.13003158569336,
+      "learning_rate": 9.997829921259842e-06,
+      "loss": 0.39,
+      "step": 1382
+    },
+    {
+      "epoch": 0.43559055118110235,
+      "grad_norm": 65.61643981933594,
+      "learning_rate": 9.997828346456693e-06,
+      "loss": 0.7555,
+      "step": 1383
+    },
+    {
+      "epoch": 0.4359055118110236,
+      "grad_norm": 31.994688034057617,
+      "learning_rate": 9.997826771653544e-06,
+      "loss": 0.4433,
+      "step": 1384
+    },
+    {
+      "epoch": 0.43622047244094486,
+      "grad_norm": 39.22994613647461,
+      "learning_rate": 9.997825196850395e-06,
+      "loss": 0.235,
+      "step": 1385
+    },
+    {
+      "epoch": 0.4365354330708661,
+      "grad_norm": 18.356678009033203,
+      "learning_rate": 9.997823622047245e-06,
+      "loss": 0.1587,
+      "step": 1386
+    },
+    {
+      "epoch": 0.4368503937007874,
+      "grad_norm": 16.878463745117188,
+      "learning_rate": 9.997822047244096e-06,
+      "loss": 0.3613,
+      "step": 1387
+    },
+    {
+      "epoch": 0.4371653543307087,
+      "grad_norm": 27.259780883789062,
+      "learning_rate": 9.997820472440945e-06,
+      "loss": 0.3243,
+      "step": 1388
+    },
+    {
+      "epoch": 0.43748031496062995,
+      "grad_norm": 25.487789154052734,
+      "learning_rate": 9.997818897637796e-06,
+      "loss": 0.4375,
+      "step": 1389
+    },
+    {
+      "epoch": 0.4377952755905512,
+      "grad_norm": 15.02742862701416,
+      "learning_rate": 9.997817322834647e-06,
+      "loss": 0.2624,
+      "step": 1390
+    },
+    {
+      "epoch": 0.43811023622047246,
+      "grad_norm": 26.566652297973633,
+      "learning_rate": 9.997815748031496e-06,
+      "loss": 0.4211,
+      "step": 1391
+    },
+    {
+      "epoch": 0.4384251968503937,
+      "grad_norm": 67.95926666259766,
+      "learning_rate": 9.997814173228347e-06,
+      "loss": 0.9252,
+      "step": 1392
+    },
+    {
+      "epoch": 0.438740157480315,
+      "grad_norm": 29.216794967651367,
+      "learning_rate": 9.997812598425197e-06,
+      "loss": 0.4028,
+      "step": 1393
+    },
+    {
+      "epoch": 0.43905511811023623,
+      "grad_norm": 50.51660919189453,
+      "learning_rate": 9.997811023622048e-06,
+      "loss": 0.4433,
+      "step": 1394
+    },
+    {
+      "epoch": 0.4393700787401575,
+      "grad_norm": 13.636687278747559,
+      "learning_rate": 9.997809448818899e-06,
+      "loss": 0.2541,
+      "step": 1395
+    },
+    {
+      "epoch": 0.43968503937007875,
+      "grad_norm": 26.405738830566406,
+      "learning_rate": 9.99780787401575e-06,
+      "loss": 0.3662,
+      "step": 1396
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 31.268871307373047,
+      "learning_rate": 9.997806299212599e-06,
+      "loss": 0.2522,
+      "step": 1397
+    },
+    {
+      "epoch": 0.44031496062992126,
+      "grad_norm": 52.73464584350586,
+      "learning_rate": 9.99780472440945e-06,
+      "loss": 0.5821,
+      "step": 1398
+    },
+    {
+      "epoch": 0.4406299212598425,
+      "grad_norm": 7.454155921936035,
+      "learning_rate": 9.997803149606299e-06,
+      "loss": 0.0584,
+      "step": 1399
+    },
+    {
+      "epoch": 0.4409448818897638,
+      "grad_norm": 51.72380828857422,
+      "learning_rate": 9.997801574803152e-06,
+      "loss": 0.4622,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4409448818897638,
+      "eval_loss": 0.47116619348526,
+      "eval_runtime": 337.7576,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1400
+    },
+    {
+      "epoch": 0.44125984251968503,
+      "grad_norm": 10.400198936462402,
+      "learning_rate": 9.997800000000001e-06,
+      "loss": 0.0597,
+      "step": 1401
+    },
+    {
+      "epoch": 0.4415748031496063,
+      "grad_norm": 48.547332763671875,
+      "learning_rate": 9.99779842519685e-06,
+      "loss": 0.3959,
+      "step": 1402
+    },
+    {
+      "epoch": 0.44188976377952754,
+      "grad_norm": 23.85326385498047,
+      "learning_rate": 9.997796850393701e-06,
+      "loss": 0.2755,
+      "step": 1403
+    },
+    {
+      "epoch": 0.4422047244094488,
+      "grad_norm": 25.128524780273438,
+      "learning_rate": 9.997795275590552e-06,
+      "loss": 0.0733,
+      "step": 1404
+    },
+    {
+      "epoch": 0.44251968503937006,
+      "grad_norm": 51.35587692260742,
+      "learning_rate": 9.997793700787403e-06,
+      "loss": 0.6701,
+      "step": 1405
+    },
+    {
+      "epoch": 0.4428346456692913,
+      "grad_norm": 14.554354667663574,
+      "learning_rate": 9.997792125984253e-06,
+      "loss": 0.064,
+      "step": 1406
+    },
+    {
+      "epoch": 0.4431496062992126,
+      "grad_norm": 67.32514953613281,
+      "learning_rate": 9.997790551181104e-06,
+      "loss": 0.4264,
+      "step": 1407
+    },
+    {
+      "epoch": 0.4434645669291339,
+      "grad_norm": 13.576302528381348,
+      "learning_rate": 9.997788976377953e-06,
+      "loss": 0.0385,
+      "step": 1408
+    },
+    {
+      "epoch": 0.44377952755905514,
+      "grad_norm": 126.57298278808594,
+      "learning_rate": 9.997787401574804e-06,
+      "loss": 0.7388,
+      "step": 1409
+    },
+    {
+      "epoch": 0.4440944881889764,
+      "grad_norm": 9.836527824401855,
+      "learning_rate": 9.997785826771655e-06,
+      "loss": 0.0383,
+      "step": 1410
+    },
+    {
+      "epoch": 0.44440944881889766,
+      "grad_norm": 76.40158081054688,
+      "learning_rate": 9.997784251968504e-06,
+      "loss": 0.3938,
+      "step": 1411
+    },
+    {
+      "epoch": 0.4447244094488189,
+      "grad_norm": 60.20785140991211,
+      "learning_rate": 9.997782677165355e-06,
+      "loss": 0.2053,
+      "step": 1412
+    },
+    {
+      "epoch": 0.44503937007874017,
+      "grad_norm": 14.930899620056152,
+      "learning_rate": 9.997781102362204e-06,
+      "loss": 0.0554,
+      "step": 1413
+    },
+    {
+      "epoch": 0.44535433070866143,
+      "grad_norm": 101.52273559570312,
+      "learning_rate": 9.997779527559055e-06,
+      "loss": 0.6495,
+      "step": 1414
+    },
+    {
+      "epoch": 0.4456692913385827,
+      "grad_norm": 6.1531267166137695,
+      "learning_rate": 9.997777952755906e-06,
+      "loss": 0.0153,
+      "step": 1415
+    },
+    {
+      "epoch": 0.44598425196850394,
+      "grad_norm": 55.18434143066406,
+      "learning_rate": 9.997776377952757e-06,
+      "loss": 0.5381,
+      "step": 1416
+    },
+    {
+      "epoch": 0.4462992125984252,
+      "grad_norm": 5.298098564147949,
+      "learning_rate": 9.997774803149607e-06,
+      "loss": 0.0163,
+      "step": 1417
+    },
+    {
+      "epoch": 0.44661417322834646,
+      "grad_norm": 33.11286163330078,
+      "learning_rate": 9.997773228346458e-06,
+      "loss": 0.051,
+      "step": 1418
+    },
+    {
+      "epoch": 0.4469291338582677,
+      "grad_norm": 86.3932876586914,
+      "learning_rate": 9.997771653543307e-06,
+      "loss": 0.7798,
+      "step": 1419
+    },
+    {
+      "epoch": 0.44724409448818897,
+      "grad_norm": 86.36060333251953,
+      "learning_rate": 9.997770078740158e-06,
+      "loss": 0.812,
+      "step": 1420
+    },
+    {
+      "epoch": 0.44724409448818897,
+      "eval_loss": 0.7094002366065979,
+      "eval_runtime": 338.8984,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4475590551181102,
+      "grad_norm": 85.274169921875,
+      "learning_rate": 9.997768503937009e-06,
+      "loss": 1.2007,
+      "step": 1421
+    },
+    {
+      "epoch": 0.4478740157480315,
+      "grad_norm": 37.372406005859375,
+      "learning_rate": 9.997766929133858e-06,
+      "loss": 0.6024,
+      "step": 1422
+    },
+    {
+      "epoch": 0.44818897637795274,
+      "grad_norm": 72.87973022460938,
+      "learning_rate": 9.99776535433071e-06,
+      "loss": 0.9482,
+      "step": 1423
+    },
+    {
+      "epoch": 0.448503937007874,
+      "grad_norm": 19.026866912841797,
+      "learning_rate": 9.99776377952756e-06,
+      "loss": 0.0342,
+      "step": 1424
+    },
+    {
+      "epoch": 0.44881889763779526,
+      "grad_norm": 124.86366271972656,
+      "learning_rate": 9.997762204724411e-06,
+      "loss": 1.1195,
+      "step": 1425
+    },
+    {
+      "epoch": 0.4491338582677165,
+      "grad_norm": 91.7364273071289,
+      "learning_rate": 9.99776062992126e-06,
+      "loss": 0.6728,
+      "step": 1426
+    },
+    {
+      "epoch": 0.44944881889763777,
+      "grad_norm": 199.09945678710938,
+      "learning_rate": 9.997759055118112e-06,
+      "loss": 0.1931,
+      "step": 1427
+    },
+    {
+      "epoch": 0.4497637795275591,
+      "grad_norm": 88.02843475341797,
+      "learning_rate": 9.997757480314961e-06,
+      "loss": 0.8221,
+      "step": 1428
+    },
+    {
+      "epoch": 0.45007874015748034,
+      "grad_norm": 50.00212860107422,
+      "learning_rate": 9.997755905511812e-06,
+      "loss": 0.4988,
+      "step": 1429
+    },
+    {
+      "epoch": 0.4503937007874016,
+      "grad_norm": 46.531864166259766,
+      "learning_rate": 9.997754330708663e-06,
+      "loss": 0.166,
+      "step": 1430
+    },
+    {
+      "epoch": 0.45070866141732285,
+      "grad_norm": 62.54853439331055,
+      "learning_rate": 9.997752755905512e-06,
+      "loss": 0.8291,
+      "step": 1431
+    },
+    {
+      "epoch": 0.4510236220472441,
+      "grad_norm": 50.34440994262695,
+      "learning_rate": 9.997751181102363e-06,
+      "loss": 0.9383,
+      "step": 1432
+    },
+    {
+      "epoch": 0.45133858267716537,
+      "grad_norm": 93.70585632324219,
+      "learning_rate": 9.997749606299212e-06,
+      "loss": 0.7288,
+      "step": 1433
+    },
+    {
+      "epoch": 0.4516535433070866,
+      "grad_norm": 16.99148941040039,
+      "learning_rate": 9.997748031496063e-06,
+      "loss": 0.193,
+      "step": 1434
+    },
+    {
+      "epoch": 0.4519685039370079,
+      "grad_norm": 51.41777420043945,
+      "learning_rate": 9.997746456692914e-06,
+      "loss": 0.2534,
+      "step": 1435
+    },
+    {
+      "epoch": 0.45228346456692914,
+      "grad_norm": 60.7148323059082,
+      "learning_rate": 9.997744881889765e-06,
+      "loss": 0.824,
+      "step": 1436
+    },
+    {
+      "epoch": 0.4525984251968504,
+      "grad_norm": 42.44105529785156,
+      "learning_rate": 9.997743307086615e-06,
+      "loss": 0.5518,
+      "step": 1437
+    },
+    {
+      "epoch": 0.45291338582677165,
+      "grad_norm": 68.7177963256836,
+      "learning_rate": 9.997741732283466e-06,
+      "loss": 0.1653,
+      "step": 1438
+    },
+    {
+      "epoch": 0.4532283464566929,
+      "grad_norm": 34.2302131652832,
+      "learning_rate": 9.997740157480315e-06,
+      "loss": 0.3412,
+      "step": 1439
+    },
+    {
+      "epoch": 0.45354330708661417,
+      "grad_norm": 18.738046646118164,
+      "learning_rate": 9.997738582677166e-06,
+      "loss": 0.3157,
+      "step": 1440
+    },
+    {
+      "epoch": 0.45354330708661417,
+      "eval_loss": 0.5591472387313843,
+      "eval_runtime": 340.3548,
+      "eval_samples_per_second": 0.344,
+      "eval_steps_per_second": 0.344,
+      "step": 1440
+    },
+    {
+      "epoch": 0.4538582677165354,
+      "grad_norm": 113.84444427490234,
+      "learning_rate": 9.997737007874017e-06,
+      "loss": 1.0174,
+      "step": 1441
+    },
+    {
+      "epoch": 0.4541732283464567,
+      "grad_norm": 38.3621940612793,
+      "learning_rate": 9.997735433070866e-06,
+      "loss": 0.6811,
+      "step": 1442
+    },
+    {
+      "epoch": 0.45448818897637794,
+      "grad_norm": 34.84929656982422,
+      "learning_rate": 9.997733858267717e-06,
+      "loss": 0.2798,
+      "step": 1443
+    },
+    {
+      "epoch": 0.4548031496062992,
+      "grad_norm": 10.604235649108887,
+      "learning_rate": 9.997732283464567e-06,
+      "loss": 0.0995,
+      "step": 1444
+    },
+    {
+      "epoch": 0.45511811023622045,
+      "grad_norm": 48.81785202026367,
+      "learning_rate": 9.997730708661418e-06,
+      "loss": 0.713,
+      "step": 1445
+    },
+    {
+      "epoch": 0.4554330708661417,
+      "grad_norm": 22.666885375976562,
+      "learning_rate": 9.997729133858269e-06,
+      "loss": 0.5139,
+      "step": 1446
+    },
+    {
+      "epoch": 0.45574803149606297,
+      "grad_norm": 12.92003059387207,
+      "learning_rate": 9.99772755905512e-06,
+      "loss": 0.1314,
+      "step": 1447
+    },
+    {
+      "epoch": 0.4560629921259843,
+      "grad_norm": 29.922138214111328,
+      "learning_rate": 9.997725984251969e-06,
+      "loss": 0.4134,
+      "step": 1448
+    },
+    {
+      "epoch": 0.45637795275590554,
+      "grad_norm": 47.216609954833984,
+      "learning_rate": 9.99772440944882e-06,
+      "loss": 1.0959,
+      "step": 1449
+    },
+    {
+      "epoch": 0.4566929133858268,
+      "grad_norm": 20.693220138549805,
+      "learning_rate": 9.99772283464567e-06,
+      "loss": 0.1627,
+      "step": 1450
+    },
+    {
+      "epoch": 0.45700787401574805,
+      "grad_norm": 27.217304229736328,
+      "learning_rate": 9.99772125984252e-06,
+      "loss": 0.2322,
+      "step": 1451
+    },
+    {
+      "epoch": 0.4573228346456693,
+      "grad_norm": 49.21244430541992,
+      "learning_rate": 9.997719685039371e-06,
+      "loss": 0.2963,
+      "step": 1452
+    },
+    {
+      "epoch": 0.45763779527559056,
+      "grad_norm": 29.46310043334961,
+      "learning_rate": 9.99771811023622e-06,
+      "loss": 0.3798,
+      "step": 1453
+    },
+    {
+      "epoch": 0.4579527559055118,
+      "grad_norm": 30.430831909179688,
+      "learning_rate": 9.997716535433071e-06,
+      "loss": 0.3264,
+      "step": 1454
+    },
+    {
+      "epoch": 0.4582677165354331,
+      "grad_norm": 61.625083923339844,
+      "learning_rate": 9.997714960629922e-06,
+      "loss": 1.0914,
+      "step": 1455
+    },
+    {
+      "epoch": 0.45858267716535434,
+      "grad_norm": 28.40131187438965,
+      "learning_rate": 9.997713385826773e-06,
+      "loss": 0.1856,
+      "step": 1456
+    },
+    {
+      "epoch": 0.4588976377952756,
+      "grad_norm": 30.0435733795166,
+      "learning_rate": 9.997711811023623e-06,
+      "loss": 0.5765,
+      "step": 1457
+    },
+    {
+      "epoch": 0.45921259842519685,
+      "grad_norm": 15.2453031539917,
+      "learning_rate": 9.997710236220474e-06,
+      "loss": 0.287,
+      "step": 1458
+    },
+    {
+      "epoch": 0.4595275590551181,
+      "grad_norm": 36.65151596069336,
+      "learning_rate": 9.997708661417323e-06,
+      "loss": 0.2213,
+      "step": 1459
+    },
+    {
+      "epoch": 0.45984251968503936,
+      "grad_norm": 37.6453971862793,
+      "learning_rate": 9.997707086614174e-06,
+      "loss": 0.5465,
+      "step": 1460
+    },
+    {
+      "epoch": 0.45984251968503936,
+      "eval_loss": 0.49336767196655273,
+      "eval_runtime": 340.9663,
+      "eval_samples_per_second": 0.343,
+      "eval_steps_per_second": 0.343,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4601574803149606,
+      "grad_norm": 21.586259841918945,
+      "learning_rate": 9.997705511811025e-06,
+      "loss": 0.1797,
+      "step": 1461
+    },
+    {
+      "epoch": 0.4604724409448819,
+      "grad_norm": 15.331527709960938,
+      "learning_rate": 9.997703937007874e-06,
+      "loss": 0.2192,
+      "step": 1462
+    },
+    {
+      "epoch": 0.46078740157480313,
+      "grad_norm": 27.347505569458008,
+      "learning_rate": 9.997702362204725e-06,
+      "loss": 0.6414,
+      "step": 1463
+    },
+    {
+      "epoch": 0.4611023622047244,
+      "grad_norm": 49.80681610107422,
+      "learning_rate": 9.997700787401575e-06,
+      "loss": 0.2621,
+      "step": 1464
+    },
+    {
+      "epoch": 0.46141732283464565,
+      "grad_norm": 20.706789016723633,
+      "learning_rate": 9.997699212598426e-06,
+      "loss": 0.1548,
+      "step": 1465
+    },
+    {
+      "epoch": 0.4617322834645669,
+      "grad_norm": 59.39773941040039,
+      "learning_rate": 9.997697637795276e-06,
+      "loss": 0.514,
+      "step": 1466
+    },
+    {
+      "epoch": 0.46204724409448816,
+      "grad_norm": 29.0017147064209,
+      "learning_rate": 9.997696062992127e-06,
+      "loss": 0.4804,
+      "step": 1467
+    },
+    {
+      "epoch": 0.4623622047244095,
+      "grad_norm": 59.08629608154297,
+      "learning_rate": 9.997694488188977e-06,
+      "loss": 0.6536,
+      "step": 1468
+    },
+    {
+      "epoch": 0.46267716535433073,
+      "grad_norm": 39.09469223022461,
+      "learning_rate": 9.997692913385828e-06,
+      "loss": 0.3326,
+      "step": 1469
+    },
+    {
+      "epoch": 0.462992125984252,
+      "grad_norm": 14.819294929504395,
+      "learning_rate": 9.997691338582679e-06,
+      "loss": 0.164,
+      "step": 1470
+    },
+    {
+      "epoch": 0.46330708661417325,
+      "grad_norm": 23.74578094482422,
+      "learning_rate": 9.997689763779528e-06,
+      "loss": 0.3426,
+      "step": 1471
+    },
+    {
+      "epoch": 0.4636220472440945,
+      "grad_norm": 31.888647079467773,
+      "learning_rate": 9.997688188976379e-06,
+      "loss": 0.4113,
+      "step": 1472
+    },
+    {
+      "epoch": 0.46393700787401576,
+      "grad_norm": 42.581398010253906,
+      "learning_rate": 9.997686614173228e-06,
+      "loss": 0.3324,
+      "step": 1473
+    },
+    {
+      "epoch": 0.464251968503937,
+      "grad_norm": 45.32304382324219,
+      "learning_rate": 9.99768503937008e-06,
+      "loss": 0.5497,
+      "step": 1474
+    },
+    {
+      "epoch": 0.4645669291338583,
+      "grad_norm": 31.52800750732422,
+      "learning_rate": 9.99768346456693e-06,
+      "loss": 0.6702,
+      "step": 1475
+    },
+    {
+      "epoch": 0.46488188976377953,
+      "grad_norm": 41.72283172607422,
+      "learning_rate": 9.997681889763781e-06,
+      "loss": 0.2619,
+      "step": 1476
+    },
+    {
+      "epoch": 0.4651968503937008,
+      "grad_norm": 97.3839340209961,
+      "learning_rate": 9.99768031496063e-06,
+      "loss": 0.4892,
+      "step": 1477
+    },
+    {
+      "epoch": 0.46551181102362205,
+      "grad_norm": 39.59928512573242,
+      "learning_rate": 9.997678740157482e-06,
+      "loss": 0.5622,
+      "step": 1478
+    },
+    {
+      "epoch": 0.4658267716535433,
+      "grad_norm": 23.758737564086914,
+      "learning_rate": 9.997677165354331e-06,
+      "loss": 0.3074,
+      "step": 1479
+    },
+    {
+      "epoch": 0.46614173228346456,
+      "grad_norm": 39.63565444946289,
+      "learning_rate": 9.997675590551182e-06,
+      "loss": 0.3154,
+      "step": 1480
+    },
+    {
+      "epoch": 0.46614173228346456,
+      "eval_loss": 0.4589572548866272,
+      "eval_runtime": 341.5074,
+      "eval_samples_per_second": 0.343,
+      "eval_steps_per_second": 0.343,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4664566929133858,
+      "grad_norm": 19.873315811157227,
+      "learning_rate": 9.997674015748033e-06,
+      "loss": 0.2133,
+      "step": 1481
+    },
+    {
+      "epoch": 0.4667716535433071,
+      "grad_norm": 32.47504806518555,
+      "learning_rate": 9.997672440944882e-06,
+      "loss": 0.3143,
+      "step": 1482
+    },
+    {
+      "epoch": 0.46708661417322833,
+      "grad_norm": 27.075977325439453,
+      "learning_rate": 9.997670866141733e-06,
+      "loss": 0.315,
+      "step": 1483
+    },
+    {
+      "epoch": 0.4674015748031496,
+      "grad_norm": 18.573650360107422,
+      "learning_rate": 9.997669291338582e-06,
+      "loss": 0.1998,
+      "step": 1484
+    },
+    {
+      "epoch": 0.46771653543307085,
+      "grad_norm": 19.674373626708984,
+      "learning_rate": 9.997667716535433e-06,
+      "loss": 0.2505,
+      "step": 1485
+    },
+    {
+      "epoch": 0.4680314960629921,
+      "grad_norm": 40.249202728271484,
+      "learning_rate": 9.997666141732284e-06,
+      "loss": 0.4321,
+      "step": 1486
+    },
+    {
+      "epoch": 0.46834645669291336,
+      "grad_norm": 12.53464412689209,
+      "learning_rate": 9.997664566929135e-06,
+      "loss": 0.069,
+      "step": 1487
+    },
+    {
+      "epoch": 0.4686614173228347,
+      "grad_norm": 38.792728424072266,
+      "learning_rate": 9.997662992125985e-06,
+      "loss": 0.417,
+      "step": 1488
+    },
+    {
+      "epoch": 0.46897637795275593,
+      "grad_norm": 56.2455940246582,
+      "learning_rate": 9.997661417322836e-06,
+      "loss": 0.7037,
+      "step": 1489
+    },
+    {
+      "epoch": 0.4692913385826772,
+      "grad_norm": 47.736263275146484,
+      "learning_rate": 9.997659842519685e-06,
+      "loss": 0.2714,
+      "step": 1490
+    },
+    {
+      "epoch": 0.46960629921259844,
+      "grad_norm": 75.22129821777344,
+      "learning_rate": 9.997658267716536e-06,
+      "loss": 0.5145,
+      "step": 1491
+    },
+    {
+      "epoch": 0.4699212598425197,
+      "grad_norm": 21.2304744720459,
+      "learning_rate": 9.997656692913387e-06,
+      "loss": 0.2883,
+      "step": 1492
+    },
+    {
+      "epoch": 0.47023622047244096,
+      "grad_norm": 62.4765625,
+      "learning_rate": 9.997655118110236e-06,
+      "loss": 0.4325,
+      "step": 1493
+    },
+    {
+      "epoch": 0.4705511811023622,
+      "grad_norm": 49.23244094848633,
+      "learning_rate": 9.997653543307087e-06,
+      "loss": 0.6759,
+      "step": 1494
+    },
+    {
+      "epoch": 0.47086614173228347,
+      "grad_norm": 21.292394638061523,
+      "learning_rate": 9.997651968503938e-06,
+      "loss": 0.1809,
+      "step": 1495
+    },
+    {
+      "epoch": 0.47118110236220473,
+      "grad_norm": 57.66270446777344,
+      "learning_rate": 9.99765039370079e-06,
+      "loss": 0.7726,
+      "step": 1496
+    },
+    {
+      "epoch": 0.471496062992126,
+      "grad_norm": 61.524253845214844,
+      "learning_rate": 9.997648818897639e-06,
+      "loss": 0.3385,
+      "step": 1497
+    },
+    {
+      "epoch": 0.47181102362204724,
+      "grad_norm": 51.74467468261719,
+      "learning_rate": 9.99764724409449e-06,
+      "loss": 0.1262,
+      "step": 1498
+    },
+    {
+      "epoch": 0.4721259842519685,
+      "grad_norm": 64.45372009277344,
+      "learning_rate": 9.997645669291339e-06,
+      "loss": 0.5856,
+      "step": 1499
+    },
+    {
+      "epoch": 0.47244094488188976,
+      "grad_norm": 33.162811279296875,
+      "learning_rate": 9.99764409448819e-06,
+      "loss": 0.8524,
+      "step": 1500
+    },
+    {
+      "epoch": 0.47244094488188976,
+      "eval_loss": 0.4843520522117615,
+      "eval_runtime": 337.6077,
+      "eval_samples_per_second": 0.347,
+      "eval_steps_per_second": 0.347,
+      "step": 1500
+    },
+    {
+      "epoch": 0.472755905511811,
+      "grad_norm": 58.66725158691406,
+      "learning_rate": 9.99764251968504e-06,
+      "loss": 0.7346,
+      "step": 1501
+    },
+    {
+      "epoch": 0.47307086614173227,
+      "grad_norm": 113.04570770263672,
+      "learning_rate": 9.99764094488189e-06,
+      "loss": 0.4166,
+      "step": 1502
+    },
+    {
+      "epoch": 0.47338582677165353,
+      "grad_norm": 49.267024993896484,
+      "learning_rate": 9.997639370078741e-06,
+      "loss": 0.8303,
+      "step": 1503
+    },
+    {
+      "epoch": 0.4737007874015748,
+      "grad_norm": 18.804046630859375,
+      "learning_rate": 9.99763779527559e-06,
+      "loss": 0.2172,
+      "step": 1504
+    },
+    {
+      "epoch": 0.47401574803149604,
+      "grad_norm": 23.38251495361328,
+      "learning_rate": 9.997636220472441e-06,
+      "loss": 0.1723,
+      "step": 1505
+    },
+    {
+      "epoch": 0.4743307086614173,
+      "grad_norm": 32.252750396728516,
+      "learning_rate": 9.997634645669292e-06,
+      "loss": 0.1733,
+      "step": 1506
+    },
+    {
+      "epoch": 0.47464566929133856,
+      "grad_norm": 58.86079025268555,
+      "learning_rate": 9.997633070866143e-06,
+      "loss": 0.8325,
+      "step": 1507
+    },
+    {
+      "epoch": 0.47496062992125987,
+      "grad_norm": 18.70965003967285,
+      "learning_rate": 9.997631496062993e-06,
+      "loss": 0.1669,
+      "step": 1508
+    },
+    {
+      "epoch": 0.4752755905511811,
+      "grad_norm": 31.96597671508789,
+      "learning_rate": 9.997629921259844e-06,
+      "loss": 0.3987,
+      "step": 1509
+    },
+    {
+      "epoch": 0.4755905511811024,
+      "grad_norm": 21.83759880065918,
+      "learning_rate": 9.997628346456693e-06,
+      "loss": 0.1909,
+      "step": 1510
+    },
+    {
+      "epoch": 0.47590551181102364,
+      "grad_norm": 27.163360595703125,
+      "learning_rate": 9.997626771653544e-06,
+      "loss": 0.3432,
+      "step": 1511
+    },
+    {
+      "epoch": 0.4762204724409449,
+      "grad_norm": 81.91073608398438,
+      "learning_rate": 9.997625196850395e-06,
+      "loss": 0.4286,
+      "step": 1512
+    },
+    {
+      "epoch": 0.47653543307086615,
+      "grad_norm": 23.85365867614746,
+      "learning_rate": 9.997623622047244e-06,
+      "loss": 0.2956,
+      "step": 1513
+    },
+    {
+      "epoch": 0.4768503937007874,
+      "grad_norm": 20.397815704345703,
+      "learning_rate": 9.997622047244095e-06,
+      "loss": 0.4461,
+      "step": 1514
+    },
+    {
+      "epoch": 0.47716535433070867,
+      "grad_norm": 30.885562896728516,
+      "learning_rate": 9.997620472440945e-06,
+      "loss": 0.477,
+      "step": 1515
+    },
+    {
+      "epoch": 0.4774803149606299,
+      "grad_norm": 33.92880630493164,
+      "learning_rate": 9.997618897637796e-06,
+      "loss": 0.3881,
+      "step": 1516
+    },
+    {
+      "epoch": 0.4777952755905512,
+      "grad_norm": 24.906478881835938,
+      "learning_rate": 9.997617322834647e-06,
+      "loss": 0.3944,
+      "step": 1517
+    },
+    {
+      "epoch": 0.47811023622047244,
+      "grad_norm": 17.711050033569336,
+      "learning_rate": 9.997615748031498e-06,
+      "loss": 0.3327,
+      "step": 1518
+    },
+    {
+      "epoch": 0.4784251968503937,
+      "grad_norm": 28.30522918701172,
+      "learning_rate": 9.997614173228347e-06,
+      "loss": 0.4556,
+      "step": 1519
+    },
+    {
+      "epoch": 0.47874015748031495,
+      "grad_norm": 22.290746688842773,
+      "learning_rate": 9.997612598425198e-06,
+      "loss": 0.177,
+      "step": 1520
+    },
+    {
+      "epoch": 0.47874015748031495,
+      "eval_loss": 0.45914146304130554,
+      "eval_runtime": 337.9882,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4790551181102362,
+      "grad_norm": 51.145206451416016,
+      "learning_rate": 9.997611023622049e-06,
+      "loss": 0.2974,
+      "step": 1521
+    },
+    {
+      "epoch": 0.47937007874015747,
+      "grad_norm": 53.45519256591797,
+      "learning_rate": 9.997609448818898e-06,
+      "loss": 0.2501,
+      "step": 1522
+    },
+    {
+      "epoch": 0.4796850393700787,
+      "grad_norm": 41.410377502441406,
+      "learning_rate": 9.997607874015749e-06,
+      "loss": 0.5987,
+      "step": 1523
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 29.04597282409668,
+      "learning_rate": 9.997606299212598e-06,
+      "loss": 0.3196,
+      "step": 1524
+    },
+    {
+      "epoch": 0.48031496062992124,
+      "grad_norm": 37.950008392333984,
+      "learning_rate": 9.99760472440945e-06,
+      "loss": 0.444,
+      "step": 1525
+    },
+    {
+      "epoch": 0.4806299212598425,
+      "grad_norm": 47.16240692138672,
+      "learning_rate": 9.9976031496063e-06,
+      "loss": 0.6532,
+      "step": 1526
+    },
+    {
+      "epoch": 0.48094488188976375,
+      "grad_norm": 33.97493362426758,
+      "learning_rate": 9.997601574803151e-06,
+      "loss": 0.4194,
+      "step": 1527
+    },
+    {
+      "epoch": 0.48125984251968507,
+      "grad_norm": 36.6172981262207,
+      "learning_rate": 9.9976e-06,
+      "loss": 0.4505,
+      "step": 1528
+    },
+    {
+      "epoch": 0.4815748031496063,
+      "grad_norm": 26.39623260498047,
+      "learning_rate": 9.997598425196852e-06,
+      "loss": 0.2039,
+      "step": 1529
+    },
+    {
+      "epoch": 0.4818897637795276,
+      "grad_norm": 24.622024536132812,
+      "learning_rate": 9.997596850393701e-06,
+      "loss": 0.2808,
+      "step": 1530
+    },
+    {
+      "epoch": 0.48220472440944884,
+      "grad_norm": 58.132904052734375,
+      "learning_rate": 9.997595275590552e-06,
+      "loss": 0.4782,
+      "step": 1531
+    },
+    {
+      "epoch": 0.4825196850393701,
+      "grad_norm": 53.385154724121094,
+      "learning_rate": 9.997593700787403e-06,
+      "loss": 0.4064,
+      "step": 1532
+    },
+    {
+      "epoch": 0.48283464566929135,
+      "grad_norm": 18.42597007751465,
+      "learning_rate": 9.997592125984252e-06,
+      "loss": 0.0884,
+      "step": 1533
+    },
+    {
+      "epoch": 0.4831496062992126,
+      "grad_norm": 68.61125183105469,
+      "learning_rate": 9.997590551181103e-06,
+      "loss": 0.52,
+      "step": 1534
+    },
+    {
+      "epoch": 0.48346456692913387,
+      "grad_norm": 19.761259078979492,
+      "learning_rate": 9.997588976377952e-06,
+      "loss": 0.1054,
+      "step": 1535
+    },
+    {
+      "epoch": 0.4837795275590551,
+      "grad_norm": 20.919004440307617,
+      "learning_rate": 9.997587401574803e-06,
+      "loss": 0.2819,
+      "step": 1536
+    },
+    {
+      "epoch": 0.4840944881889764,
+      "grad_norm": 24.11286735534668,
+      "learning_rate": 9.997585826771654e-06,
+      "loss": 0.2413,
+      "step": 1537
+    },
+    {
+      "epoch": 0.48440944881889764,
+      "grad_norm": 34.09194564819336,
+      "learning_rate": 9.997584251968505e-06,
+      "loss": 0.2417,
+      "step": 1538
+    },
+    {
+      "epoch": 0.4847244094488189,
+      "grad_norm": 53.1319580078125,
+      "learning_rate": 9.997582677165355e-06,
+      "loss": 0.5983,
+      "step": 1539
+    },
+    {
+      "epoch": 0.48503937007874015,
+      "grad_norm": 23.620807647705078,
+      "learning_rate": 9.997581102362206e-06,
+      "loss": 0.1747,
+      "step": 1540
+    },
+    {
+      "epoch": 0.48503937007874015,
+      "eval_loss": 0.453780859708786,
+      "eval_runtime": 336.017,
+      "eval_samples_per_second": 0.348,
+      "eval_steps_per_second": 0.348,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4853543307086614,
+      "grad_norm": 45.581031799316406,
+      "learning_rate": 9.997579527559057e-06,
+      "loss": 0.4102,
+      "step": 1541
+    },
+    {
+      "epoch": 0.48566929133858266,
+      "grad_norm": 49.25587463378906,
+      "learning_rate": 9.997577952755906e-06,
+      "loss": 0.71,
+      "step": 1542
+    },
+    {
+      "epoch": 0.4859842519685039,
+      "grad_norm": 4.6258649826049805,
+      "learning_rate": 9.997576377952757e-06,
+      "loss": 0.0368,
+      "step": 1543
+    },
+    {
+      "epoch": 0.4862992125984252,
+      "grad_norm": 17.629261016845703,
+      "learning_rate": 9.997574803149606e-06,
+      "loss": 0.2871,
+      "step": 1544
+    },
+    {
+      "epoch": 0.48661417322834644,
+      "grad_norm": 53.029666900634766,
+      "learning_rate": 9.997573228346457e-06,
+      "loss": 0.4007,
+      "step": 1545
+    },
+    {
+      "epoch": 0.4869291338582677,
+      "grad_norm": 25.097759246826172,
+      "learning_rate": 9.997571653543308e-06,
+      "loss": 0.2665,
+      "step": 1546
+    },
+    {
+      "epoch": 0.48724409448818895,
+      "grad_norm": 59.07927703857422,
+      "learning_rate": 9.99757007874016e-06,
+      "loss": 0.8133,
+      "step": 1547
+    },
+    {
+      "epoch": 0.48755905511811026,
+      "grad_norm": 39.4317741394043,
+      "learning_rate": 9.997568503937009e-06,
+      "loss": 0.1684,
+      "step": 1548
+    },
+    {
+      "epoch": 0.4878740157480315,
+      "grad_norm": 61.61114501953125,
+      "learning_rate": 9.99756692913386e-06,
+      "loss": 0.5758,
+      "step": 1549
+    },
+    {
+      "epoch": 0.4881889763779528,
+      "grad_norm": 57.046142578125,
+      "learning_rate": 9.997565354330709e-06,
+      "loss": 0.4657,
+      "step": 1550
+    },
+    {
+      "epoch": 0.48850393700787403,
+      "grad_norm": 28.860565185546875,
+      "learning_rate": 9.99756377952756e-06,
+      "loss": 0.4151,
+      "step": 1551
+    },
+    {
+      "epoch": 0.4888188976377953,
+      "grad_norm": 92.83087158203125,
+      "learning_rate": 9.997562204724411e-06,
+      "loss": 0.7444,
+      "step": 1552
+    },
+    {
+      "epoch": 0.48913385826771655,
+      "grad_norm": 41.72923278808594,
+      "learning_rate": 9.99756062992126e-06,
+      "loss": 0.7972,
+      "step": 1553
+    },
+    {
+      "epoch": 0.4894488188976378,
+      "grad_norm": 43.05347442626953,
+      "learning_rate": 9.997559055118111e-06,
+      "loss": 0.1623,
+      "step": 1554
+    },
+    {
+      "epoch": 0.48976377952755906,
+      "grad_norm": 37.73609161376953,
+      "learning_rate": 9.99755748031496e-06,
+      "loss": 0.4308,
+      "step": 1555
+    },
+    {
+      "epoch": 0.4900787401574803,
+      "grad_norm": 22.235315322875977,
+      "learning_rate": 9.997555905511811e-06,
+      "loss": 0.2682,
+      "step": 1556
+    },
+    {
+      "epoch": 0.4903937007874016,
+      "grad_norm": 4.401269435882568,
+      "learning_rate": 9.997554330708662e-06,
+      "loss": 0.0164,
+      "step": 1557
+    },
+    {
+      "epoch": 0.49070866141732283,
+      "grad_norm": 44.11044692993164,
+      "learning_rate": 9.997552755905513e-06,
+      "loss": 0.3743,
+      "step": 1558
+    },
+    {
+      "epoch": 0.4910236220472441,
+      "grad_norm": 38.67685317993164,
+      "learning_rate": 9.997551181102363e-06,
+      "loss": 0.3507,
+      "step": 1559
+    },
+    {
+      "epoch": 0.49133858267716535,
+      "grad_norm": 20.494264602661133,
+      "learning_rate": 9.997549606299214e-06,
+      "loss": 0.3759,
+      "step": 1560
+    },
+    {
+      "epoch": 0.49133858267716535,
+      "eval_loss": 0.43347296118736267,
+      "eval_runtime": 339.139,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1560
+    },
+    {
+      "epoch": 0.4916535433070866,
+      "grad_norm": 21.03292465209961,
+      "learning_rate": 9.997548031496063e-06,
+      "loss": 0.2048,
+      "step": 1561
+    },
+    {
+      "epoch": 0.49196850393700786,
+      "grad_norm": 40.08341979980469,
+      "learning_rate": 9.997546456692914e-06,
+      "loss": 0.1279,
+      "step": 1562
+    },
+    {
+      "epoch": 0.4922834645669291,
+      "grad_norm": 31.835474014282227,
+      "learning_rate": 9.997544881889765e-06,
+      "loss": 0.26,
+      "step": 1563
+    },
+    {
+      "epoch": 0.4925984251968504,
+      "grad_norm": 26.725603103637695,
+      "learning_rate": 9.997543307086614e-06,
+      "loss": 0.2091,
+      "step": 1564
+    },
+    {
+      "epoch": 0.49291338582677163,
+      "grad_norm": 33.68645095825195,
+      "learning_rate": 9.997541732283465e-06,
+      "loss": 0.2875,
+      "step": 1565
+    },
+    {
+      "epoch": 0.4932283464566929,
+      "grad_norm": 56.92742156982422,
+      "learning_rate": 9.997540157480316e-06,
+      "loss": 1.1372,
+      "step": 1566
+    },
+    {
+      "epoch": 0.49354330708661415,
+      "grad_norm": 10.900672912597656,
+      "learning_rate": 9.997538582677167e-06,
+      "loss": 0.0537,
+      "step": 1567
+    },
+    {
+      "epoch": 0.49385826771653546,
+      "grad_norm": 20.076473236083984,
+      "learning_rate": 9.997537007874017e-06,
+      "loss": 0.1373,
+      "step": 1568
+    },
+    {
+      "epoch": 0.4941732283464567,
+      "grad_norm": 30.94587516784668,
+      "learning_rate": 9.997535433070868e-06,
+      "loss": 0.348,
+      "step": 1569
+    },
+    {
+      "epoch": 0.494488188976378,
+      "grad_norm": 79.50629425048828,
+      "learning_rate": 9.997533858267717e-06,
+      "loss": 0.9603,
+      "step": 1570
+    },
+    {
+      "epoch": 0.49480314960629923,
+      "grad_norm": 51.20693588256836,
+      "learning_rate": 9.997532283464568e-06,
+      "loss": 1.1457,
+      "step": 1571
+    },
+    {
+      "epoch": 0.4951181102362205,
+      "grad_norm": 34.06742477416992,
+      "learning_rate": 9.997530708661419e-06,
+      "loss": 0.4457,
+      "step": 1572
+    },
+    {
+      "epoch": 0.49543307086614174,
+      "grad_norm": 40.76358413696289,
+      "learning_rate": 9.997529133858268e-06,
+      "loss": 0.2998,
+      "step": 1573
+    },
+    {
+      "epoch": 0.495748031496063,
+      "grad_norm": 23.580713272094727,
+      "learning_rate": 9.997527559055119e-06,
+      "loss": 0.1263,
+      "step": 1574
+    },
+    {
+      "epoch": 0.49606299212598426,
+      "grad_norm": 6.127594470977783,
+      "learning_rate": 9.997525984251968e-06,
+      "loss": 0.0331,
+      "step": 1575
+    },
+    {
+      "epoch": 0.4963779527559055,
+      "grad_norm": 28.260459899902344,
+      "learning_rate": 9.99752440944882e-06,
+      "loss": 0.3719,
+      "step": 1576
+    },
+    {
+      "epoch": 0.4966929133858268,
+      "grad_norm": 68.38239288330078,
+      "learning_rate": 9.99752283464567e-06,
+      "loss": 0.6032,
+      "step": 1577
+    },
+    {
+      "epoch": 0.49700787401574803,
+      "grad_norm": 60.19215393066406,
+      "learning_rate": 9.997521259842521e-06,
+      "loss": 0.2563,
+      "step": 1578
+    },
+    {
+      "epoch": 0.4973228346456693,
+      "grad_norm": 40.08391571044922,
+      "learning_rate": 9.99751968503937e-06,
+      "loss": 0.2775,
+      "step": 1579
+    },
+    {
+      "epoch": 0.49763779527559054,
+      "grad_norm": 37.87879180908203,
+      "learning_rate": 9.997518110236222e-06,
+      "loss": 0.117,
+      "step": 1580
+    },
+    {
+      "epoch": 0.49763779527559054,
+      "eval_loss": 0.5100625157356262,
+      "eval_runtime": 338.9567,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4979527559055118,
+      "grad_norm": 43.88140869140625,
+      "learning_rate": 9.997516535433071e-06,
+      "loss": 0.7505,
+      "step": 1581
+    },
+    {
+      "epoch": 0.49826771653543306,
+      "grad_norm": 10.613367080688477,
+      "learning_rate": 9.997514960629922e-06,
+      "loss": 0.0331,
+      "step": 1582
+    },
+    {
+      "epoch": 0.4985826771653543,
+      "grad_norm": 115.95816040039062,
+      "learning_rate": 9.997513385826773e-06,
+      "loss": 0.6034,
+      "step": 1583
+    },
+    {
+      "epoch": 0.4988976377952756,
+      "grad_norm": 52.247474670410156,
+      "learning_rate": 9.997511811023622e-06,
+      "loss": 0.7637,
+      "step": 1584
+    },
+    {
+      "epoch": 0.49921259842519683,
+      "grad_norm": 49.17156982421875,
+      "learning_rate": 9.997510236220473e-06,
+      "loss": 0.2622,
+      "step": 1585
+    },
+    {
+      "epoch": 0.4995275590551181,
+      "grad_norm": 52.29065704345703,
+      "learning_rate": 9.997508661417323e-06,
+      "loss": 0.2516,
+      "step": 1586
+    },
+    {
+      "epoch": 0.49984251968503934,
+      "grad_norm": 4.530742168426514,
+      "learning_rate": 9.997507086614175e-06,
+      "loss": 0.039,
+      "step": 1587
+    },
+    {
+      "epoch": 0.5001574803149607,
+      "grad_norm": 54.83158874511719,
+      "learning_rate": 9.997505511811024e-06,
+      "loss": 1.2184,
+      "step": 1588
+    },
+    {
+      "epoch": 0.5004724409448819,
+      "grad_norm": 80.9505615234375,
+      "learning_rate": 9.997503937007875e-06,
+      "loss": 0.6721,
+      "step": 1589
+    },
+    {
+      "epoch": 0.5007874015748032,
+      "grad_norm": 88.2104263305664,
+      "learning_rate": 9.997502362204725e-06,
+      "loss": 0.8541,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5011023622047244,
+      "grad_norm": 54.88927459716797,
+      "learning_rate": 9.997500787401576e-06,
+      "loss": 0.7247,
+      "step": 1591
+    },
+    {
+      "epoch": 0.5014173228346457,
+      "grad_norm": 58.55228805541992,
+      "learning_rate": 9.997499212598427e-06,
+      "loss": 0.6949,
+      "step": 1592
+    },
+    {
+      "epoch": 0.5017322834645669,
+      "grad_norm": 70.87881469726562,
+      "learning_rate": 9.997497637795276e-06,
+      "loss": 0.5249,
+      "step": 1593
+    },
+    {
+      "epoch": 0.5020472440944882,
+      "grad_norm": 49.13249206542969,
+      "learning_rate": 9.997496062992127e-06,
+      "loss": 0.3263,
+      "step": 1594
+    },
+    {
+      "epoch": 0.5023622047244094,
+      "grad_norm": 44.89517593383789,
+      "learning_rate": 9.997494488188976e-06,
+      "loss": 0.2266,
+      "step": 1595
+    },
+    {
+      "epoch": 0.5026771653543307,
+      "grad_norm": 34.858665466308594,
+      "learning_rate": 9.997492913385827e-06,
+      "loss": 0.3243,
+      "step": 1596
+    },
+    {
+      "epoch": 0.5029921259842519,
+      "grad_norm": 48.156105041503906,
+      "learning_rate": 9.997491338582678e-06,
+      "loss": 0.6439,
+      "step": 1597
+    },
+    {
+      "epoch": 0.5033070866141732,
+      "grad_norm": 23.67864418029785,
+      "learning_rate": 9.99748976377953e-06,
+      "loss": 0.4013,
+      "step": 1598
+    },
+    {
+      "epoch": 0.5036220472440945,
+      "grad_norm": 33.883583068847656,
+      "learning_rate": 9.997488188976379e-06,
+      "loss": 0.3123,
+      "step": 1599
+    },
+    {
+      "epoch": 0.5039370078740157,
+      "grad_norm": 14.358415603637695,
+      "learning_rate": 9.99748661417323e-06,
+      "loss": 0.1317,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5039370078740157,
+      "eval_loss": 0.46397241950035095,
+      "eval_runtime": 338.8895,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1600
+    },
+    {
+      "epoch": 0.504251968503937,
+      "grad_norm": 29.868349075317383,
+      "learning_rate": 9.997485039370079e-06,
+      "loss": 0.2516,
+      "step": 1601
+    },
+    {
+      "epoch": 0.5045669291338583,
+      "grad_norm": 21.991357803344727,
+      "learning_rate": 9.99748346456693e-06,
+      "loss": 0.2228,
+      "step": 1602
+    },
+    {
+      "epoch": 0.5048818897637796,
+      "grad_norm": 15.654339790344238,
+      "learning_rate": 9.997481889763781e-06,
+      "loss": 0.1345,
+      "step": 1603
+    },
+    {
+      "epoch": 0.5051968503937008,
+      "grad_norm": 15.310891151428223,
+      "learning_rate": 9.99748031496063e-06,
+      "loss": 0.1679,
+      "step": 1604
+    },
+    {
+      "epoch": 0.5055118110236221,
+      "grad_norm": 38.296146392822266,
+      "learning_rate": 9.997478740157481e-06,
+      "loss": 0.3338,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5058267716535433,
+      "grad_norm": 43.396427154541016,
+      "learning_rate": 9.99747716535433e-06,
+      "loss": 0.4996,
+      "step": 1606
+    },
+    {
+      "epoch": 0.5061417322834646,
+      "grad_norm": 94.86878204345703,
+      "learning_rate": 9.997475590551181e-06,
+      "loss": 0.8225,
+      "step": 1607
+    },
+    {
+      "epoch": 0.5064566929133858,
+      "grad_norm": 52.50116729736328,
+      "learning_rate": 9.997474015748032e-06,
+      "loss": 0.3828,
+      "step": 1608
+    },
+    {
+      "epoch": 0.5067716535433071,
+      "grad_norm": 38.4481086730957,
+      "learning_rate": 9.997472440944883e-06,
+      "loss": 0.1611,
+      "step": 1609
+    },
+    {
+      "epoch": 0.5070866141732283,
+      "grad_norm": 49.7927131652832,
+      "learning_rate": 9.997470866141733e-06,
+      "loss": 1.0682,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5074015748031496,
+      "grad_norm": 8.284367561340332,
+      "learning_rate": 9.997469291338584e-06,
+      "loss": 0.0571,
+      "step": 1611
+    },
+    {
+      "epoch": 0.5077165354330708,
+      "grad_norm": 24.673439025878906,
+      "learning_rate": 9.997467716535435e-06,
+      "loss": 0.1879,
+      "step": 1612
+    },
+    {
+      "epoch": 0.5080314960629921,
+      "grad_norm": 33.2513542175293,
+      "learning_rate": 9.997466141732284e-06,
+      "loss": 0.636,
+      "step": 1613
+    },
+    {
+      "epoch": 0.5083464566929133,
+      "grad_norm": 62.058738708496094,
+      "learning_rate": 9.997464566929135e-06,
+      "loss": 0.878,
+      "step": 1614
+    },
+    {
+      "epoch": 0.5086614173228347,
+      "grad_norm": 79.89865112304688,
+      "learning_rate": 9.997462992125984e-06,
+      "loss": 0.2207,
+      "step": 1615
+    },
+    {
+      "epoch": 0.5089763779527559,
+      "grad_norm": 37.931819915771484,
+      "learning_rate": 9.997461417322835e-06,
+      "loss": 0.1365,
+      "step": 1616
+    },
+    {
+      "epoch": 0.5092913385826772,
+      "grad_norm": 30.21257209777832,
+      "learning_rate": 9.997459842519686e-06,
+      "loss": 0.4815,
+      "step": 1617
+    },
+    {
+      "epoch": 0.5096062992125985,
+      "grad_norm": 35.088890075683594,
+      "learning_rate": 9.997458267716537e-06,
+      "loss": 0.2641,
+      "step": 1618
+    },
+    {
+      "epoch": 0.5099212598425197,
+      "grad_norm": 31.518869400024414,
+      "learning_rate": 9.997456692913387e-06,
+      "loss": 0.3096,
+      "step": 1619
+    },
+    {
+      "epoch": 0.510236220472441,
+      "grad_norm": 43.287391662597656,
+      "learning_rate": 9.997455118110238e-06,
+      "loss": 0.4844,
+      "step": 1620
+    },
+    {
+      "epoch": 0.510236220472441,
+      "eval_loss": 0.5515217781066895,
+      "eval_runtime": 339.8588,
+      "eval_samples_per_second": 0.344,
+      "eval_steps_per_second": 0.344,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5105511811023622,
+      "grad_norm": 20.773025512695312,
+      "learning_rate": 9.997453543307087e-06,
+      "loss": 0.0673,
+      "step": 1621
+    },
+    {
+      "epoch": 0.5108661417322835,
+      "grad_norm": 22.52477264404297,
+      "learning_rate": 9.997451968503938e-06,
+      "loss": 0.2089,
+      "step": 1622
+    },
+    {
+      "epoch": 0.5111811023622047,
+      "grad_norm": 21.807056427001953,
+      "learning_rate": 9.997450393700789e-06,
+      "loss": 0.1542,
+      "step": 1623
+    },
+    {
+      "epoch": 0.511496062992126,
+      "grad_norm": 60.92094802856445,
+      "learning_rate": 9.997448818897638e-06,
+      "loss": 0.9874,
+      "step": 1624
+    },
+    {
+      "epoch": 0.5118110236220472,
+      "grad_norm": 100.76142120361328,
+      "learning_rate": 9.997447244094489e-06,
+      "loss": 0.3835,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5121259842519685,
+      "grad_norm": 36.08515167236328,
+      "learning_rate": 9.997445669291338e-06,
+      "loss": 0.3638,
+      "step": 1626
+    },
+    {
+      "epoch": 0.5124409448818897,
+      "grad_norm": 36.939170837402344,
+      "learning_rate": 9.99744409448819e-06,
+      "loss": 0.7436,
+      "step": 1627
+    },
+    {
+      "epoch": 0.512755905511811,
+      "grad_norm": 48.17253494262695,
+      "learning_rate": 9.99744251968504e-06,
+      "loss": 0.4945,
+      "step": 1628
+    },
+    {
+      "epoch": 0.5130708661417323,
+      "grad_norm": 79.20890808105469,
+      "learning_rate": 9.997440944881891e-06,
+      "loss": 0.8642,
+      "step": 1629
+    },
+    {
+      "epoch": 0.5133858267716536,
+      "grad_norm": 19.723230361938477,
+      "learning_rate": 9.99743937007874e-06,
+      "loss": 0.1813,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5137007874015748,
+      "grad_norm": 39.59589385986328,
+      "learning_rate": 9.997437795275592e-06,
+      "loss": 0.3752,
+      "step": 1631
+    },
+    {
+      "epoch": 0.5140157480314961,
+      "grad_norm": 77.0748062133789,
+      "learning_rate": 9.997436220472441e-06,
+      "loss": 0.6325,
+      "step": 1632
+    },
+    {
+      "epoch": 0.5143307086614173,
+      "grad_norm": 20.3349552154541,
+      "learning_rate": 9.997434645669292e-06,
+      "loss": 0.2886,
+      "step": 1633
+    },
+    {
+      "epoch": 0.5146456692913386,
+      "grad_norm": 46.71043014526367,
+      "learning_rate": 9.997433070866143e-06,
+      "loss": 0.9402,
+      "step": 1634
+    },
+    {
+      "epoch": 0.5149606299212598,
+      "grad_norm": 34.89626693725586,
+      "learning_rate": 9.997431496062992e-06,
+      "loss": 0.4485,
+      "step": 1635
+    },
+    {
+      "epoch": 0.5152755905511811,
+      "grad_norm": 28.397123336791992,
+      "learning_rate": 9.997429921259843e-06,
+      "loss": 0.3992,
+      "step": 1636
+    },
+    {
+      "epoch": 0.5155905511811023,
+      "grad_norm": 50.874427795410156,
+      "learning_rate": 9.997428346456694e-06,
+      "loss": 0.9479,
+      "step": 1637
+    },
+    {
+      "epoch": 0.5159055118110236,
+      "grad_norm": 38.96868896484375,
+      "learning_rate": 9.997426771653545e-06,
+      "loss": 0.5054,
+      "step": 1638
+    },
+    {
+      "epoch": 0.5162204724409449,
+      "grad_norm": 7.598008155822754,
+      "learning_rate": 9.997425196850395e-06,
+      "loss": 0.0787,
+      "step": 1639
+    },
+    {
+      "epoch": 0.5165354330708661,
+      "grad_norm": 24.391128540039062,
+      "learning_rate": 9.997423622047246e-06,
+      "loss": 0.1297,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5165354330708661,
+      "eval_loss": 0.4850241541862488,
+      "eval_runtime": 337.9062,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5168503937007874,
+      "grad_norm": 21.02195167541504,
+      "learning_rate": 9.997422047244095e-06,
+      "loss": 0.3174,
+      "step": 1641
+    },
+    {
+      "epoch": 0.5171653543307086,
+      "grad_norm": 19.174169540405273,
+      "learning_rate": 9.997420472440946e-06,
+      "loss": 0.4078,
+      "step": 1642
+    },
+    {
+      "epoch": 0.51748031496063,
+      "grad_norm": 35.037322998046875,
+      "learning_rate": 9.997418897637797e-06,
+      "loss": 0.3795,
+      "step": 1643
+    },
+    {
+      "epoch": 0.5177952755905512,
+      "grad_norm": 19.232954025268555,
+      "learning_rate": 9.997417322834646e-06,
+      "loss": 0.4354,
+      "step": 1644
+    },
+    {
+      "epoch": 0.5181102362204725,
+      "grad_norm": 41.107784271240234,
+      "learning_rate": 9.997415748031497e-06,
+      "loss": 0.1806,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5184251968503937,
+      "grad_norm": 31.30357551574707,
+      "learning_rate": 9.997414173228346e-06,
+      "loss": 0.5762,
+      "step": 1646
+    },
+    {
+      "epoch": 0.518740157480315,
+      "grad_norm": 27.336881637573242,
+      "learning_rate": 9.997412598425197e-06,
+      "loss": 0.5801,
+      "step": 1647
+    },
+    {
+      "epoch": 0.5190551181102362,
+      "grad_norm": 43.55338668823242,
+      "learning_rate": 9.997411023622048e-06,
+      "loss": 0.9297,
+      "step": 1648
+    },
+    {
+      "epoch": 0.5193700787401575,
+      "grad_norm": 33.737937927246094,
+      "learning_rate": 9.9974094488189e-06,
+      "loss": 0.3568,
+      "step": 1649
+    },
+    {
+      "epoch": 0.5196850393700787,
+      "grad_norm": 24.022850036621094,
+      "learning_rate": 9.997407874015749e-06,
+      "loss": 0.1614,
+      "step": 1650
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 27.58238410949707,
+      "learning_rate": 9.9974062992126e-06,
+      "loss": 0.3589,
+      "step": 1651
+    },
+    {
+      "epoch": 0.5203149606299212,
+      "grad_norm": 41.2049446105957,
+      "learning_rate": 9.997404724409449e-06,
+      "loss": 0.2244,
+      "step": 1652
+    },
+    {
+      "epoch": 0.5206299212598425,
+      "grad_norm": 19.164548873901367,
+      "learning_rate": 9.9974031496063e-06,
+      "loss": 0.3534,
+      "step": 1653
+    },
+    {
+      "epoch": 0.5209448818897637,
+      "grad_norm": 24.569271087646484,
+      "learning_rate": 9.997401574803151e-06,
+      "loss": 0.3207,
+      "step": 1654
+    },
+    {
+      "epoch": 0.521259842519685,
+      "grad_norm": 24.17620849609375,
+      "learning_rate": 9.9974e-06,
+      "loss": 0.2762,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5215748031496062,
+      "grad_norm": 45.885250091552734,
+      "learning_rate": 9.997398425196851e-06,
+      "loss": 0.1903,
+      "step": 1656
+    },
+    {
+      "epoch": 0.5218897637795276,
+      "grad_norm": 10.778152465820312,
+      "learning_rate": 9.997396850393702e-06,
+      "loss": 0.0588,
+      "step": 1657
+    },
+    {
+      "epoch": 0.5222047244094489,
+      "grad_norm": 36.7594108581543,
+      "learning_rate": 9.997395275590553e-06,
+      "loss": 0.2227,
+      "step": 1658
+    },
+    {
+      "epoch": 0.5225196850393701,
+      "grad_norm": 13.830334663391113,
+      "learning_rate": 9.997393700787402e-06,
+      "loss": 0.2058,
+      "step": 1659
+    },
+    {
+      "epoch": 0.5228346456692914,
+      "grad_norm": 70.07479095458984,
+      "learning_rate": 9.997392125984253e-06,
+      "loss": 0.3415,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5228346456692914,
+      "eval_loss": 0.4270039498806,
+      "eval_runtime": 338.5531,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5231496062992126,
+      "grad_norm": 54.377689361572266,
+      "learning_rate": 9.997390551181103e-06,
+      "loss": 0.699,
+      "step": 1661
+    },
+    {
+      "epoch": 0.5234645669291339,
+      "grad_norm": 20.332332611083984,
+      "learning_rate": 9.997388976377954e-06,
+      "loss": 0.3042,
+      "step": 1662
+    },
+    {
+      "epoch": 0.5237795275590551,
+      "grad_norm": 29.605314254760742,
+      "learning_rate": 9.997387401574805e-06,
+      "loss": 0.616,
+      "step": 1663
+    },
+    {
+      "epoch": 0.5240944881889764,
+      "grad_norm": 26.555742263793945,
+      "learning_rate": 9.997385826771654e-06,
+      "loss": 0.2776,
+      "step": 1664
+    },
+    {
+      "epoch": 0.5244094488188976,
+      "grad_norm": 24.46210479736328,
+      "learning_rate": 9.997384251968505e-06,
+      "loss": 0.3006,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5247244094488189,
+      "grad_norm": 135.4957275390625,
+      "learning_rate": 9.997382677165354e-06,
+      "loss": 0.4272,
+      "step": 1666
+    },
+    {
+      "epoch": 0.5250393700787401,
+      "grad_norm": 8.375167846679688,
+      "learning_rate": 9.997381102362205e-06,
+      "loss": 0.0964,
+      "step": 1667
+    },
+    {
+      "epoch": 0.5253543307086614,
+      "grad_norm": 17.8012752532959,
+      "learning_rate": 9.997379527559056e-06,
+      "loss": 0.1773,
+      "step": 1668
+    },
+    {
+      "epoch": 0.5256692913385826,
+      "grad_norm": 43.08966064453125,
+      "learning_rate": 9.997377952755907e-06,
+      "loss": 0.3974,
+      "step": 1669
+    },
+    {
+      "epoch": 0.525984251968504,
+      "grad_norm": 34.40397644042969,
+      "learning_rate": 9.997376377952757e-06,
+      "loss": 0.3196,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5262992125984252,
+      "grad_norm": 16.06707000732422,
+      "learning_rate": 9.997374803149608e-06,
+      "loss": 0.1063,
+      "step": 1671
+    },
+    {
+      "epoch": 0.5266141732283465,
+      "grad_norm": 12.523601531982422,
+      "learning_rate": 9.997373228346457e-06,
+      "loss": 0.1203,
+      "step": 1672
+    },
+    {
+      "epoch": 0.5269291338582677,
+      "grad_norm": 43.67927169799805,
+      "learning_rate": 9.997371653543308e-06,
+      "loss": 0.285,
+      "step": 1673
+    },
+    {
+      "epoch": 0.527244094488189,
+      "grad_norm": 58.05824279785156,
+      "learning_rate": 9.997370078740159e-06,
+      "loss": 0.2903,
+      "step": 1674
+    },
+    {
+      "epoch": 0.5275590551181102,
+      "grad_norm": 52.33161163330078,
+      "learning_rate": 9.997368503937008e-06,
+      "loss": 0.3971,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5278740157480315,
+      "grad_norm": 43.415462493896484,
+      "learning_rate": 9.997366929133859e-06,
+      "loss": 0.2529,
+      "step": 1676
+    },
+    {
+      "epoch": 0.5281889763779527,
+      "grad_norm": 37.16794967651367,
+      "learning_rate": 9.997365354330708e-06,
+      "loss": 0.6718,
+      "step": 1677
+    },
+    {
+      "epoch": 0.528503937007874,
+      "grad_norm": 19.802600860595703,
+      "learning_rate": 9.99736377952756e-06,
+      "loss": 0.1329,
+      "step": 1678
+    },
+    {
+      "epoch": 0.5288188976377953,
+      "grad_norm": 18.722341537475586,
+      "learning_rate": 9.99736220472441e-06,
+      "loss": 0.0449,
+      "step": 1679
+    },
+    {
+      "epoch": 0.5291338582677165,
+      "grad_norm": 30.56394386291504,
+      "learning_rate": 9.997360629921261e-06,
+      "loss": 0.0851,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5291338582677165,
+      "eval_loss": 0.5506137013435364,
+      "eval_runtime": 340.4385,
+      "eval_samples_per_second": 0.344,
+      "eval_steps_per_second": 0.344,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5294488188976378,
+      "grad_norm": 15.041778564453125,
+      "learning_rate": 9.99735905511811e-06,
+      "loss": 0.0869,
+      "step": 1681
+    },
+    {
+      "epoch": 0.529763779527559,
+      "grad_norm": 81.58389282226562,
+      "learning_rate": 9.997357480314962e-06,
+      "loss": 0.6074,
+      "step": 1682
+    },
+    {
+      "epoch": 0.5300787401574804,
+      "grad_norm": 99.90658569335938,
+      "learning_rate": 9.997355905511813e-06,
+      "loss": 0.2411,
+      "step": 1683
+    },
+    {
+      "epoch": 0.5303937007874016,
+      "grad_norm": 25.98404312133789,
+      "learning_rate": 9.997354330708662e-06,
+      "loss": 0.1055,
+      "step": 1684
+    },
+    {
+      "epoch": 0.5307086614173229,
+      "grad_norm": 34.40901184082031,
+      "learning_rate": 9.997352755905513e-06,
+      "loss": 0.4056,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5310236220472441,
+      "grad_norm": 12.796487808227539,
+      "learning_rate": 9.997351181102362e-06,
+      "loss": 0.0303,
+      "step": 1686
+    },
+    {
+      "epoch": 0.5313385826771654,
+      "grad_norm": 14.561634063720703,
+      "learning_rate": 9.997349606299213e-06,
+      "loss": 0.0729,
+      "step": 1687
+    },
+    {
+      "epoch": 0.5316535433070866,
+      "grad_norm": 38.87438201904297,
+      "learning_rate": 9.997348031496064e-06,
+      "loss": 0.1667,
+      "step": 1688
+    },
+    {
+      "epoch": 0.5319685039370079,
+      "grad_norm": 39.482303619384766,
+      "learning_rate": 9.997346456692915e-06,
+      "loss": 0.0675,
+      "step": 1689
+    },
+    {
+      "epoch": 0.5322834645669291,
+      "grad_norm": 162.6251220703125,
+      "learning_rate": 9.997344881889765e-06,
+      "loss": 0.5522,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5325984251968504,
+      "grad_norm": 10.552274703979492,
+      "learning_rate": 9.997343307086616e-06,
+      "loss": 0.036,
+      "step": 1691
+    },
+    {
+      "epoch": 0.5329133858267716,
+      "grad_norm": 118.2337646484375,
+      "learning_rate": 9.997341732283465e-06,
+      "loss": 0.1975,
+      "step": 1692
+    },
+    {
+      "epoch": 0.5332283464566929,
+      "grad_norm": 46.612266540527344,
+      "learning_rate": 9.997340157480316e-06,
+      "loss": 0.1823,
+      "step": 1693
+    },
+    {
+      "epoch": 0.5335433070866141,
+      "grad_norm": 93.66093444824219,
+      "learning_rate": 9.997338582677167e-06,
+      "loss": 1.0047,
+      "step": 1694
+    },
+    {
+      "epoch": 0.5338582677165354,
+      "grad_norm": 42.73204040527344,
+      "learning_rate": 9.997337007874016e-06,
+      "loss": 0.1202,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5341732283464566,
+      "grad_norm": 116.17491912841797,
+      "learning_rate": 9.997335433070867e-06,
+      "loss": 0.5415,
+      "step": 1696
+    },
+    {
+      "epoch": 0.534488188976378,
+      "grad_norm": 38.49724197387695,
+      "learning_rate": 9.997333858267716e-06,
+      "loss": 0.7654,
+      "step": 1697
+    },
+    {
+      "epoch": 0.5348031496062993,
+      "grad_norm": 76.99092864990234,
+      "learning_rate": 9.997332283464567e-06,
+      "loss": 0.3025,
+      "step": 1698
+    },
+    {
+      "epoch": 0.5351181102362205,
+      "grad_norm": 102.11122131347656,
+      "learning_rate": 9.997330708661418e-06,
+      "loss": 0.6456,
+      "step": 1699
+    },
+    {
+      "epoch": 0.5354330708661418,
+      "grad_norm": 140.7232666015625,
+      "learning_rate": 9.99732913385827e-06,
+      "loss": 0.4218,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5354330708661418,
+      "eval_loss": 0.5890030860900879,
+      "eval_runtime": 338.8108,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1700
+    },
+    {
+      "epoch": 0.535748031496063,
+      "grad_norm": 111.65464782714844,
+      "learning_rate": 9.997327559055119e-06,
+      "loss": 1.0181,
+      "step": 1701
+    },
+    {
+      "epoch": 0.5360629921259843,
+      "grad_norm": 37.92324447631836,
+      "learning_rate": 9.99732598425197e-06,
+      "loss": 0.7354,
+      "step": 1702
+    },
+    {
+      "epoch": 0.5363779527559055,
+      "grad_norm": 42.922149658203125,
+      "learning_rate": 9.997324409448819e-06,
+      "loss": 0.2447,
+      "step": 1703
+    },
+    {
+      "epoch": 0.5366929133858268,
+      "grad_norm": 45.183082580566406,
+      "learning_rate": 9.99732283464567e-06,
+      "loss": 0.337,
+      "step": 1704
+    },
+    {
+      "epoch": 0.537007874015748,
+      "grad_norm": 39.247962951660156,
+      "learning_rate": 9.997321259842521e-06,
+      "loss": 0.3811,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5373228346456693,
+      "grad_norm": 128.93247985839844,
+      "learning_rate": 9.99731968503937e-06,
+      "loss": 0.5403,
+      "step": 1706
+    },
+    {
+      "epoch": 0.5376377952755905,
+      "grad_norm": 107.43035888671875,
+      "learning_rate": 9.997318110236221e-06,
+      "loss": 0.3911,
+      "step": 1707
+    },
+    {
+      "epoch": 0.5379527559055118,
+      "grad_norm": 43.16225051879883,
+      "learning_rate": 9.997316535433072e-06,
+      "loss": 0.2933,
+      "step": 1708
+    },
+    {
+      "epoch": 0.538267716535433,
+      "grad_norm": 87.62896728515625,
+      "learning_rate": 9.997314960629923e-06,
+      "loss": 0.4866,
+      "step": 1709
+    },
+    {
+      "epoch": 0.5385826771653544,
+      "grad_norm": 40.5244255065918,
+      "learning_rate": 9.997313385826772e-06,
+      "loss": 0.382,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5388976377952756,
+      "grad_norm": 34.54233932495117,
+      "learning_rate": 9.997311811023623e-06,
+      "loss": 0.2301,
+      "step": 1711
+    },
+    {
+      "epoch": 0.5392125984251969,
+      "grad_norm": 95.98748016357422,
+      "learning_rate": 9.997310236220473e-06,
+      "loss": 0.8653,
+      "step": 1712
+    },
+    {
+      "epoch": 0.5395275590551181,
+      "grad_norm": 86.79911041259766,
+      "learning_rate": 9.997308661417324e-06,
+      "loss": 0.4673,
+      "step": 1713
+    },
+    {
+      "epoch": 0.5398425196850394,
+      "grad_norm": 13.791953086853027,
+      "learning_rate": 9.997307086614175e-06,
+      "loss": 0.0652,
+      "step": 1714
+    },
+    {
+      "epoch": 0.5401574803149606,
+      "grad_norm": 16.453601837158203,
+      "learning_rate": 9.997305511811024e-06,
+      "loss": 0.103,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5404724409448819,
+      "grad_norm": 40.83580780029297,
+      "learning_rate": 9.997303937007875e-06,
+      "loss": 0.4618,
+      "step": 1716
+    },
+    {
+      "epoch": 0.5407874015748031,
+      "grad_norm": 57.49140548706055,
+      "learning_rate": 9.997302362204724e-06,
+      "loss": 0.2785,
+      "step": 1717
+    },
+    {
+      "epoch": 0.5411023622047244,
+      "grad_norm": 56.28849792480469,
+      "learning_rate": 9.997300787401575e-06,
+      "loss": 0.6193,
+      "step": 1718
+    },
+    {
+      "epoch": 0.5414173228346457,
+      "grad_norm": 37.70257568359375,
+      "learning_rate": 9.997299212598426e-06,
+      "loss": 0.2423,
+      "step": 1719
+    },
+    {
+      "epoch": 0.5417322834645669,
+      "grad_norm": 60.47585678100586,
+      "learning_rate": 9.997297637795277e-06,
+      "loss": 0.3203,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5417322834645669,
+      "eval_loss": 0.4833095669746399,
+      "eval_runtime": 339.1777,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5420472440944882,
+      "grad_norm": 45.25422286987305,
+      "learning_rate": 9.997296062992127e-06,
+      "loss": 0.4233,
+      "step": 1721
+    },
+    {
+      "epoch": 0.5423622047244094,
+      "grad_norm": 21.047258377075195,
+      "learning_rate": 9.997294488188978e-06,
+      "loss": 0.1222,
+      "step": 1722
+    },
+    {
+      "epoch": 0.5426771653543307,
+      "grad_norm": 41.04436492919922,
+      "learning_rate": 9.997292913385827e-06,
+      "loss": 0.1144,
+      "step": 1723
+    },
+    {
+      "epoch": 0.542992125984252,
+      "grad_norm": 9.362834930419922,
+      "learning_rate": 9.997291338582678e-06,
+      "loss": 0.0498,
+      "step": 1724
+    },
+    {
+      "epoch": 0.5433070866141733,
+      "grad_norm": 50.78684997558594,
+      "learning_rate": 9.997289763779529e-06,
+      "loss": 0.6966,
+      "step": 1725
+    },
+    {
+      "epoch": 0.5436220472440945,
+      "grad_norm": 33.23752212524414,
+      "learning_rate": 9.997288188976378e-06,
+      "loss": 0.1696,
+      "step": 1726
+    },
+    {
+      "epoch": 0.5439370078740158,
+      "grad_norm": 29.13566780090332,
+      "learning_rate": 9.99728661417323e-06,
+      "loss": 0.511,
+      "step": 1727
+    },
+    {
+      "epoch": 0.544251968503937,
+      "grad_norm": 36.36374282836914,
+      "learning_rate": 9.99728503937008e-06,
+      "loss": 0.3232,
+      "step": 1728
+    },
+    {
+      "epoch": 0.5445669291338583,
+      "grad_norm": 9.268793106079102,
+      "learning_rate": 9.997283464566931e-06,
+      "loss": 0.0484,
+      "step": 1729
+    },
+    {
+      "epoch": 0.5448818897637795,
+      "grad_norm": 10.723762512207031,
+      "learning_rate": 9.99728188976378e-06,
+      "loss": 0.0748,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5451968503937008,
+      "grad_norm": 42.956424713134766,
+      "learning_rate": 9.997280314960631e-06,
+      "loss": 0.2678,
+      "step": 1731
+    },
+    {
+      "epoch": 0.545511811023622,
+      "grad_norm": 33.43272018432617,
+      "learning_rate": 9.99727874015748e-06,
+      "loss": 0.4772,
+      "step": 1732
+    },
+    {
+      "epoch": 0.5458267716535433,
+      "grad_norm": 59.14657974243164,
+      "learning_rate": 9.997277165354332e-06,
+      "loss": 0.2235,
+      "step": 1733
+    },
+    {
+      "epoch": 0.5461417322834645,
+      "grad_norm": 29.12813377380371,
+      "learning_rate": 9.997275590551183e-06,
+      "loss": 0.0931,
+      "step": 1734
+    },
+    {
+      "epoch": 0.5464566929133858,
+      "grad_norm": 17.903165817260742,
+      "learning_rate": 9.997274015748032e-06,
+      "loss": 0.4091,
+      "step": 1735
+    },
+    {
+      "epoch": 0.546771653543307,
+      "grad_norm": 72.72879028320312,
+      "learning_rate": 9.997272440944883e-06,
+      "loss": 0.3515,
+      "step": 1736
+    },
+    {
+      "epoch": 0.5470866141732283,
+      "grad_norm": 42.820655822753906,
+      "learning_rate": 9.997270866141732e-06,
+      "loss": 0.8093,
+      "step": 1737
+    },
+    {
+      "epoch": 0.5474015748031497,
+      "grad_norm": 34.83147048950195,
+      "learning_rate": 9.997269291338583e-06,
+      "loss": 0.0899,
+      "step": 1738
+    },
+    {
+      "epoch": 0.5477165354330709,
+      "grad_norm": 165.32769775390625,
+      "learning_rate": 9.997267716535434e-06,
+      "loss": 1.2147,
+      "step": 1739
+    },
+    {
+      "epoch": 0.5480314960629922,
+      "grad_norm": 54.89258575439453,
+      "learning_rate": 9.997266141732285e-06,
+      "loss": 0.3279,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5480314960629922,
+      "eval_loss": 0.5086050033569336,
+      "eval_runtime": 340.7446,
+      "eval_samples_per_second": 0.343,
+      "eval_steps_per_second": 0.343,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5483464566929134,
+      "grad_norm": 19.018238067626953,
+      "learning_rate": 9.997264566929135e-06,
+      "loss": 0.0628,
+      "step": 1741
+    },
+    {
+      "epoch": 0.5486614173228347,
+      "grad_norm": 44.377803802490234,
+      "learning_rate": 9.997262992125986e-06,
+      "loss": 0.1875,
+      "step": 1742
+    },
+    {
+      "epoch": 0.5489763779527559,
+      "grad_norm": 51.6882209777832,
+      "learning_rate": 9.997261417322835e-06,
+      "loss": 0.5064,
+      "step": 1743
+    },
+    {
+      "epoch": 0.5492913385826772,
+      "grad_norm": 6.151045799255371,
+      "learning_rate": 9.997259842519686e-06,
+      "loss": 0.0298,
+      "step": 1744
+    },
+    {
+      "epoch": 0.5496062992125984,
+      "grad_norm": 76.562255859375,
+      "learning_rate": 9.997258267716537e-06,
+      "loss": 0.6654,
+      "step": 1745
+    },
+    {
+      "epoch": 0.5499212598425197,
+      "grad_norm": 48.19951629638672,
+      "learning_rate": 9.997256692913386e-06,
+      "loss": 0.4798,
+      "step": 1746
+    },
+    {
+      "epoch": 0.5502362204724409,
+      "grad_norm": 68.81988525390625,
+      "learning_rate": 9.997255118110237e-06,
+      "loss": 0.4747,
+      "step": 1747
+    },
+    {
+      "epoch": 0.5505511811023622,
+      "grad_norm": 24.68579864501953,
+      "learning_rate": 9.997253543307086e-06,
+      "loss": 0.1881,
+      "step": 1748
+    },
+    {
+      "epoch": 0.5508661417322834,
+      "grad_norm": 21.575008392333984,
+      "learning_rate": 9.997251968503937e-06,
+      "loss": 0.3208,
+      "step": 1749
+    },
+    {
+      "epoch": 0.5511811023622047,
+      "grad_norm": 45.560203552246094,
+      "learning_rate": 9.997250393700788e-06,
+      "loss": 0.7353,
+      "step": 1750
+    },
+    {
+      "epoch": 0.551496062992126,
+      "grad_norm": 11.304056167602539,
+      "learning_rate": 9.99724881889764e-06,
+      "loss": 0.0611,
+      "step": 1751
+    },
+    {
+      "epoch": 0.5518110236220473,
+      "grad_norm": 53.74604797363281,
+      "learning_rate": 9.997247244094489e-06,
+      "loss": 0.3013,
+      "step": 1752
+    },
+    {
+      "epoch": 0.5521259842519685,
+      "grad_norm": 40.34061050415039,
+      "learning_rate": 9.99724566929134e-06,
+      "loss": 0.9412,
+      "step": 1753
+    },
+    {
+      "epoch": 0.5524409448818898,
+      "grad_norm": 38.01073455810547,
+      "learning_rate": 9.99724409448819e-06,
+      "loss": 0.1551,
+      "step": 1754
+    },
+    {
+      "epoch": 0.552755905511811,
+      "grad_norm": 64.29601287841797,
+      "learning_rate": 9.99724251968504e-06,
+      "loss": 1.8098,
+      "step": 1755
+    },
+    {
+      "epoch": 0.5530708661417323,
+      "grad_norm": 50.7859001159668,
+      "learning_rate": 9.997240944881891e-06,
+      "loss": 0.3809,
+      "step": 1756
+    },
+    {
+      "epoch": 0.5533858267716535,
+      "grad_norm": 23.059926986694336,
+      "learning_rate": 9.99723937007874e-06,
+      "loss": 0.1878,
+      "step": 1757
+    },
+    {
+      "epoch": 0.5537007874015748,
+      "grad_norm": 15.40706729888916,
+      "learning_rate": 9.997237795275591e-06,
+      "loss": 0.2192,
+      "step": 1758
+    },
+    {
+      "epoch": 0.5540157480314961,
+      "grad_norm": 50.74030685424805,
+      "learning_rate": 9.997236220472442e-06,
+      "loss": 0.8022,
+      "step": 1759
+    },
+    {
+      "epoch": 0.5543307086614173,
+      "grad_norm": 49.88266372680664,
+      "learning_rate": 9.997234645669293e-06,
+      "loss": 0.3615,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5543307086614173,
+      "eval_loss": 0.4197517931461334,
+      "eval_runtime": 338.0619,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5546456692913386,
+      "grad_norm": 33.29900360107422,
+      "learning_rate": 9.997233070866143e-06,
+      "loss": 0.4475,
+      "step": 1761
+    },
+    {
+      "epoch": 0.5549606299212598,
+      "grad_norm": 57.282649993896484,
+      "learning_rate": 9.997231496062994e-06,
+      "loss": 0.5688,
+      "step": 1762
+    },
+    {
+      "epoch": 0.5552755905511811,
+      "grad_norm": 16.435340881347656,
+      "learning_rate": 9.997229921259843e-06,
+      "loss": 0.2603,
+      "step": 1763
+    },
+    {
+      "epoch": 0.5555905511811023,
+      "grad_norm": 45.144737243652344,
+      "learning_rate": 9.997228346456694e-06,
+      "loss": 0.6854,
+      "step": 1764
+    },
+    {
+      "epoch": 0.5559055118110237,
+      "grad_norm": 42.861515045166016,
+      "learning_rate": 9.997226771653545e-06,
+      "loss": 0.6296,
+      "step": 1765
+    },
+    {
+      "epoch": 0.5562204724409449,
+      "grad_norm": 48.16584777832031,
+      "learning_rate": 9.997225196850394e-06,
+      "loss": 0.4873,
+      "step": 1766
+    },
+    {
+      "epoch": 0.5565354330708662,
+      "grad_norm": 61.42652893066406,
+      "learning_rate": 9.997223622047245e-06,
+      "loss": 0.7604,
+      "step": 1767
+    },
+    {
+      "epoch": 0.5568503937007874,
+      "grad_norm": 24.190889358520508,
+      "learning_rate": 9.997222047244094e-06,
+      "loss": 0.3546,
+      "step": 1768
+    },
+    {
+      "epoch": 0.5571653543307087,
+      "grad_norm": 19.951372146606445,
+      "learning_rate": 9.997220472440945e-06,
+      "loss": 0.4902,
+      "step": 1769
+    },
+    {
+      "epoch": 0.5574803149606299,
+      "grad_norm": 24.672101974487305,
+      "learning_rate": 9.997218897637796e-06,
+      "loss": 0.3603,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5577952755905512,
+      "grad_norm": 16.915287017822266,
+      "learning_rate": 9.997217322834647e-06,
+      "loss": 0.1767,
+      "step": 1771
+    },
+    {
+      "epoch": 0.5581102362204724,
+      "grad_norm": 28.743501663208008,
+      "learning_rate": 9.997215748031497e-06,
+      "loss": 0.3357,
+      "step": 1772
+    },
+    {
+      "epoch": 0.5584251968503937,
+      "grad_norm": 48.64558029174805,
+      "learning_rate": 9.997214173228346e-06,
+      "loss": 0.3531,
+      "step": 1773
+    },
+    {
+      "epoch": 0.5587401574803149,
+      "grad_norm": 18.73015022277832,
+      "learning_rate": 9.997212598425197e-06,
+      "loss": 0.2041,
+      "step": 1774
+    },
+    {
+      "epoch": 0.5590551181102362,
+      "grad_norm": 27.524059295654297,
+      "learning_rate": 9.997211023622048e-06,
+      "loss": 0.2056,
+      "step": 1775
+    },
+    {
+      "epoch": 0.5593700787401574,
+      "grad_norm": 15.509268760681152,
+      "learning_rate": 9.997209448818899e-06,
+      "loss": 0.1131,
+      "step": 1776
+    },
+    {
+      "epoch": 0.5596850393700787,
+      "grad_norm": 39.2099609375,
+      "learning_rate": 9.997207874015748e-06,
+      "loss": 0.2837,
+      "step": 1777
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 41.994815826416016,
+      "learning_rate": 9.9972062992126e-06,
+      "loss": 0.3212,
+      "step": 1778
+    },
+    {
+      "epoch": 0.5603149606299213,
+      "grad_norm": 50.88380432128906,
+      "learning_rate": 9.99720472440945e-06,
+      "loss": 0.476,
+      "step": 1779
+    },
+    {
+      "epoch": 0.5606299212598426,
+      "grad_norm": 23.539987564086914,
+      "learning_rate": 9.997203149606301e-06,
+      "loss": 0.1563,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5606299212598426,
+      "eval_loss": 0.5017465353012085,
+      "eval_runtime": 339.1116,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5609448818897638,
+      "grad_norm": 48.950191497802734,
+      "learning_rate": 9.99720157480315e-06,
+      "loss": 0.7028,
+      "step": 1781
+    },
+    {
+      "epoch": 0.5612598425196851,
+      "grad_norm": 15.981232643127441,
+      "learning_rate": 9.997200000000001e-06,
+      "loss": 0.0955,
+      "step": 1782
+    },
+    {
+      "epoch": 0.5615748031496063,
+      "grad_norm": 53.745574951171875,
+      "learning_rate": 9.99719842519685e-06,
+      "loss": 0.5677,
+      "step": 1783
+    },
+    {
+      "epoch": 0.5618897637795276,
+      "grad_norm": 57.129024505615234,
+      "learning_rate": 9.997196850393702e-06,
+      "loss": 0.5015,
+      "step": 1784
+    },
+    {
+      "epoch": 0.5622047244094488,
+      "grad_norm": 25.303375244140625,
+      "learning_rate": 9.997195275590553e-06,
+      "loss": 0.3141,
+      "step": 1785
+    },
+    {
+      "epoch": 0.5625196850393701,
+      "grad_norm": 29.012252807617188,
+      "learning_rate": 9.997193700787402e-06,
+      "loss": 0.3536,
+      "step": 1786
+    },
+    {
+      "epoch": 0.5628346456692913,
+      "grad_norm": 10.699189186096191,
+      "learning_rate": 9.997192125984253e-06,
+      "loss": 0.0627,
+      "step": 1787
+    },
+    {
+      "epoch": 0.5631496062992126,
+      "grad_norm": 8.34593391418457,
+      "learning_rate": 9.997190551181102e-06,
+      "loss": 0.0391,
+      "step": 1788
+    },
+    {
+      "epoch": 0.5634645669291338,
+      "grad_norm": 42.121437072753906,
+      "learning_rate": 9.997188976377953e-06,
+      "loss": 0.8368,
+      "step": 1789
+    },
+    {
+      "epoch": 0.5637795275590551,
+      "grad_norm": 35.72675704956055,
+      "learning_rate": 9.997187401574804e-06,
+      "loss": 0.5532,
+      "step": 1790
+    },
+    {
+      "epoch": 0.5640944881889763,
+      "grad_norm": 45.88935470581055,
+      "learning_rate": 9.997185826771655e-06,
+      "loss": 0.1764,
+      "step": 1791
+    },
+    {
+      "epoch": 0.5644094488188977,
+      "grad_norm": 32.38536071777344,
+      "learning_rate": 9.997184251968505e-06,
+      "loss": 0.1779,
+      "step": 1792
+    },
+    {
+      "epoch": 0.5647244094488189,
+      "grad_norm": 72.07355499267578,
+      "learning_rate": 9.997182677165354e-06,
+      "loss": 0.3102,
+      "step": 1793
+    },
+    {
+      "epoch": 0.5650393700787402,
+      "grad_norm": 82.99500274658203,
+      "learning_rate": 9.997181102362205e-06,
+      "loss": 0.6726,
+      "step": 1794
+    },
+    {
+      "epoch": 0.5653543307086614,
+      "grad_norm": 80.94422149658203,
+      "learning_rate": 9.997179527559056e-06,
+      "loss": 0.7582,
+      "step": 1795
+    },
+    {
+      "epoch": 0.5656692913385827,
+      "grad_norm": 4.303206443786621,
+      "learning_rate": 9.997177952755907e-06,
+      "loss": 0.0169,
+      "step": 1796
+    },
+    {
+      "epoch": 0.5659842519685039,
+      "grad_norm": 38.44367980957031,
+      "learning_rate": 9.997176377952756e-06,
+      "loss": 0.5267,
+      "step": 1797
+    },
+    {
+      "epoch": 0.5662992125984252,
+      "grad_norm": 48.196807861328125,
+      "learning_rate": 9.997174803149607e-06,
+      "loss": 0.6009,
+      "step": 1798
+    },
+    {
+      "epoch": 0.5666141732283465,
+      "grad_norm": 33.66616439819336,
+      "learning_rate": 9.997173228346458e-06,
+      "loss": 0.5003,
+      "step": 1799
+    },
+    {
+      "epoch": 0.5669291338582677,
+      "grad_norm": 34.213218688964844,
+      "learning_rate": 9.997171653543309e-06,
+      "loss": 0.2974,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5669291338582677,
+      "eval_loss": 0.477321982383728,
+      "eval_runtime": 338.9113,
+      "eval_samples_per_second": 0.345,
+      "eval_steps_per_second": 0.345,
+      "step": 1800
+    },
+    {
+      "epoch": 0.567244094488189,
+      "grad_norm": 63.08566665649414,
+      "learning_rate": 9.997170078740158e-06,
+      "loss": 0.1859,
+      "step": 1801
+    },
+    {
+      "epoch": 0.5675590551181102,
+      "grad_norm": 61.78851318359375,
+      "learning_rate": 9.99716850393701e-06,
+      "loss": 0.9895,
+      "step": 1802
+    },
+    {
+      "epoch": 0.5678740157480315,
+      "grad_norm": 47.29201126098633,
+      "learning_rate": 9.997166929133859e-06,
+      "loss": 0.5153,
+      "step": 1803
+    },
+    {
+      "epoch": 0.5681889763779527,
+      "grad_norm": 33.21207046508789,
+      "learning_rate": 9.99716535433071e-06,
+      "loss": 0.3221,
+      "step": 1804
+    },
+    {
+      "epoch": 0.568503937007874,
+      "grad_norm": 20.20362663269043,
+      "learning_rate": 9.99716377952756e-06,
+      "loss": 0.0697,
+      "step": 1805
+    },
+    {
+      "epoch": 0.5688188976377953,
+      "grad_norm": 23.14605712890625,
+      "learning_rate": 9.99716220472441e-06,
+      "loss": 0.3096,
+      "step": 1806
+    },
+    {
+      "epoch": 0.5691338582677166,
+      "grad_norm": 58.990699768066406,
+      "learning_rate": 9.997160629921261e-06,
+      "loss": 0.6046,
+      "step": 1807
+    },
+    {
+      "epoch": 0.5694488188976378,
+      "grad_norm": 36.48517608642578,
+      "learning_rate": 9.99715905511811e-06,
+      "loss": 0.2886,
+      "step": 1808
+    },
+    {
+      "epoch": 0.5697637795275591,
+      "grad_norm": 56.9811897277832,
+      "learning_rate": 9.997157480314961e-06,
+      "loss": 0.4658,
+      "step": 1809
+    },
+    {
+      "epoch": 0.5700787401574803,
+      "grad_norm": 40.79648208618164,
+      "learning_rate": 9.997155905511812e-06,
+      "loss": 0.3177,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5703937007874016,
+      "grad_norm": 31.444948196411133,
+      "learning_rate": 9.997154330708663e-06,
+      "loss": 0.3236,
+      "step": 1811
+    },
+    {
+      "epoch": 0.5707086614173228,
+      "grad_norm": 21.288089752197266,
+      "learning_rate": 9.997152755905513e-06,
+      "loss": 0.4159,
+      "step": 1812
+    },
+    {
+      "epoch": 0.5710236220472441,
+      "grad_norm": 58.0438117980957,
+      "learning_rate": 9.997151181102362e-06,
+      "loss": 0.6176,
+      "step": 1813
+    },
+    {
+      "epoch": 0.5713385826771653,
+      "grad_norm": 19.920330047607422,
+      "learning_rate": 9.997149606299213e-06,
+      "loss": 0.305,
+      "step": 1814
+    },
+    {
+      "epoch": 0.5716535433070866,
+      "grad_norm": 35.30345916748047,
+      "learning_rate": 9.997148031496064e-06,
+      "loss": 0.4661,
+      "step": 1815
+    },
+    {
+      "epoch": 0.5719685039370078,
+      "grad_norm": 16.50104522705078,
+      "learning_rate": 9.997146456692915e-06,
+      "loss": 0.2696,
+      "step": 1816
+    },
+    {
+      "epoch": 0.5722834645669291,
+      "grad_norm": 40.40153121948242,
+      "learning_rate": 9.997144881889764e-06,
+      "loss": 0.5205,
+      "step": 1817
+    },
+    {
+      "epoch": 0.5725984251968504,
+      "grad_norm": 12.835118293762207,
+      "learning_rate": 9.997143307086615e-06,
+      "loss": 0.1722,
+      "step": 1818
+    },
+    {
+      "epoch": 0.5729133858267716,
+      "grad_norm": 40.450950622558594,
+      "learning_rate": 9.997141732283464e-06,
+      "loss": 0.3829,
+      "step": 1819
+    },
+    {
+      "epoch": 0.573228346456693,
+      "grad_norm": 20.867347717285156,
+      "learning_rate": 9.997140157480315e-06,
+      "loss": 0.4121,
+      "step": 1820
+    },
+    {
+      "epoch": 0.573228346456693,
+      "eval_loss": 0.40022820234298706,
+      "eval_runtime": 337.3982,
+      "eval_samples_per_second": 0.347,
+      "eval_steps_per_second": 0.347,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5735433070866142,
+      "grad_norm": 27.65648651123047,
+      "learning_rate": 9.997138582677166e-06,
+      "loss": 0.1822,
+      "step": 1821
+    },
+    {
+      "epoch": 0.5738582677165355,
+      "grad_norm": 57.512733459472656,
+      "learning_rate": 9.997137007874017e-06,
+      "loss": 0.8702,
+      "step": 1822
+    },
+    {
+      "epoch": 0.5741732283464567,
+      "grad_norm": 13.143925666809082,
+      "learning_rate": 9.997135433070867e-06,
+      "loss": 0.183,
+      "step": 1823
+    },
+    {
+      "epoch": 0.574488188976378,
+      "grad_norm": 49.15605926513672,
+      "learning_rate": 9.997133858267718e-06,
+      "loss": 0.3416,
+      "step": 1824
+    },
+    {
+      "epoch": 0.5748031496062992,
+      "grad_norm": 12.571438789367676,
+      "learning_rate": 9.997132283464569e-06,
+      "loss": 0.1638,
+      "step": 1825
+    },
+    {
+      "epoch": 0.5751181102362205,
+      "grad_norm": 66.83301544189453,
+      "learning_rate": 9.997130708661418e-06,
+      "loss": 0.6822,
+      "step": 1826
+    },
+    {
+      "epoch": 0.5754330708661417,
+      "grad_norm": 22.997888565063477,
+      "learning_rate": 9.997129133858269e-06,
+      "loss": 0.2096,
+      "step": 1827
+    },
+    {
+      "epoch": 0.575748031496063,
+      "grad_norm": 13.75336742401123,
+      "learning_rate": 9.997127559055118e-06,
+      "loss": 0.0664,
+      "step": 1828
+    },
+    {
+      "epoch": 0.5760629921259842,
+      "grad_norm": 60.55500411987305,
+      "learning_rate": 9.99712598425197e-06,
+      "loss": 0.8159,
+      "step": 1829
+    },
+    {
+      "epoch": 0.5763779527559055,
+      "grad_norm": 35.01063537597656,
+      "learning_rate": 9.99712440944882e-06,
+      "loss": 0.3943,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5766929133858267,
+      "grad_norm": 31.06571388244629,
+      "learning_rate": 9.997122834645671e-06,
+      "loss": 0.4897,
+      "step": 1831
+    },
+    {
+      "epoch": 0.577007874015748,
+      "grad_norm": 78.39657592773438,
+      "learning_rate": 9.99712125984252e-06,
+      "loss": 0.5343,
+      "step": 1832
+    },
+    {
+      "epoch": 0.5773228346456692,
+      "grad_norm": 46.089942932128906,
+      "learning_rate": 9.99711968503937e-06,
+      "loss": 0.5686,
+      "step": 1833
+    },
+    {
+      "epoch": 0.5776377952755906,
+      "grad_norm": 37.687374114990234,
+      "learning_rate": 9.99711811023622e-06,
+      "loss": 0.6468,
+      "step": 1834
+    },
+    {
+      "epoch": 0.5779527559055118,
+      "grad_norm": 22.699844360351562,
+      "learning_rate": 9.997116535433072e-06,
+      "loss": 0.3065,
+      "step": 1835
+    },
+    {
+      "epoch": 0.5782677165354331,
+      "grad_norm": 89.35218048095703,
+      "learning_rate": 9.997114960629923e-06,
+      "loss": 0.4874,
+      "step": 1836
+    },
+    {
+      "epoch": 0.5785826771653543,
+      "grad_norm": 95.06536102294922,
+      "learning_rate": 9.997113385826772e-06,
+      "loss": 0.4179,
+      "step": 1837
+    },
+    {
+      "epoch": 0.5788976377952756,
+      "grad_norm": 44.81085205078125,
+      "learning_rate": 9.997111811023623e-06,
+      "loss": 0.3927,
+      "step": 1838
+    },
+    {
+      "epoch": 0.5792125984251969,
+      "grad_norm": 49.45285415649414,
+      "learning_rate": 9.997110236220472e-06,
+      "loss": 0.5742,
+      "step": 1839
+    },
+    {
+      "epoch": 0.5795275590551181,
+      "grad_norm": 35.893402099609375,
+      "learning_rate": 9.997108661417323e-06,
+      "loss": 0.4282,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5795275590551181,
+      "eval_loss": 0.43516087532043457,
+      "eval_runtime": 337.9832,
+      "eval_samples_per_second": 0.346,
+      "eval_steps_per_second": 0.346,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5798425196850394,
+      "grad_norm": 42.341896057128906,
+      "learning_rate": 9.997107086614174e-06,
+      "loss": 0.2899,
+      "step": 1841
+    },
+    {
+      "epoch": 0.5801574803149606,
+      "grad_norm": 28.99087142944336,
+      "learning_rate": 9.997105511811025e-06,
+      "loss": 0.1677,
+      "step": 1842
+    },
+    {
+      "epoch": 0.5804724409448819,
+      "grad_norm": 13.114380836486816,
+      "learning_rate": 9.997103937007875e-06,
+      "loss": 0.063,
+      "step": 1843
+    },
+    {
+      "epoch": 0.5807874015748031,
+      "grad_norm": 28.455913543701172,
+      "learning_rate": 9.997102362204724e-06,
+      "loss": 0.3202,
+      "step": 1844
+    },
+    {
+      "epoch": 0.5811023622047244,
+      "grad_norm": 65.52840423583984,
+      "learning_rate": 9.997100787401577e-06,
+      "loss": 0.5528,
+      "step": 1845
+    },
+    {
+      "epoch": 0.5814173228346456,
+      "grad_norm": 14.398350715637207,
+      "learning_rate": 9.997099212598426e-06,
+      "loss": 0.0874,
+      "step": 1846
+    },
+    {
+      "epoch": 0.581732283464567,
+      "grad_norm": 46.19868850708008,
+      "learning_rate": 9.997097637795277e-06,
+      "loss": 0.2662,
+      "step": 1847
+    },
+    {
+      "epoch": 0.5820472440944882,
+      "grad_norm": 13.150195121765137,
+      "learning_rate": 9.997096062992126e-06,
+      "loss": 0.1102,
+      "step": 1848
+    },
+    {
+      "epoch": 0.5823622047244095,
+      "grad_norm": 19.446687698364258,
+      "learning_rate": 9.997094488188977e-06,
+      "loss": 0.1399,
+      "step": 1849
+    },
+    {
+      "epoch": 0.5826771653543307,
+      "grad_norm": 112.38624572753906,
+      "learning_rate": 9.997092913385828e-06,
+      "loss": 0.3521,
+      "step": 1850
+    },
+    {
+      "epoch": 0.582992125984252,
+      "grad_norm": 14.854012489318848,
+      "learning_rate": 9.997091338582679e-06,
+      "loss": 0.1212,
+      "step": 1851
+    },
+    {
+      "epoch": 0.5833070866141732,
+      "grad_norm": 58.18556213378906,
+      "learning_rate": 9.997089763779528e-06,
+      "loss": 0.4609,
+      "step": 1852
+    },
+    {
+      "epoch": 0.5836220472440945,
+      "grad_norm": 8.105414390563965,
+      "learning_rate": 9.997088188976378e-06,
+      "loss": 0.047,
+      "step": 1853
+    },
+    {
+      "epoch": 0.5839370078740157,
+      "grad_norm": 78.19012451171875,
+      "learning_rate": 9.997086614173229e-06,
+      "loss": 0.3938,
+      "step": 1854
+    },
+    {
+      "epoch": 0.584251968503937,
+      "grad_norm": 60.095645904541016,
+      "learning_rate": 9.99708503937008e-06,
+      "loss": 1.0973,
+      "step": 1855
+    },
+    {
+      "epoch": 0.5845669291338582,
+      "grad_norm": 27.53265953063965,
+      "learning_rate": 9.99708346456693e-06,
+      "loss": 0.3794,
+      "step": 1856
+    },
+    {
+      "epoch": 0.5848818897637795,
+      "grad_norm": 90.6170654296875,
+      "learning_rate": 9.99708188976378e-06,
+      "loss": 0.7531,
+      "step": 1857
+    },
+    {
+      "epoch": 0.5851968503937008,
+      "grad_norm": 82.42227935791016,
+      "learning_rate": 9.997080314960631e-06,
+      "loss": 0.8938,
+      "step": 1858
+    },
+    {
+      "epoch": 0.585511811023622,
+      "grad_norm": 37.9282112121582,
+      "learning_rate": 9.99707874015748e-06,
+      "loss": 0.2363,
+      "step": 1859
+    },
+    {
+      "epoch": 0.5858267716535434,
+      "grad_norm": 17.56612777709961,
+      "learning_rate": 9.997077165354331e-06,
+      "loss": 0.3155,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5858267716535434,
+      "eval_loss": 0.5003868937492371,
+      "eval_runtime": 368.9458,
+      "eval_samples_per_second": 0.317,
+      "eval_steps_per_second": 0.317,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5861417322834646,
+      "grad_norm": 40.288368225097656,
+      "learning_rate": 9.997075590551182e-06,
+      "loss": 0.3474,
+      "step": 1861
+    },
+    {
+      "epoch": 0.5864566929133859,
+      "grad_norm": 29.836639404296875,
+      "learning_rate": 9.997074015748033e-06,
+      "loss": 0.3913,
+      "step": 1862
+    },
+    {
+      "epoch": 0.5867716535433071,
+      "grad_norm": 32.93700408935547,
+      "learning_rate": 9.997072440944883e-06,
+      "loss": 0.2077,
+      "step": 1863
+    },
+    {
+      "epoch": 0.5870866141732284,
+      "grad_norm": 44.143409729003906,
+      "learning_rate": 9.997070866141732e-06,
+      "loss": 0.359,
+      "step": 1864
+    },
+    {
+      "epoch": 0.5874015748031496,
+      "grad_norm": 18.514257431030273,
+      "learning_rate": 9.997069291338583e-06,
+      "loss": 0.2217,
+      "step": 1865
+    },
+    {
+      "epoch": 0.5877165354330709,
+      "grad_norm": 72.15349578857422,
+      "learning_rate": 9.997067716535434e-06,
+      "loss": 0.6858,
+      "step": 1866
+    },
+    {
+      "epoch": 0.5880314960629921,
+      "grad_norm": 53.466983795166016,
+      "learning_rate": 9.997066141732285e-06,
+      "loss": 0.4746,
+      "step": 1867
+    },
+    {
+      "epoch": 0.5883464566929134,
+      "grad_norm": 22.331130981445312,
+      "learning_rate": 9.997064566929134e-06,
+      "loss": 0.2087,
+      "step": 1868
+    },
+    {
+      "epoch": 0.5886614173228346,
+      "grad_norm": 49.12759017944336,
+      "learning_rate": 9.997062992125985e-06,
+      "loss": 0.8626,
+      "step": 1869
+    },
+    {
+      "epoch": 0.5889763779527559,
+      "grad_norm": 68.28533935546875,
+      "learning_rate": 9.997061417322836e-06,
+      "loss": 0.9698,
+      "step": 1870
+    },
+    {
+      "epoch": 0.5892913385826771,
+      "grad_norm": 61.11606979370117,
+      "learning_rate": 9.997059842519687e-06,
+      "loss": 0.6654,
+      "step": 1871
+    },
+    {
+      "epoch": 0.5896062992125984,
+      "grad_norm": 39.162593841552734,
+      "learning_rate": 9.997058267716536e-06,
+      "loss": 0.3398,
+      "step": 1872
+    },
+    {
+      "epoch": 0.5899212598425196,
+      "grad_norm": 34.68936538696289,
+      "learning_rate": 9.997056692913386e-06,
+      "loss": 0.4498,
+      "step": 1873
+    },
+    {
+      "epoch": 0.590236220472441,
+      "grad_norm": 31.508249282836914,
+      "learning_rate": 9.997055118110237e-06,
+      "loss": 0.5166,
+      "step": 1874
+    },
+    {
+      "epoch": 0.5905511811023622,
+      "grad_norm": 26.828365325927734,
+      "learning_rate": 9.997053543307088e-06,
+      "loss": 0.3993,
+      "step": 1875
+    },
+    {
+      "epoch": 0.5908661417322835,
+      "grad_norm": 31.714258193969727,
+      "learning_rate": 9.997051968503939e-06,
+      "loss": 0.4829,
+      "step": 1876
+    },
+    {
+      "epoch": 0.5911811023622047,
+      "grad_norm": 13.222908020019531,
+      "learning_rate": 9.997050393700788e-06,
+      "loss": 0.2407,
+      "step": 1877
+    },
+    {
+      "epoch": 0.591496062992126,
+      "grad_norm": 18.35155487060547,
+      "learning_rate": 9.997048818897639e-06,
+      "loss": 0.4279,
+      "step": 1878
+    },
+    {
+      "epoch": 0.5918110236220473,
+      "grad_norm": 11.358118057250977,
+      "learning_rate": 9.997047244094488e-06,
+      "loss": 0.208,
+      "step": 1879
+    },
+    {
+      "epoch": 0.5921259842519685,
+      "grad_norm": 26.622526168823242,
+      "learning_rate": 9.99704566929134e-06,
+      "loss": 0.4925,
+      "step": 1880
+    },
+    {
+      "epoch": 0.5921259842519685,
+      "eval_loss": 0.5159415602684021,
+      "eval_runtime": 351.9211,
+      "eval_samples_per_second": 0.332,
+      "eval_steps_per_second": 0.332,
+      "step": 1880
+    },
+    {
+      "epoch": 0.5924409448818898,
+      "grad_norm": 32.826812744140625,
+      "learning_rate": 9.99704409448819e-06,
+      "loss": 0.6125,
+      "step": 1881
+    },
+    {
+      "epoch": 0.592755905511811,
+      "grad_norm": 55.511314392089844,
+      "learning_rate": 9.997042519685041e-06,
+      "loss": 0.3093,
+      "step": 1882
+    },
+    {
+      "epoch": 0.5930708661417323,
+      "grad_norm": 35.73579788208008,
+      "learning_rate": 9.99704094488189e-06,
+      "loss": 0.7267,
+      "step": 1883
+    },
+    {
+      "epoch": 0.5933858267716535,
+      "grad_norm": 62.44449234008789,
+      "learning_rate": 9.99703937007874e-06,
+      "loss": 0.6222,
+      "step": 1884
+    },
+    {
+      "epoch": 0.5937007874015748,
+      "grad_norm": 33.10036087036133,
+      "learning_rate": 9.99703779527559e-06,
+      "loss": 0.9203,
+      "step": 1885
+    },
+    {
+      "epoch": 0.594015748031496,
+      "grad_norm": 12.248406410217285,
+      "learning_rate": 9.997036220472442e-06,
+      "loss": 0.1355,
+      "step": 1886
+    },
+    {
+      "epoch": 0.5943307086614174,
+      "grad_norm": 30.380659103393555,
+      "learning_rate": 9.997034645669293e-06,
+      "loss": 0.4853,
+      "step": 1887
+    },
+    {
+      "epoch": 0.5946456692913386,
+      "grad_norm": 32.66392135620117,
+      "learning_rate": 9.997033070866142e-06,
+      "loss": 0.6233,
+      "step": 1888
+    },
+    {
+      "epoch": 0.5949606299212599,
+      "grad_norm": 50.096702575683594,
+      "learning_rate": 9.997031496062993e-06,
+      "loss": 0.4143,
+      "step": 1889
+    },
+    {
+      "epoch": 0.5952755905511811,
+      "grad_norm": 10.48125171661377,
+      "learning_rate": 9.997029921259842e-06,
+      "loss": 0.1208,
+      "step": 1890
+    },
+    {
+      "epoch": 0.5955905511811024,
+      "grad_norm": 39.09800720214844,
+      "learning_rate": 9.997028346456693e-06,
+      "loss": 0.2144,
+      "step": 1891
+    },
+    {
+      "epoch": 0.5959055118110236,
+      "grad_norm": 18.518939971923828,
+      "learning_rate": 9.997026771653544e-06,
+      "loss": 0.3954,
+      "step": 1892
+    },
+    {
+      "epoch": 0.5962204724409449,
+      "grad_norm": 28.824628829956055,
+      "learning_rate": 9.997025196850394e-06,
+      "loss": 0.5161,
+      "step": 1893
+    },
+    {
+      "epoch": 0.5965354330708661,
+      "grad_norm": 36.205902099609375,
+      "learning_rate": 9.997023622047245e-06,
+      "loss": 0.3268,
+      "step": 1894
+    },
+    {
+      "epoch": 0.5968503937007874,
+      "grad_norm": 44.05073165893555,
+      "learning_rate": 9.997022047244096e-06,
+      "loss": 0.6844,
+      "step": 1895
+    },
+    {
+      "epoch": 0.5971653543307086,
+      "grad_norm": 32.883384704589844,
+      "learning_rate": 9.997020472440947e-06,
+      "loss": 0.2967,
+      "step": 1896
+    },
+    {
+      "epoch": 0.5974803149606299,
+      "grad_norm": 41.35356140136719,
+      "learning_rate": 9.997018897637796e-06,
+      "loss": 0.8327,
+      "step": 1897
+    },
+    {
+      "epoch": 0.5977952755905512,
+      "grad_norm": 30.476848602294922,
+      "learning_rate": 9.997017322834647e-06,
+      "loss": 0.384,
+      "step": 1898
+    },
+    {
+      "epoch": 0.5981102362204724,
+      "grad_norm": 20.232952117919922,
+      "learning_rate": 9.997015748031496e-06,
+      "loss": 0.4447,
+      "step": 1899
+    },
+    {
+      "epoch": 0.5984251968503937,
+      "grad_norm": 5.822659492492676,
+      "learning_rate": 9.997014173228347e-06,
+      "loss": 0.0435,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5984251968503937,
+      "eval_loss": 0.49179500341415405,
+      "eval_runtime": 348.9961,
+      "eval_samples_per_second": 0.335,
+      "eval_steps_per_second": 0.335,
+      "step": 1900
+    },
+    {
+      "epoch": 0.598740157480315,
+      "grad_norm": 18.793500900268555,
+      "learning_rate": 9.997012598425198e-06,
+      "loss": 0.2095,
+      "step": 1901
+    },
+    {
+      "epoch": 0.5990551181102363,
+      "grad_norm": 17.714536666870117,
+      "learning_rate": 9.99701102362205e-06,
+      "loss": 0.222,
+      "step": 1902
+    },
+    {
+      "epoch": 0.5993700787401575,
+      "grad_norm": 35.780208587646484,
+      "learning_rate": 9.997009448818898e-06,
+      "loss": 0.2701,
+      "step": 1903
+    },
+    {
+      "epoch": 0.5996850393700788,
+      "grad_norm": 53.45842361450195,
+      "learning_rate": 9.997007874015748e-06,
+      "loss": 0.8297,
+      "step": 1904
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 40.31769561767578,
+      "learning_rate": 9.997006299212599e-06,
+      "loss": 0.3894,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6003149606299213,
+      "grad_norm": 24.99709129333496,
+      "learning_rate": 9.99700472440945e-06,
+      "loss": 0.3682,
+      "step": 1906
+    },
+    {
+      "epoch": 0.6006299212598425,
+      "grad_norm": 52.774383544921875,
+      "learning_rate": 9.9970031496063e-06,
+      "loss": 0.3242,
+      "step": 1907
+    },
+    {
+      "epoch": 0.6009448818897638,
+      "grad_norm": 19.92904281616211,
+      "learning_rate": 9.99700157480315e-06,
+      "loss": 0.2621,
+      "step": 1908
+    },
+    {
+      "epoch": 0.601259842519685,
+      "grad_norm": 38.63066864013672,
+      "learning_rate": 9.997000000000001e-06,
+      "loss": 0.2645,
+      "step": 1909
+    },
+    {
+      "epoch": 0.6015748031496063,
+      "grad_norm": 20.248315811157227,
+      "learning_rate": 9.99699842519685e-06,
+      "loss": 0.0956,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6018897637795275,
+      "grad_norm": 162.4951934814453,
+      "learning_rate": 9.996996850393701e-06,
+      "loss": 0.4668,
+      "step": 1911
+    },
+    {
+      "epoch": 0.6022047244094488,
+      "grad_norm": 11.444823265075684,
+      "learning_rate": 9.996995275590552e-06,
+      "loss": 0.0933,
+      "step": 1912
+    },
+    {
+      "epoch": 0.60251968503937,
+      "grad_norm": 16.599746704101562,
+      "learning_rate": 9.996993700787403e-06,
+      "loss": 0.1043,
+      "step": 1913
+    },
+    {
+      "epoch": 0.6028346456692913,
+      "grad_norm": 66.4779281616211,
+      "learning_rate": 9.996992125984253e-06,
+      "loss": 0.818,
+      "step": 1914
+    },
+    {
+      "epoch": 0.6031496062992125,
+      "grad_norm": 26.693084716796875,
+      "learning_rate": 9.996990551181104e-06,
+      "loss": 0.572,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6034645669291339,
+      "grad_norm": 71.67481994628906,
+      "learning_rate": 9.996988976377955e-06,
+      "loss": 0.1498,
+      "step": 1916
+    },
+    {
+      "epoch": 0.6037795275590552,
+      "grad_norm": 32.80756378173828,
+      "learning_rate": 9.996987401574804e-06,
+      "loss": 0.2654,
+      "step": 1917
+    },
+    {
+      "epoch": 0.6040944881889764,
+      "grad_norm": 73.54534149169922,
+      "learning_rate": 9.996985826771655e-06,
+      "loss": 0.8395,
+      "step": 1918
+    },
+    {
+      "epoch": 0.6044094488188977,
+      "grad_norm": 124.98379516601562,
+      "learning_rate": 9.996984251968504e-06,
+      "loss": 0.4122,
+      "step": 1919
+    },
+    {
+      "epoch": 0.6047244094488189,
+      "grad_norm": 58.07841873168945,
+      "learning_rate": 9.996982677165355e-06,
+      "loss": 0.4811,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6047244094488189,
+      "eval_loss": 0.490536630153656,
+      "eval_runtime": 351.4081,
+      "eval_samples_per_second": 0.333,
+      "eval_steps_per_second": 0.333,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6050393700787402,
+      "grad_norm": 47.089378356933594,
+      "learning_rate": 9.996981102362206e-06,
+      "loss": 0.5479,
+      "step": 1921
+    },
+    {
+      "epoch": 0.6053543307086614,
+      "grad_norm": 40.61234664916992,
+      "learning_rate": 9.996979527559057e-06,
+      "loss": 0.1702,
+      "step": 1922
+    },
+    {
+      "epoch": 0.6056692913385827,
+      "grad_norm": 76.10828399658203,
+      "learning_rate": 9.996977952755906e-06,
+      "loss": 1.0415,
+      "step": 1923
+    },
+    {
+      "epoch": 0.6059842519685039,
+      "grad_norm": 8.746352195739746,
+      "learning_rate": 9.996976377952756e-06,
+      "loss": 0.0595,
+      "step": 1924
+    },
+    {
+      "epoch": 0.6062992125984252,
+      "grad_norm": 49.07436752319336,
+      "learning_rate": 9.996974803149607e-06,
+      "loss": 0.8017,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6066141732283464,
+      "grad_norm": 37.18422317504883,
+      "learning_rate": 9.996973228346458e-06,
+      "loss": 0.3798,
+      "step": 1926
+    },
+    {
+      "epoch": 0.6069291338582677,
+      "grad_norm": 41.602073669433594,
+      "learning_rate": 9.996971653543309e-06,
+      "loss": 0.4223,
+      "step": 1927
+    },
+    {
+      "epoch": 0.607244094488189,
+      "grad_norm": 46.16876220703125,
+      "learning_rate": 9.996970078740158e-06,
+      "loss": 0.5156,
+      "step": 1928
+    },
+    {
+      "epoch": 0.6075590551181103,
+      "grad_norm": 38.204036712646484,
+      "learning_rate": 9.996968503937009e-06,
+      "loss": 0.3032,
+      "step": 1929
+    },
+    {
+      "epoch": 0.6078740157480315,
+      "grad_norm": 7.078056335449219,
+      "learning_rate": 9.996966929133858e-06,
+      "loss": 0.0285,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6081889763779528,
+      "grad_norm": 76.96788787841797,
+      "learning_rate": 9.99696535433071e-06,
+      "loss": 0.4164,
+      "step": 1931
+    },
+    {
+      "epoch": 0.608503937007874,
+      "grad_norm": 33.73112487792969,
+      "learning_rate": 9.99696377952756e-06,
+      "loss": 0.1971,
+      "step": 1932
+    },
+    {
+      "epoch": 0.6088188976377953,
+      "grad_norm": 50.70700454711914,
+      "learning_rate": 9.996962204724411e-06,
+      "loss": 0.6604,
+      "step": 1933
+    },
+    {
+      "epoch": 0.6091338582677165,
+      "grad_norm": 16.3695011138916,
+      "learning_rate": 9.99696062992126e-06,
+      "loss": 0.2403,
+      "step": 1934
+    },
+    {
+      "epoch": 0.6094488188976378,
+      "grad_norm": 32.59841537475586,
+      "learning_rate": 9.99695905511811e-06,
+      "loss": 0.3409,
+      "step": 1935
+    },
+    {
+      "epoch": 0.609763779527559,
+      "grad_norm": 33.59854507446289,
+      "learning_rate": 9.99695748031496e-06,
+      "loss": 0.2444,
+      "step": 1936
+    },
+    {
+      "epoch": 0.6100787401574803,
+      "grad_norm": 53.62559509277344,
+      "learning_rate": 9.996955905511812e-06,
+      "loss": 0.1804,
+      "step": 1937
+    },
+    {
+      "epoch": 0.6103937007874016,
+      "grad_norm": 41.21846389770508,
+      "learning_rate": 9.996954330708663e-06,
+      "loss": 0.8659,
+      "step": 1938
+    },
+    {
+      "epoch": 0.6107086614173228,
+      "grad_norm": 65.04766845703125,
+      "learning_rate": 9.996952755905512e-06,
+      "loss": 0.4908,
+      "step": 1939
+    },
+    {
+      "epoch": 0.6110236220472441,
+      "grad_norm": 63.45843505859375,
+      "learning_rate": 9.996951181102363e-06,
+      "loss": 0.8725,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6110236220472441,
+      "eval_loss": 0.5175274014472961,
+      "eval_runtime": 349.2545,
+      "eval_samples_per_second": 0.335,
+      "eval_steps_per_second": 0.335,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6113385826771653,
+      "grad_norm": 59.46821212768555,
+      "learning_rate": 9.996949606299214e-06,
+      "loss": 0.5103,
+      "step": 1941
+    },
+    {
+      "epoch": 0.6116535433070867,
+      "grad_norm": 18.964391708374023,
+      "learning_rate": 9.996948031496065e-06,
+      "loss": 0.3203,
+      "step": 1942
+    },
+    {
+      "epoch": 0.6119685039370079,
+      "grad_norm": 101.10069274902344,
+      "learning_rate": 9.996946456692914e-06,
+      "loss": 0.7592,
+      "step": 1943
+    },
+    {
+      "epoch": 0.6122834645669292,
+      "grad_norm": 36.87664794921875,
+      "learning_rate": 9.996944881889764e-06,
+      "loss": 0.5184,
+      "step": 1944
+    },
+    {
+      "epoch": 0.6125984251968504,
+      "grad_norm": 31.650327682495117,
+      "learning_rate": 9.996943307086615e-06,
+      "loss": 0.3825,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6129133858267717,
+      "grad_norm": 23.641586303710938,
+      "learning_rate": 9.996941732283466e-06,
+      "loss": 0.1946,
+      "step": 1946
+    },
+    {
+      "epoch": 0.6132283464566929,
+      "grad_norm": 19.757226943969727,
+      "learning_rate": 9.996940157480317e-06,
+      "loss": 0.0762,
+      "step": 1947
+    },
+    {
+      "epoch": 0.6135433070866142,
+      "grad_norm": 10.93740177154541,
+      "learning_rate": 9.996938582677166e-06,
+      "loss": 0.0708,
+      "step": 1948
+    },
+    {
+      "epoch": 0.6138582677165354,
+      "grad_norm": 21.542688369750977,
+      "learning_rate": 9.996937007874017e-06,
+      "loss": 0.2724,
+      "step": 1949
+    },
+    {
+      "epoch": 0.6141732283464567,
+      "grad_norm": 27.340009689331055,
+      "learning_rate": 9.996935433070866e-06,
+      "loss": 0.0854,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6144881889763779,
+      "grad_norm": 47.907955169677734,
+      "learning_rate": 9.996933858267717e-06,
+      "loss": 0.3816,
+      "step": 1951
+    },
+    {
+      "epoch": 0.6148031496062992,
+      "grad_norm": 65.70764923095703,
+      "learning_rate": 9.996932283464568e-06,
+      "loss": 0.2682,
+      "step": 1952
+    },
+    {
+      "epoch": 0.6151181102362204,
+      "grad_norm": 39.29658889770508,
+      "learning_rate": 9.99693070866142e-06,
+      "loss": 0.3791,
+      "step": 1953
+    },
+    {
+      "epoch": 0.6154330708661417,
+      "grad_norm": 7.722301006317139,
+      "learning_rate": 9.996929133858268e-06,
+      "loss": 0.0215,
+      "step": 1954
+    },
+    {
+      "epoch": 0.6157480314960629,
+      "grad_norm": 47.819461822509766,
+      "learning_rate": 9.996927559055118e-06,
+      "loss": 0.6154,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6160629921259843,
+      "grad_norm": 32.92623519897461,
+      "learning_rate": 9.996925984251969e-06,
+      "loss": 0.1975,
+      "step": 1956
+    },
+    {
+      "epoch": 0.6163779527559056,
+      "grad_norm": 73.50929260253906,
+      "learning_rate": 9.99692440944882e-06,
+      "loss": 0.2781,
+      "step": 1957
+    },
+    {
+      "epoch": 0.6166929133858268,
+      "grad_norm": 26.083810806274414,
+      "learning_rate": 9.99692283464567e-06,
+      "loss": 0.3916,
+      "step": 1958
+    },
+    {
+      "epoch": 0.6170078740157481,
+      "grad_norm": 3.3253297805786133,
+      "learning_rate": 9.99692125984252e-06,
+      "loss": 0.0313,
+      "step": 1959
+    },
+    {
+      "epoch": 0.6173228346456693,
+      "grad_norm": 0.6226401329040527,
+      "learning_rate": 9.996919685039371e-06,
+      "loss": 0.0049,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6173228346456693,
+      "eval_loss": 0.4793933033943176,
+      "eval_runtime": 341.1297,
+      "eval_samples_per_second": 0.343,
+      "eval_steps_per_second": 0.343,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6176377952755906,
+      "grad_norm": 38.08949279785156,
+      "learning_rate": 9.99691811023622e-06,
+      "loss": 0.2089,
+      "step": 1961
+    },
+    {
+      "epoch": 0.6179527559055118,
+      "grad_norm": 96.51960754394531,
+      "learning_rate": 9.996916535433071e-06,
+      "loss": 0.6145,
+      "step": 1962
+    },
+    {
+      "epoch": 0.6182677165354331,
+      "grad_norm": 36.37595748901367,
+      "learning_rate": 9.996914960629922e-06,
+      "loss": 0.1043,
+      "step": 1963
+    },
+    {
+      "epoch": 0.6185826771653543,
+      "grad_norm": 112.60614013671875,
+      "learning_rate": 9.996913385826772e-06,
+      "loss": 1.0763,
+      "step": 1964
+    },
+    {
+      "epoch": 0.6188976377952756,
+      "grad_norm": 49.78593444824219,
+      "learning_rate": 9.996911811023623e-06,
+      "loss": 0.1965,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6192125984251968,
+      "grad_norm": 58.12906265258789,
+      "learning_rate": 9.996910236220474e-06,
+      "loss": 0.3536,
+      "step": 1966
+    },
+    {
+      "epoch": 0.6195275590551181,
+      "grad_norm": 71.21797180175781,
+      "learning_rate": 9.996908661417325e-06,
+      "loss": 0.6605,
+      "step": 1967
+    },
+    {
+      "epoch": 0.6198425196850393,
+      "grad_norm": 104.464111328125,
+      "learning_rate": 9.996907086614174e-06,
+      "loss": 0.3808,
+      "step": 1968
+    },
+    {
+      "epoch": 0.6201574803149607,
+      "grad_norm": 79.76933288574219,
+      "learning_rate": 9.996905511811025e-06,
+      "loss": 0.5754,
+      "step": 1969
+    },
+    {
+      "epoch": 0.6204724409448819,
+      "grad_norm": 21.67342758178711,
+      "learning_rate": 9.996903937007874e-06,
+      "loss": 0.0917,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6207874015748032,
+      "grad_norm": 34.137447357177734,
+      "learning_rate": 9.996902362204725e-06,
+      "loss": 0.5807,
+      "step": 1971
+    },
+    {
+      "epoch": 0.6211023622047244,
+      "grad_norm": 46.111595153808594,
+      "learning_rate": 9.996900787401576e-06,
+      "loss": 0.3738,
+      "step": 1972
+    },
+    {
+      "epoch": 0.6214173228346457,
+      "grad_norm": 45.28417205810547,
+      "learning_rate": 9.996899212598427e-06,
+      "loss": 0.7565,
+      "step": 1973
+    },
+    {
+      "epoch": 0.6217322834645669,
+      "grad_norm": 36.73102569580078,
+      "learning_rate": 9.996897637795276e-06,
+      "loss": 0.7887,
+      "step": 1974
+    },
+    {
+      "epoch": 0.6220472440944882,
+      "grad_norm": 69.32728576660156,
+      "learning_rate": 9.996896062992126e-06,
+      "loss": 0.4563,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6223622047244094,
+      "grad_norm": 43.308555603027344,
+      "learning_rate": 9.996894488188977e-06,
+      "loss": 0.2911,
+      "step": 1976
+    },
+    {
+      "epoch": 0.6226771653543307,
+      "grad_norm": 40.517086029052734,
+      "learning_rate": 9.996892913385828e-06,
+      "loss": 0.3979,
+      "step": 1977
+    },
+    {
+      "epoch": 0.622992125984252,
+      "grad_norm": 68.62828826904297,
+      "learning_rate": 9.996891338582679e-06,
+      "loss": 0.3365,
+      "step": 1978
+    },
+    {
+      "epoch": 0.6233070866141732,
+      "grad_norm": 28.892871856689453,
+      "learning_rate": 9.996889763779528e-06,
+      "loss": 0.5079,
+      "step": 1979
+    },
+    {
+      "epoch": 0.6236220472440945,
+      "grad_norm": 22.088882446289062,
+      "learning_rate": 9.996888188976379e-06,
+      "loss": 0.3113,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6236220472440945,
+      "eval_loss": 0.4940292537212372,
+      "eval_runtime": 352.4729,
+      "eval_samples_per_second": 0.332,
+      "eval_steps_per_second": 0.332,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6239370078740157,
+      "grad_norm": 37.80302047729492,
+      "learning_rate": 9.996886614173228e-06,
+      "loss": 0.6656,
+      "step": 1981
+    },
+    {
+      "epoch": 0.624251968503937,
+      "grad_norm": 19.02584457397461,
+      "learning_rate": 9.99688503937008e-06,
+      "loss": 0.1548,
+      "step": 1982
+    },
+    {
+      "epoch": 0.6245669291338583,
+      "grad_norm": 10.896634101867676,
+      "learning_rate": 9.99688346456693e-06,
+      "loss": 0.1322,
+      "step": 1983
+    },
+    {
+      "epoch": 0.6248818897637796,
+      "grad_norm": 33.37139892578125,
+      "learning_rate": 9.99688188976378e-06,
+      "loss": 0.3109,
+      "step": 1984
+    },
+    {
+      "epoch": 0.6251968503937008,
+      "grad_norm": 20.222166061401367,
+      "learning_rate": 9.99688031496063e-06,
+      "loss": 0.3568,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6255118110236221,
+      "grad_norm": 26.97902488708496,
+      "learning_rate": 9.996878740157482e-06,
+      "loss": 0.4469,
+      "step": 1986
+    },
+    {
+      "epoch": 0.6258267716535433,
+      "grad_norm": 3.6876025199890137,
+      "learning_rate": 9.996877165354333e-06,
+      "loss": 0.017,
+      "step": 1987
+    },
+    {
+      "epoch": 0.6261417322834646,
+      "grad_norm": 3.7643909454345703,
+      "learning_rate": 9.996875590551182e-06,
+      "loss": 0.0312,
+      "step": 1988
+    },
+    {
+      "epoch": 0.6264566929133858,
+      "grad_norm": 58.000083923339844,
+      "learning_rate": 9.996874015748033e-06,
+      "loss": 0.5242,
+      "step": 1989
+    },
+    {
+      "epoch": 0.6267716535433071,
+      "grad_norm": 57.707000732421875,
+      "learning_rate": 9.996872440944882e-06,
+      "loss": 0.537,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6270866141732283,
+      "grad_norm": 23.221139907836914,
+      "learning_rate": 9.996870866141733e-06,
+      "loss": 0.1436,
+      "step": 1991
+    },
+    {
+      "epoch": 0.6274015748031496,
+      "grad_norm": 17.156478881835938,
+      "learning_rate": 9.996869291338584e-06,
+      "loss": 0.2777,
+      "step": 1992
+    },
+    {
+      "epoch": 0.6277165354330708,
+      "grad_norm": 81.11396789550781,
+      "learning_rate": 9.996867716535435e-06,
+      "loss": 0.6183,
+      "step": 1993
+    },
+    {
+      "epoch": 0.6280314960629921,
+      "grad_norm": 17.866575241088867,
+      "learning_rate": 9.996866141732284e-06,
+      "loss": 0.2052,
+      "step": 1994
+    },
+    {
+      "epoch": 0.6283464566929133,
+      "grad_norm": 49.33943557739258,
+      "learning_rate": 9.996864566929134e-06,
+      "loss": 0.2541,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6286614173228346,
+      "grad_norm": 24.039709091186523,
+      "learning_rate": 9.996862992125985e-06,
+      "loss": 0.1116,
+      "step": 1996
+    },
+    {
+      "epoch": 0.628976377952756,
+      "grad_norm": 17.275371551513672,
+      "learning_rate": 9.996861417322836e-06,
+      "loss": 0.054,
+      "step": 1997
+    },
+    {
+      "epoch": 0.6292913385826772,
+      "grad_norm": 65.09687805175781,
+      "learning_rate": 9.996859842519687e-06,
+      "loss": 0.842,
+      "step": 1998
+    },
+    {
+      "epoch": 0.6296062992125985,
+      "grad_norm": 63.773319244384766,
+      "learning_rate": 9.996858267716536e-06,
+      "loss": 0.2476,
+      "step": 1999
+    },
+    {
+      "epoch": 0.6299212598425197,
+      "grad_norm": 101.19293212890625,
+      "learning_rate": 9.996856692913387e-06,
+      "loss": 1.0292,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6299212598425197,
+      "eval_loss": 0.49186941981315613,
+      "eval_runtime": 611.4185,
+      "eval_samples_per_second": 0.191,
+      "eval_steps_per_second": 0.191,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 6350000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2000,
+  "save_steps": 20,
+  "total_flos": 1.252556321381376e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}