Upload 10 files

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer-001.pt +3 -0
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +2465 -515
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50ab72ee4006e90d132d254bd261b095cdf9f599f538918a37986c6071d1773e
 size 1583544840

 version https://git-lfs.github.com/spec/v1
+oid sha256:ab5986497028108a88ad993e68c18562e52cb6c8e03114fe01200e5da0854e3b
 size 1583544840

optimizer-001.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d64b452c7cc1666de71a32e00b462c42364fad37b649fea63dfceeb3684a87ec
+size 3167201739

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd7d1ab5fa201d20d98db247d9f5d0ebc8dcb20aa2e1128cb5af2b40e8ae23a1
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:adbea6f60dc78ded4b6f92de10291bbc5facf95a1b4fb8015f15dc7bc7f39302
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:683f9d438efb114c8ac5a1515e4472a3e865f2fc3aea8a4b95df341b9ab5537f
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:9fc94cd9c1543eabca01e851af1d33e6ad7156a1958832e168dcad4d8856974b
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,1189 +2,3139 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.015,
   "eval_steps": 1000,
-  "global_step": 15000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 1e-06,
-      "grad_norm": 1.1795536279678345,
       "learning_rate": 0.0,
-      "loss": 1.4139,
       "step": 1
     },
     {
       "epoch": 0.0001,
-      "grad_norm": 1.1734141111373901,
-      "learning_rate": 9.900000000000001e-08,
-      "loss": 1.387,
       "step": 100
     },
     {
       "epoch": 0.0002,
-      "grad_norm": 1.1503151655197144,
-      "learning_rate": 1.9900000000000002e-07,
-      "loss": 1.3882,
       "step": 200
     },
     {
       "epoch": 0.0003,
-      "grad_norm": 1.1478229761123657,
-      "learning_rate": 2.99e-07,
-      "loss": 1.386,
       "step": 300
     },
     {
       "epoch": 0.0004,
-      "grad_norm": 1.1559761762619019,
-      "learning_rate": 3.99e-07,
-      "loss": 1.3823,
       "step": 400
     },
     {
       "epoch": 0.0005,
-      "grad_norm": 1.1433175802230835,
-      "learning_rate": 4.99e-07,
-      "loss": 1.381,
       "step": 500
     },
     {
       "epoch": 0.0006,
-      "grad_norm": 1.1483551263809204,
-      "learning_rate": 5.990000000000001e-07,
-      "loss": 1.3807,
       "step": 600
     },
     {
       "epoch": 0.0007,
-      "grad_norm": 1.161496877670288,
-      "learning_rate": 6.990000000000001e-07,
-      "loss": 1.3833,
       "step": 700
     },
     {
       "epoch": 0.0008,
-      "grad_norm": 1.139211654663086,
-      "learning_rate": 7.990000000000001e-07,
-      "loss": 1.3835,
       "step": 800
     },
     {
       "epoch": 0.0009,
-      "grad_norm": 1.133931040763855,
-      "learning_rate": 8.99e-07,
-      "loss": 1.3719,
       "step": 900
     },
     {
       "epoch": 0.001,
-      "grad_norm": 1.1143814325332642,
-      "learning_rate": 9.99e-07,
-      "loss": 1.3761,
       "step": 1000
     },
     {
       "epoch": 0.001,
-      "eval_loss": 1.4045685529708862,
-      "eval_runtime": 27.4497,
-      "eval_samples_per_second": 182.152,
-      "eval_steps_per_second": 2.878,
       "step": 1000
     },
     {
       "epoch": 0.0011,
-      "grad_norm": 1.153507947921753,
-      "learning_rate": 1.099e-06,
-      "loss": 1.3669,
       "step": 1100
     },
     {
       "epoch": 0.0012,
-      "grad_norm": 1.1281546354293823,
-      "learning_rate": 1.199e-06,
-      "loss": 1.375,
       "step": 1200
     },
     {
       "epoch": 0.0013,
-      "grad_norm": 1.1093217134475708,
-      "learning_rate": 1.299e-06,
-      "loss": 1.3726,
       "step": 1300
     },
     {
       "epoch": 0.0014,
-      "grad_norm": 1.1526917219161987,
-      "learning_rate": 1.399e-06,
-      "loss": 1.3696,
       "step": 1400
     },
     {
       "epoch": 0.0015,
-      "grad_norm": 1.1092661619186401,
-      "learning_rate": 1.4990000000000002e-06,
-      "loss": 1.3699,
       "step": 1500
     },
     {
       "epoch": 0.0016,
-      "grad_norm": 1.5104150772094727,
-      "learning_rate": 1.599e-06,
-      "loss": 1.3734,
       "step": 1600
     },
     {
       "epoch": 0.0017,
-      "grad_norm": 1.1301764249801636,
-      "learning_rate": 1.6990000000000002e-06,
-      "loss": 1.3719,
       "step": 1700
     },
     {
       "epoch": 0.0018,
-      "grad_norm": 1.120370626449585,
-      "learning_rate": 1.7990000000000003e-06,
-      "loss": 1.3695,
       "step": 1800
     },
     {
       "epoch": 0.0019,
-      "grad_norm": 1.145676612854004,
-      "learning_rate": 1.8990000000000004e-06,
-      "loss": 1.3675,
       "step": 1900
     },
     {
       "epoch": 0.002,
-      "grad_norm": 1.1365715265274048,
-      "learning_rate": 1.9990000000000003e-06,
-      "loss": 1.3616,
       "step": 2000
     },
     {
       "epoch": 0.002,
-      "eval_loss": 1.3989644050598145,
-      "eval_runtime": 24.4886,
-      "eval_samples_per_second": 204.177,
-      "eval_steps_per_second": 3.226,
       "step": 2000
     },
     {
       "epoch": 0.0021,
-      "grad_norm": 1.118861198425293,
-      "learning_rate": 2.099e-06,
-      "loss": 1.3635,
       "step": 2100
     },
     {
       "epoch": 0.0022,
-      "grad_norm": 1.1307072639465332,
-      "learning_rate": 2.1990000000000005e-06,
-      "loss": 1.375,
       "step": 2200
     },
     {
       "epoch": 0.0023,
-      "grad_norm": 1.088172197341919,
-      "learning_rate": 2.299e-06,
-      "loss": 1.3627,
       "step": 2300
     },
     {
       "epoch": 0.0024,
-      "grad_norm": 1.1681883335113525,
-      "learning_rate": 2.3990000000000002e-06,
-      "loss": 1.3607,
       "step": 2400
     },
     {
       "epoch": 0.0025,
-      "grad_norm": 1.1483210325241089,
-      "learning_rate": 2.499e-06,
-      "loss": 1.3687,
       "step": 2500
     },
     {
       "epoch": 0.0026,
-      "grad_norm": 1.1572397947311401,
-      "learning_rate": 2.5990000000000004e-06,
-      "loss": 1.3695,
       "step": 2600
     },
     {
       "epoch": 0.0027,
-      "grad_norm": 1.124837875366211,
-      "learning_rate": 2.699e-06,
-      "loss": 1.3532,
       "step": 2700
     },
     {
       "epoch": 0.0028,
-      "grad_norm": 1.0974047183990479,
-      "learning_rate": 2.7990000000000002e-06,
-      "loss": 1.3577,
       "step": 2800
     },
     {
       "epoch": 0.0029,
-      "grad_norm": 1.1722006797790527,
-      "learning_rate": 2.899e-06,
-      "loss": 1.3673,
       "step": 2900
     },
     {
       "epoch": 0.003,
-      "grad_norm": 1.106062650680542,
-      "learning_rate": 2.9990000000000004e-06,
-      "loss": 1.36,
       "step": 3000
     },
     {
       "epoch": 0.003,
-      "eval_loss": 1.3754355907440186,
-      "eval_runtime": 24.5927,
-      "eval_samples_per_second": 203.312,
-      "eval_steps_per_second": 3.212,
       "step": 3000
     },
     {
       "epoch": 0.0031,
-      "grad_norm": 1.1039618253707886,
-      "learning_rate": 3.0990000000000003e-06,
-      "loss": 1.3567,
       "step": 3100
     },
     {
       "epoch": 0.0032,
-      "grad_norm": 1.1439259052276611,
-      "learning_rate": 3.1990000000000006e-06,
-      "loss": 1.3543,
       "step": 3200
     },
     {
       "epoch": 0.0033,
-      "grad_norm": 1.1732087135314941,
-      "learning_rate": 3.2990000000000005e-06,
-      "loss": 1.3464,
       "step": 3300
     },
     {
       "epoch": 0.0034,
-      "grad_norm": 1.0517069101333618,
-      "learning_rate": 3.399e-06,
-      "loss": 1.3398,
       "step": 3400
     },
     {
       "epoch": 0.0035,
-      "grad_norm": 1.0987197160720825,
-      "learning_rate": 3.4990000000000003e-06,
-      "loss": 1.356,
       "step": 3500
     },
     {
       "epoch": 0.0036,
-      "grad_norm": 1.1524548530578613,
-      "learning_rate": 3.599e-06,
-      "loss": 1.3481,
       "step": 3600
     },
     {
       "epoch": 0.0037,
-      "grad_norm": 1.10309636592865,
-      "learning_rate": 3.6990000000000005e-06,
-      "loss": 1.3515,
       "step": 3700
     },
     {
       "epoch": 0.0038,
-      "grad_norm": 1.1285984516143799,
-      "learning_rate": 3.7990000000000004e-06,
-      "loss": 1.3541,
       "step": 3800
     },
     {
       "epoch": 0.0039,
-      "grad_norm": 1.1621686220169067,
-      "learning_rate": 3.899e-06,
-      "loss": 1.3532,
       "step": 3900
     },
     {
       "epoch": 0.004,
-      "grad_norm": 1.078803300857544,
-      "learning_rate": 3.999e-06,
-      "loss": 1.3468,
       "step": 4000
     },
     {
       "epoch": 0.004,
-      "eval_loss": 1.3711252212524414,
-      "eval_runtime": 24.5467,
-      "eval_samples_per_second": 203.693,
-      "eval_steps_per_second": 3.218,
       "step": 4000
     },
     {
       "epoch": 0.0041,
-      "grad_norm": 1.1375211477279663,
-      "learning_rate": 4.099e-06,
-      "loss": 1.341,
       "step": 4100
     },
     {
       "epoch": 0.0042,
-      "grad_norm": 1.0922551155090332,
-      "learning_rate": 4.199e-06,
-      "loss": 1.3427,
       "step": 4200
     },
     {
       "epoch": 0.0043,
-      "grad_norm": 1.124060034751892,
-      "learning_rate": 4.299000000000001e-06,
-      "loss": 1.3409,
       "step": 4300
     },
     {
       "epoch": 0.0044,
-      "grad_norm": 1.125467300415039,
-      "learning_rate": 4.3990000000000006e-06,
-      "loss": 1.3467,
       "step": 4400
     },
     {
       "epoch": 0.0045,
-      "grad_norm": 1.1384063959121704,
-      "learning_rate": 4.4990000000000005e-06,
-      "loss": 1.3426,
       "step": 4500
     },
     {
       "epoch": 0.0046,
-      "grad_norm": 1.1456679105758667,
-      "learning_rate": 4.599e-06,
-      "loss": 1.3445,
       "step": 4600
     },
     {
       "epoch": 0.0047,
-      "grad_norm": 1.1553903818130493,
-      "learning_rate": 4.699e-06,
-      "loss": 1.3372,
       "step": 4700
     },
     {
       "epoch": 0.0048,
-      "grad_norm": 1.1315921545028687,
-      "learning_rate": 4.799e-06,
-      "loss": 1.3398,
       "step": 4800
     },
     {
       "epoch": 0.0049,
-      "grad_norm": 1.08122980594635,
-      "learning_rate": 4.899e-06,
-      "loss": 1.3364,
       "step": 4900
     },
     {
       "epoch": 0.005,
-      "grad_norm": 1.09906804561615,
-      "learning_rate": 4.999000000000001e-06,
-      "loss": 1.3366,
       "step": 5000
     },
     {
       "epoch": 0.005,
-      "eval_loss": 1.3536914587020874,
-      "eval_runtime": 24.5551,
-      "eval_samples_per_second": 203.624,
-      "eval_steps_per_second": 3.217,
       "step": 5000
     },
     {
       "epoch": 0.0051,
-      "grad_norm": 1.1291029453277588,
-      "learning_rate": 5.099000000000001e-06,
-      "loss": 1.3396,
       "step": 5100
     },
     {
       "epoch": 0.0052,
-      "grad_norm": 1.1673402786254883,
-      "learning_rate": 5.1990000000000005e-06,
-      "loss": 1.3358,
       "step": 5200
     },
     {
       "epoch": 0.0053,
-      "grad_norm": 1.1300634145736694,
-      "learning_rate": 5.2990000000000004e-06,
-      "loss": 1.3384,
       "step": 5300
     },
     {
       "epoch": 0.0054,
-      "grad_norm": 1.1179150342941284,
-      "learning_rate": 5.399000000000001e-06,
-      "loss": 1.3332,
       "step": 5400
     },
     {
       "epoch": 0.0055,
-      "grad_norm": 1.091856837272644,
-      "learning_rate": 5.499000000000001e-06,
-      "loss": 1.3348,
       "step": 5500
     },
     {
       "epoch": 0.0056,
-      "grad_norm": 1.0551645755767822,
-      "learning_rate": 5.599e-06,
-      "loss": 1.336,
       "step": 5600
     },
     {
       "epoch": 0.0057,
-      "grad_norm": 1.1457860469818115,
-      "learning_rate": 5.699e-06,
-      "loss": 1.333,
       "step": 5700
     },
     {
       "epoch": 0.0058,
-      "grad_norm": 1.1662046909332275,
-      "learning_rate": 5.799e-06,
-      "loss": 1.3299,
       "step": 5800
     },
     {
       "epoch": 0.0059,
-      "grad_norm": 1.1879452466964722,
-      "learning_rate": 5.899000000000001e-06,
-      "loss": 1.3354,
       "step": 5900
     },
     {
       "epoch": 0.006,
-      "grad_norm": 1.1441973447799683,
-      "learning_rate": 5.9990000000000005e-06,
-      "loss": 1.3329,
       "step": 6000
     },
     {
       "epoch": 0.006,
-      "eval_loss": 1.3535875082015991,
-      "eval_runtime": 24.3908,
-      "eval_samples_per_second": 204.995,
-      "eval_steps_per_second": 3.239,
       "step": 6000
     },
     {
       "epoch": 0.0061,
-      "grad_norm": 1.121394395828247,
-      "learning_rate": 6.099e-06,
-      "loss": 1.3295,
       "step": 6100
     },
     {
       "epoch": 0.0062,
-      "grad_norm": 1.1496130228042603,
-      "learning_rate": 6.199e-06,
-      "loss": 1.3303,
       "step": 6200
     },
     {
       "epoch": 0.0063,
-      "grad_norm": 1.2465569972991943,
-      "learning_rate": 6.299000000000001e-06,
-      "loss": 1.3268,
       "step": 6300
     },
     {
       "epoch": 0.0064,
-      "grad_norm": 1.1363328695297241,
-      "learning_rate": 6.399000000000001e-06,
-      "loss": 1.3248,
       "step": 6400
     },
     {
       "epoch": 0.0065,
-      "grad_norm": 1.1142207384109497,
-      "learning_rate": 6.499000000000001e-06,
-      "loss": 1.3212,
       "step": 6500
     },
     {
       "epoch": 0.0066,
-      "grad_norm": 1.1020450592041016,
-      "learning_rate": 6.599000000000001e-06,
-      "loss": 1.3305,
       "step": 6600
     },
     {
       "epoch": 0.0067,
-      "grad_norm": 1.0636595487594604,
-      "learning_rate": 6.699000000000001e-06,
-      "loss": 1.3343,
       "step": 6700
     },
     {
       "epoch": 0.0068,
-      "grad_norm": 1.0846408605575562,
-      "learning_rate": 6.7990000000000005e-06,
-      "loss": 1.3306,
       "step": 6800
     },
     {
       "epoch": 0.0069,
-      "grad_norm": 1.2017494440078735,
-      "learning_rate": 6.899e-06,
-      "loss": 1.3191,
       "step": 6900
     },
     {
       "epoch": 0.007,
-      "grad_norm": 1.159947156906128,
-      "learning_rate": 6.999e-06,
-      "loss": 1.334,
       "step": 7000
     },
     {
       "epoch": 0.007,
-      "eval_loss": 1.3385692834854126,
-      "eval_runtime": 24.4488,
-      "eval_samples_per_second": 204.509,
-      "eval_steps_per_second": 3.231,
       "step": 7000
     },
     {
       "epoch": 0.0071,
-      "grad_norm": 1.1962409019470215,
-      "learning_rate": 7.099e-06,
-      "loss": 1.323,
       "step": 7100
     },
     {
       "epoch": 0.0072,
-      "grad_norm": 1.1551247835159302,
-      "learning_rate": 7.199e-06,
-      "loss": 1.3119,
       "step": 7200
     },
     {
       "epoch": 0.0073,
-      "grad_norm": 1.1543225049972534,
-      "learning_rate": 7.299000000000001e-06,
-      "loss": 1.3261,
       "step": 7300
     },
     {
       "epoch": 0.0074,
-      "grad_norm": 1.133355975151062,
-      "learning_rate": 7.399000000000001e-06,
-      "loss": 1.3241,
       "step": 7400
     },
     {
       "epoch": 0.0075,
-      "grad_norm": 1.1490956544876099,
-      "learning_rate": 7.4990000000000005e-06,
-      "loss": 1.3293,
       "step": 7500
     },
     {
       "epoch": 0.0076,
-      "grad_norm": 1.0732618570327759,
-      "learning_rate": 7.5990000000000004e-06,
-      "loss": 1.3216,
       "step": 7600
     },
     {
       "epoch": 0.0077,
-      "grad_norm": 1.170203685760498,
-      "learning_rate": 7.699e-06,
-      "loss": 1.3193,
       "step": 7700
     },
     {
       "epoch": 0.0078,
-      "grad_norm": 1.0613148212432861,
-      "learning_rate": 7.799000000000001e-06,
-      "loss": 1.329,
       "step": 7800
     },
     {
       "epoch": 0.0079,
-      "grad_norm": 1.2019593715667725,
-      "learning_rate": 7.899000000000002e-06,
-      "loss": 1.315,
       "step": 7900
     },
     {
       "epoch": 0.008,
-      "grad_norm": 1.1080353260040283,
-      "learning_rate": 7.999e-06,
-      "loss": 1.3181,
       "step": 8000
     },
     {
       "epoch": 0.008,
-      "eval_loss": 1.3239587545394897,
-      "eval_runtime": 24.4556,
-      "eval_samples_per_second": 204.452,
-      "eval_steps_per_second": 3.23,
       "step": 8000
     },
     {
       "epoch": 0.0081,
-      "grad_norm": 1.1273937225341797,
-      "learning_rate": 8.099e-06,
-      "loss": 1.3252,
       "step": 8100
     },
     {
       "epoch": 0.0082,
-      "grad_norm": 1.0942583084106445,
-      "learning_rate": 8.199e-06,
-      "loss": 1.3164,
       "step": 8200
     },
     {
       "epoch": 0.0083,
-      "grad_norm": 1.1845577955245972,
-      "learning_rate": 8.299e-06,
-      "loss": 1.32,
       "step": 8300
     },
     {
       "epoch": 0.0084,
-      "grad_norm": 1.2376071214675903,
-      "learning_rate": 8.399e-06,
-      "loss": 1.314,
       "step": 8400
     },
     {
       "epoch": 0.0085,
-      "grad_norm": 1.5554766654968262,
-      "learning_rate": 8.499000000000001e-06,
-      "loss": 1.4128,
       "step": 8500
     },
     {
       "epoch": 0.0086,
-      "grad_norm": 1.736693024635315,
-      "learning_rate": 8.599e-06,
-      "loss": 1.5028,
       "step": 8600
     },
     {
       "epoch": 0.0087,
-      "grad_norm": 1.8339451551437378,
-      "learning_rate": 8.699000000000001e-06,
-      "loss": 1.5346,
       "step": 8700
     },
     {
       "epoch": 0.0088,
-      "grad_norm": 1.827017068862915,
-      "learning_rate": 8.799000000000002e-06,
-      "loss": 1.5309,
       "step": 8800
     },
     {
       "epoch": 0.0089,
-      "grad_norm": 1.7209491729736328,
-      "learning_rate": 8.899e-06,
-      "loss": 1.5202,
       "step": 8900
     },
     {
       "epoch": 0.009,
-      "grad_norm": 1.7649836540222168,
-      "learning_rate": 8.999000000000001e-06,
-      "loss": 1.5311,
       "step": 9000
     },
     {
       "epoch": 0.009,
-      "eval_loss": 1.3453279733657837,
-      "eval_runtime": 24.4639,
-      "eval_samples_per_second": 204.383,
-      "eval_steps_per_second": 3.229,
       "step": 9000
     },
     {
       "epoch": 0.0091,
-      "grad_norm": 1.758984923362732,
-      "learning_rate": 9.099e-06,
-      "loss": 1.5277,
       "step": 9100
     },
     {
       "epoch": 0.0092,
-      "grad_norm": 1.5517253875732422,
-      "learning_rate": 9.199000000000001e-06,
-      "loss": 1.5331,
       "step": 9200
     },
     {
       "epoch": 0.0093,
-      "grad_norm": 1.7491697072982788,
-      "learning_rate": 9.299e-06,
-      "loss": 1.5376,
       "step": 9300
     },
     {
       "epoch": 0.0094,
-      "grad_norm": 1.7253761291503906,
-      "learning_rate": 9.399000000000001e-06,
-      "loss": 1.5319,
       "step": 9400
     },
     {
       "epoch": 0.0095,
-      "grad_norm": 1.7779654264450073,
-      "learning_rate": 9.499e-06,
-      "loss": 1.5455,
       "step": 9500
     },
     {
       "epoch": 0.0096,
-      "grad_norm": 1.8502960205078125,
-      "learning_rate": 9.599e-06,
-      "loss": 1.5256,
       "step": 9600
     },
     {
       "epoch": 0.0097,
-      "grad_norm": 1.595805287361145,
-      "learning_rate": 9.699e-06,
-      "loss": 1.5338,
       "step": 9700
     },
     {
       "epoch": 0.0098,
-      "grad_norm": 1.7826145887374878,
-      "learning_rate": 9.799e-06,
-      "loss": 1.5297,
       "step": 9800
     },
     {
       "epoch": 0.0099,
-      "grad_norm": 1.8574384450912476,
-      "learning_rate": 9.899000000000001e-06,
-      "loss": 1.537,
       "step": 9900
     },
     {
       "epoch": 0.01,
-      "grad_norm": 1.6225100755691528,
-      "learning_rate": 9.999e-06,
-      "loss": 1.5373,
       "step": 10000
     },
     {
       "epoch": 0.01,
-      "eval_loss": 1.3474788665771484,
-      "eval_runtime": 24.6009,
-      "eval_samples_per_second": 203.244,
-      "eval_steps_per_second": 3.211,
       "step": 10000
     },
     {
       "epoch": 0.0101,
-      "grad_norm": 1.7013579607009888,
-      "learning_rate": 9.999999753259893e-06,
-      "loss": 1.5213,
       "step": 10100
     },
     {
       "epoch": 0.0102,
-      "grad_norm": 1.8451807498931885,
-      "learning_rate": 9.999999003045122e-06,
-      "loss": 1.5252,
       "step": 10200
     },
     {
       "epoch": 0.0103,
-      "grad_norm": 1.6487650871276855,
-      "learning_rate": 9.999997749330588e-06,
-      "loss": 1.5313,
       "step": 10300
     },
     {
       "epoch": 0.0104,
-      "grad_norm": 1.7240970134735107,
-      "learning_rate": 9.999995992116415e-06,
-      "loss": 1.5375,
       "step": 10400
     },
     {
       "epoch": 0.0105,
-      "grad_norm": 1.5860111713409424,
-      "learning_rate": 9.999993731402786e-06,
-      "loss": 1.535,
       "step": 10500
     },
     {
       "epoch": 0.0106,
-      "grad_norm": 1.6990783214569092,
-      "learning_rate": 9.999990967189924e-06,
-      "loss": 1.5415,
       "step": 10600
     },
     {
       "epoch": 0.0107,
-      "grad_norm": 1.7421098947525024,
-      "learning_rate": 9.999987699478109e-06,
-      "loss": 1.5266,
       "step": 10700
     },
     {
       "epoch": 0.0108,
-      "grad_norm": 1.6578110456466675,
-      "learning_rate": 9.999983928267668e-06,
-      "loss": 1.5256,
       "step": 10800
     },
     {
       "epoch": 0.0109,
-      "grad_norm": 1.8193341493606567,
-      "learning_rate": 9.999979653558982e-06,
-      "loss": 1.54,
       "step": 10900
     },
     {
       "epoch": 0.011,
-      "grad_norm": 1.7376822233200073,
-      "learning_rate": 9.999974875352482e-06,
-      "loss": 1.5345,
       "step": 11000
     },
     {
       "epoch": 0.011,
-      "eval_loss": 1.3439626693725586,
-      "eval_runtime": 24.6158,
-      "eval_samples_per_second": 203.122,
-      "eval_steps_per_second": 3.209,
       "step": 11000
     },
     {
       "epoch": 0.0111,
-      "grad_norm": 1.7770408391952515,
-      "learning_rate": 9.999969593648651e-06,
-      "loss": 1.5257,
       "step": 11100
     },
     {
       "epoch": 0.0112,
-      "grad_norm": 1.703754186630249,
-      "learning_rate": 9.999963808448016e-06,
-      "loss": 1.523,
       "step": 11200
     },
     {
       "epoch": 0.0113,
-      "grad_norm": 1.7194414138793945,
-      "learning_rate": 9.999957519751165e-06,
-      "loss": 1.5404,
       "step": 11300
     },
     {
       "epoch": 0.0114,
-      "grad_norm": 1.694810390472412,
-      "learning_rate": 9.999950727558727e-06,
-      "loss": 1.534,
       "step": 11400
     },
     {
       "epoch": 0.0115,
-      "grad_norm": 1.644400715827942,
-      "learning_rate": 9.999943431871388e-06,
-      "loss": 1.531,
       "step": 11500
     },
     {
       "epoch": 0.0116,
-      "grad_norm": 1.792406678199768,
-      "learning_rate": 9.99993563268988e-06,
-      "loss": 1.5298,
       "step": 11600
     },
     {
       "epoch": 0.0117,
-      "grad_norm": 1.9580830335617065,
-      "learning_rate": 9.999927330014993e-06,
-      "loss": 1.5268,
       "step": 11700
     },
     {
       "epoch": 0.0118,
-      "grad_norm": 1.6442023515701294,
-      "learning_rate": 9.99991852384756e-06,
-      "loss": 1.5257,
       "step": 11800
     },
     {
       "epoch": 0.0119,
-      "grad_norm": 1.680830478668213,
-      "learning_rate": 9.99990921418847e-06,
-      "loss": 1.5191,
       "step": 11900
     },
     {
       "epoch": 0.012,
-      "grad_norm": 1.6746671199798584,
-      "learning_rate": 9.999899401038656e-06,
-      "loss": 1.5372,
       "step": 12000
     },
     {
       "epoch": 0.012,
-      "eval_loss": 1.3511897325515747,
-      "eval_runtime": 24.555,
-      "eval_samples_per_second": 203.625,
-      "eval_steps_per_second": 3.217,
       "step": 12000
     },
     {
       "epoch": 0.0121,
-      "grad_norm": 1.7775862216949463,
-      "learning_rate": 9.99988908439911e-06,
-      "loss": 1.5182,
       "step": 12100
     },
     {
       "epoch": 0.0122,
-      "grad_norm": 1.5296705961227417,
-      "learning_rate": 9.999878264270871e-06,
-      "loss": 1.5303,
       "step": 12200
     },
     {
       "epoch": 0.0123,
-      "grad_norm": 1.7957079410552979,
-      "learning_rate": 9.999866940655027e-06,
-      "loss": 1.5328,
       "step": 12300
     },
     {
       "epoch": 0.0124,
-      "grad_norm": 1.8484801054000854,
-      "learning_rate": 9.99985511355272e-06,
-      "loss": 1.5162,
       "step": 12400
     },
     {
       "epoch": 0.0125,
-      "grad_norm": 1.7253010272979736,
-      "learning_rate": 9.999842782965139e-06,
-      "loss": 1.5178,
       "step": 12500
     },
     {
       "epoch": 0.0126,
-      "grad_norm": 1.7495081424713135,
-      "learning_rate": 9.999829948893528e-06,
-      "loss": 1.5233,
       "step": 12600
     },
     {
       "epoch": 0.0127,
-      "grad_norm": 1.6750719547271729,
-      "learning_rate": 9.999816611339175e-06,
-      "loss": 1.5203,
       "step": 12700
     },
     {
       "epoch": 0.0128,
-      "grad_norm": 1.7870038747787476,
-      "learning_rate": 9.999802770303427e-06,
-      "loss": 1.5106,
       "step": 12800
     },
     {
       "epoch": 0.0129,
-      "grad_norm": 1.6229153871536255,
-      "learning_rate": 9.999788425787678e-06,
-      "loss": 1.5399,
       "step": 12900
     },
     {
       "epoch": 0.013,
-      "grad_norm": 1.7483490705490112,
-      "learning_rate": 9.99977357779337e-06,
-      "loss": 1.519,
       "step": 13000
     },
     {
       "epoch": 0.013,
-      "eval_loss": 1.3341424465179443,
-      "eval_runtime": 24.5433,
-      "eval_samples_per_second": 203.722,
-      "eval_steps_per_second": 3.219,
       "step": 13000
     },
     {
       "epoch": 0.0131,
-      "grad_norm": 1.7631748914718628,
-      "learning_rate": 9.999758226322e-06,
-      "loss": 1.5232,
       "step": 13100
     },
     {
       "epoch": 0.0132,
-      "grad_norm": 1.6134735345840454,
-      "learning_rate": 9.999742371375114e-06,
-      "loss": 1.5352,
       "step": 13200
     },
     {
       "epoch": 0.0133,
-      "grad_norm": 1.8494335412979126,
-      "learning_rate": 9.999726012954308e-06,
-      "loss": 1.5254,
       "step": 13300
     },
     {
       "epoch": 0.0134,
-      "grad_norm": 1.9245802164077759,
-      "learning_rate": 9.999709151061228e-06,
-      "loss": 1.5358,
       "step": 13400
     },
     {
       "epoch": 0.0135,
-      "grad_norm": 1.755018711090088,
-      "learning_rate": 9.999691785697574e-06,
-      "loss": 1.5204,
       "step": 13500
     },
     {
       "epoch": 0.0136,
-      "grad_norm": 1.8922946453094482,
-      "learning_rate": 9.999673916865094e-06,
-      "loss": 1.5267,
       "step": 13600
     },
     {
       "epoch": 0.0137,
-      "grad_norm": 1.9781936407089233,
-      "learning_rate": 9.999655544565587e-06,
-      "loss": 1.5213,
       "step": 13700
     },
     {
       "epoch": 0.0138,
-      "grad_norm": 1.8312381505966187,
-      "learning_rate": 9.999636668800905e-06,
-      "loss": 1.517,
       "step": 13800
     },
     {
       "epoch": 0.0139,
-      "grad_norm": 1.6503413915634155,
-      "learning_rate": 9.999617289572946e-06,
-      "loss": 1.5169,
       "step": 13900
     },
     {
       "epoch": 0.014,
-      "grad_norm": 1.8612747192382812,
-      "learning_rate": 9.999597406883664e-06,
-      "loss": 1.5277,
       "step": 14000
     },
     {
       "epoch": 0.014,
-      "eval_loss": 1.3367455005645752,
-      "eval_runtime": 24.5718,
-      "eval_samples_per_second": 203.485,
-      "eval_steps_per_second": 3.215,
       "step": 14000
     },
     {
       "epoch": 0.0141,
-      "grad_norm": 1.8900790214538574,
-      "learning_rate": 9.999577020735059e-06,
-      "loss": 1.5276,
       "step": 14100
     },
     {
       "epoch": 0.0142,
-      "grad_norm": 1.720528244972229,
-      "learning_rate": 9.999556131129184e-06,
-      "loss": 1.5209,
       "step": 14200
     },
     {
       "epoch": 0.0143,
-      "grad_norm": 1.713659405708313,
-      "learning_rate": 9.999534738068145e-06,
-      "loss": 1.5194,
       "step": 14300
     },
     {
       "epoch": 0.0144,
-      "grad_norm": 1.662377119064331,
-      "learning_rate": 9.999512841554093e-06,
-      "loss": 1.5179,
       "step": 14400
     },
     {
       "epoch": 0.0145,
-      "grad_norm": 1.6507668495178223,
-      "learning_rate": 9.999490441589235e-06,
-      "loss": 1.5181,
       "step": 14500
     },
     {
       "epoch": 0.0146,
-      "grad_norm": 1.7075133323669434,
-      "learning_rate": 9.999467538175827e-06,
-      "loss": 1.5203,
       "step": 14600
     },
     {
       "epoch": 0.0147,
-      "grad_norm": 1.686068058013916,
-      "learning_rate": 9.999444131316173e-06,
-      "loss": 1.5156,
       "step": 14700
     },
     {
       "epoch": 0.0148,
-      "grad_norm": 1.6891603469848633,
-      "learning_rate": 9.999420221012635e-06,
-      "loss": 1.5195,
       "step": 14800
     },
     {
       "epoch": 0.0149,
-      "grad_norm": 1.784029245376587,
-      "learning_rate": 9.999395807267616e-06,
-      "loss": 1.509,
       "step": 14900
     },
     {
       "epoch": 0.015,
-      "grad_norm": 1.6361267566680908,
-      "learning_rate": 9.999370890083575e-06,
-      "loss": 1.5248,
       "step": 15000
     },
     {
       "epoch": 0.015,
-      "eval_loss": 1.3349226713180542,
-      "eval_runtime": 24.5747,
-      "eval_samples_per_second": 203.461,
-      "eval_steps_per_second": 3.215,
       "step": 15000
     }
   ],
   "logging_steps": 100,
@@ -1204,7 +3154,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.03079253229568e+18,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.04,
   "eval_steps": 1000,
+  "global_step": 40000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 1e-06,
+      "grad_norm": 1.7933695316314697,
       "learning_rate": 0.0,
+      "loss": 1.4289,
       "step": 1
     },
     {
       "epoch": 0.0001,
+      "grad_norm": 1.5815099477767944,
+      "learning_rate": 4.9500000000000006e-08,
+      "loss": 1.4071,
       "step": 100
     },
     {
       "epoch": 0.0002,
+      "grad_norm": 1.9177594184875488,
+      "learning_rate": 9.950000000000001e-08,
+      "loss": 1.3975,
       "step": 200
     },
     {
       "epoch": 0.0003,
+      "grad_norm": 1.749267816543579,
+      "learning_rate": 1.495e-07,
+      "loss": 1.39,
       "step": 300
     },
     {
       "epoch": 0.0004,
+      "grad_norm": 1.783465027809143,
+      "learning_rate": 1.995e-07,
+      "loss": 1.4035,
       "step": 400
     },
     {
       "epoch": 0.0005,
+      "grad_norm": 1.727728009223938,
+      "learning_rate": 2.495e-07,
+      "loss": 1.3944,
       "step": 500
     },
     {
       "epoch": 0.0006,
+      "grad_norm": 1.7734591960906982,
+      "learning_rate": 2.9950000000000005e-07,
+      "loss": 1.3968,
       "step": 600
     },
     {
       "epoch": 0.0007,
+      "grad_norm": 1.7365233898162842,
+      "learning_rate": 3.4950000000000005e-07,
+      "loss": 1.4015,
       "step": 700
     },
     {
       "epoch": 0.0008,
+      "grad_norm": 1.898462176322937,
+      "learning_rate": 3.9950000000000005e-07,
+      "loss": 1.404,
       "step": 800
     },
     {
       "epoch": 0.0009,
+      "grad_norm": 1.5940463542938232,
+      "learning_rate": 4.495e-07,
+      "loss": 1.4013,
       "step": 900
     },
     {
       "epoch": 0.001,
+      "grad_norm": 1.948064923286438,
+      "learning_rate": 4.995e-07,
+      "loss": 1.4016,
       "step": 1000
     },
     {
       "epoch": 0.001,
+      "eval_loss": 1.4141713380813599,
+      "eval_runtime": 30.1065,
+      "eval_samples_per_second": 166.077,
+      "eval_steps_per_second": 2.624,
       "step": 1000
     },
     {
       "epoch": 0.0011,
+      "grad_norm": 1.7923760414123535,
+      "learning_rate": 5.495e-07,
+      "loss": 1.3934,
       "step": 1100
     },
     {
       "epoch": 0.0012,
+      "grad_norm": 1.8533916473388672,
+      "learning_rate": 5.995e-07,
+      "loss": 1.4183,
       "step": 1200
     },
     {
       "epoch": 0.0013,
+      "grad_norm": 1.7842605113983154,
+      "learning_rate": 6.495e-07,
+      "loss": 1.4022,
       "step": 1300
     },
     {
       "epoch": 0.0014,
+      "grad_norm": 1.6869210004806519,
+      "learning_rate": 6.995e-07,
+      "loss": 1.4055,
       "step": 1400
     },
     {
       "epoch": 0.0015,
+      "grad_norm": 1.7422460317611694,
+      "learning_rate": 7.495000000000001e-07,
+      "loss": 1.4112,
       "step": 1500
     },
     {
       "epoch": 0.0016,
+      "grad_norm": 1.8024979829788208,
+      "learning_rate": 7.995e-07,
+      "loss": 1.4002,
       "step": 1600
     },
     {
       "epoch": 0.0017,
+      "grad_norm": 1.8899842500686646,
+      "learning_rate": 8.495000000000001e-07,
+      "loss": 1.3927,
       "step": 1700
     },
     {
       "epoch": 0.0018,
+      "grad_norm": 1.9851900339126587,
+      "learning_rate": 8.995000000000001e-07,
+      "loss": 1.4143,
       "step": 1800
     },
     {
       "epoch": 0.0019,
+      "grad_norm": 1.8107774257659912,
+      "learning_rate": 9.495000000000002e-07,
+      "loss": 1.4056,
       "step": 1900
     },
     {
       "epoch": 0.002,
+      "grad_norm": 1.723138689994812,
+      "learning_rate": 9.995000000000001e-07,
+      "loss": 1.3832,
       "step": 2000
     },
     {
       "epoch": 0.002,
+      "eval_loss": 1.4142277240753174,
+      "eval_runtime": 24.9444,
+      "eval_samples_per_second": 200.446,
+      "eval_steps_per_second": 3.167,
       "step": 2000
     },
     {
       "epoch": 0.0021,
+      "grad_norm": 1.8459888696670532,
+      "learning_rate": 1.0495e-06,
+      "loss": 1.4003,
       "step": 2100
     },
     {
       "epoch": 0.0022,
+      "grad_norm": 1.8294060230255127,
+      "learning_rate": 1.0995000000000002e-06,
+      "loss": 1.4083,
       "step": 2200
     },
     {
       "epoch": 0.0023,
+      "grad_norm": 1.8045350313186646,
+      "learning_rate": 1.1495e-06,
+      "loss": 1.408,
       "step": 2300
     },
     {
       "epoch": 0.0024,
+      "grad_norm": 1.7030452489852905,
+      "learning_rate": 1.1995000000000001e-06,
+      "loss": 1.3929,
       "step": 2400
     },
     {
       "epoch": 0.0025,
+      "grad_norm": 1.6902519464492798,
+      "learning_rate": 1.2495e-06,
+      "loss": 1.3857,
       "step": 2500
     },
     {
       "epoch": 0.0026,
+      "grad_norm": 1.7536392211914062,
+      "learning_rate": 1.2995000000000002e-06,
+      "loss": 1.4034,
       "step": 2600
     },
     {
       "epoch": 0.0027,
+      "grad_norm": 1.9011949300765991,
+      "learning_rate": 1.3495e-06,
+      "loss": 1.411,
       "step": 2700
     },
     {
       "epoch": 0.0028,
+      "grad_norm": 1.8409913778305054,
+      "learning_rate": 1.3995000000000001e-06,
+      "loss": 1.4002,
       "step": 2800
     },
     {
       "epoch": 0.0029,
+      "grad_norm": 1.832072377204895,
+      "learning_rate": 1.4495e-06,
+      "loss": 1.4111,
       "step": 2900
     },
     {
       "epoch": 0.003,
+      "grad_norm": 1.7680649757385254,
+      "learning_rate": 1.4995000000000002e-06,
+      "loss": 1.3953,
       "step": 3000
     },
     {
       "epoch": 0.003,
+      "eval_loss": 1.392706274986267,
+      "eval_runtime": 23.9892,
+      "eval_samples_per_second": 208.427,
+      "eval_steps_per_second": 3.293,
       "step": 3000
     },
     {
       "epoch": 0.0031,
+      "grad_norm": 1.8069273233413696,
+      "learning_rate": 1.5495000000000002e-06,
+      "loss": 1.4041,
       "step": 3100
     },
     {
       "epoch": 0.0032,
+      "grad_norm": 1.9078863859176636,
+      "learning_rate": 1.5995000000000003e-06,
+      "loss": 1.3922,
       "step": 3200
     },
     {
       "epoch": 0.0033,
+      "grad_norm": 1.8206089735031128,
+      "learning_rate": 1.6495000000000003e-06,
+      "loss": 1.3909,
       "step": 3300
     },
     {
       "epoch": 0.0034,
+      "grad_norm": 1.850622296333313,
+      "learning_rate": 1.6995e-06,
+      "loss": 1.3866,
       "step": 3400
     },
     {
       "epoch": 0.0035,
+      "grad_norm": 1.823333978652954,
+      "learning_rate": 1.7495000000000001e-06,
+      "loss": 1.3991,
       "step": 3500
     },
     {
       "epoch": 0.0036,
+      "grad_norm": 1.7618579864501953,
+      "learning_rate": 1.7995e-06,
+      "loss": 1.3932,
       "step": 3600
     },
     {
       "epoch": 0.0037,
+      "grad_norm": 1.778365135192871,
+      "learning_rate": 1.8495000000000002e-06,
+      "loss": 1.4059,
       "step": 3700
     },
     {
       "epoch": 0.0038,
+      "grad_norm": 1.757568359375,
+      "learning_rate": 1.8995000000000002e-06,
+      "loss": 1.4031,
       "step": 3800
     },
     {
       "epoch": 0.0039,
+      "grad_norm": 1.9897911548614502,
+      "learning_rate": 1.9495e-06,
+      "loss": 1.3958,
       "step": 3900
     },
     {
       "epoch": 0.004,
+      "grad_norm": 1.789900302886963,
+      "learning_rate": 1.9995e-06,
+      "loss": 1.3836,
       "step": 4000
     },
     {
       "epoch": 0.004,
+      "eval_loss": 1.3976104259490967,
+      "eval_runtime": 24.0065,
+      "eval_samples_per_second": 208.277,
+      "eval_steps_per_second": 3.291,
       "step": 4000
     },
     {
       "epoch": 0.0041,
+      "grad_norm": 1.9247645139694214,
+      "learning_rate": 2.0495e-06,
+      "loss": 1.4072,
       "step": 4100
     },
     {
       "epoch": 0.0042,
+      "grad_norm": 1.8118959665298462,
+      "learning_rate": 2.0995e-06,
+      "loss": 1.3973,
       "step": 4200
     },
     {
       "epoch": 0.0043,
+      "grad_norm": 1.8579908609390259,
+      "learning_rate": 2.1495000000000003e-06,
+      "loss": 1.3952,
       "step": 4300
     },
     {
       "epoch": 0.0044,
+      "grad_norm": 1.7695815563201904,
+      "learning_rate": 2.1995000000000003e-06,
+      "loss": 1.4004,
       "step": 4400
     },
     {
       "epoch": 0.0045,
+      "grad_norm": 1.7015595436096191,
+      "learning_rate": 2.2495000000000002e-06,
+      "loss": 1.3951,
       "step": 4500
     },
     {
       "epoch": 0.0046,
+      "grad_norm": 1.710094690322876,
+      "learning_rate": 2.2995e-06,
+      "loss": 1.3956,
       "step": 4600
     },
     {
       "epoch": 0.0047,
+      "grad_norm": 1.748152732849121,
+      "learning_rate": 2.3495e-06,
+      "loss": 1.3922,
       "step": 4700
     },
     {
       "epoch": 0.0048,
+      "grad_norm": 1.7241308689117432,
+      "learning_rate": 2.3995e-06,
+      "loss": 1.398,
       "step": 4800
     },
     {
       "epoch": 0.0049,
+      "grad_norm": 1.6674290895462036,
+      "learning_rate": 2.4495e-06,
+      "loss": 1.3975,
       "step": 4900
     },
     {
       "epoch": 0.005,
+      "grad_norm": 1.8750745058059692,
+      "learning_rate": 2.4995000000000004e-06,
+      "loss": 1.394,
       "step": 5000
     },
     {
       "epoch": 0.005,
+      "eval_loss": 1.393296480178833,
+      "eval_runtime": 24.0133,
+      "eval_samples_per_second": 208.218,
+      "eval_steps_per_second": 3.29,
       "step": 5000
     },
     {
       "epoch": 0.0051,
+      "grad_norm": 1.6334375143051147,
+      "learning_rate": 2.5495000000000003e-06,
+      "loss": 1.3927,
       "step": 5100
     },
     {
       "epoch": 0.0052,
+      "grad_norm": 1.7477636337280273,
+      "learning_rate": 2.5995000000000003e-06,
+      "loss": 1.398,
       "step": 5200
     },
     {
       "epoch": 0.0053,
+      "grad_norm": 1.6373661756515503,
+      "learning_rate": 2.6495000000000002e-06,
+      "loss": 1.4044,
       "step": 5300
     },
     {
       "epoch": 0.0054,
+      "grad_norm": 1.9190561771392822,
+      "learning_rate": 2.6995000000000006e-06,
+      "loss": 1.4047,
       "step": 5400
     },
     {
       "epoch": 0.0055,
+      "grad_norm": 1.708715796470642,
+      "learning_rate": 2.7495000000000005e-06,
+      "loss": 1.4035,
       "step": 5500
     },
     {
       "epoch": 0.0056,
+      "grad_norm": 1.772215723991394,
+      "learning_rate": 2.7995e-06,
+      "loss": 1.3896,
       "step": 5600
     },
     {
       "epoch": 0.0057,
+      "grad_norm": 1.8555290699005127,
+      "learning_rate": 2.8495e-06,
+      "loss": 1.4112,
       "step": 5700
     },
     {
       "epoch": 0.0058,
+      "grad_norm": 1.8145737648010254,
+      "learning_rate": 2.8995e-06,
+      "loss": 1.4022,
       "step": 5800
     },
     {
       "epoch": 0.0059,
+      "grad_norm": 1.9936249256134033,
+      "learning_rate": 2.9495000000000003e-06,
+      "loss": 1.3897,
       "step": 5900
     },
     {
       "epoch": 0.006,
+      "grad_norm": 1.8583012819290161,
+      "learning_rate": 2.9995000000000003e-06,
+      "loss": 1.3981,
       "step": 6000
     },
     {
       "epoch": 0.006,
+      "eval_loss": 1.4079989194869995,
+      "eval_runtime": 24.0862,
+      "eval_samples_per_second": 207.588,
+      "eval_steps_per_second": 3.28,
       "step": 6000
     },
     {
       "epoch": 0.0061,
+      "grad_norm": 1.9386086463928223,
+      "learning_rate": 3.0495e-06,
+      "loss": 1.3999,
       "step": 6100
     },
     {
       "epoch": 0.0062,
+      "grad_norm": 1.8759804964065552,
+      "learning_rate": 3.0995e-06,
+      "loss": 1.3985,
       "step": 6200
     },
     {
       "epoch": 0.0063,
+      "grad_norm": 1.764291763305664,
+      "learning_rate": 3.1495000000000005e-06,
+      "loss": 1.3913,
       "step": 6300
     },
     {
       "epoch": 0.0064,
+      "grad_norm": 1.7575565576553345,
+      "learning_rate": 3.1995000000000005e-06,
+      "loss": 1.386,
       "step": 6400
     },
     {
       "epoch": 0.0065,
+      "grad_norm": 1.718859314918518,
+      "learning_rate": 3.2495000000000004e-06,
+      "loss": 1.3982,
       "step": 6500
     },
     {
       "epoch": 0.0066,
+      "grad_norm": 1.9030193090438843,
+      "learning_rate": 3.2995000000000003e-06,
+      "loss": 1.4042,
       "step": 6600
     },
     {
       "epoch": 0.0067,
+      "grad_norm": 2.0007975101470947,
+      "learning_rate": 3.3495000000000007e-06,
+      "loss": 1.3914,
       "step": 6700
     },
     {
       "epoch": 0.0068,
+      "grad_norm": 1.9459850788116455,
+      "learning_rate": 3.3995000000000002e-06,
+      "loss": 1.3971,
       "step": 6800
     },
     {
       "epoch": 0.0069,
+      "grad_norm": 1.8755849599838257,
+      "learning_rate": 3.4495e-06,
+      "loss": 1.3866,
       "step": 6900
     },
     {
       "epoch": 0.007,
+      "grad_norm": 1.9434016942977905,
+      "learning_rate": 3.4995e-06,
+      "loss": 1.4039,
       "step": 7000
     },
     {
       "epoch": 0.007,
+      "eval_loss": 1.4000722169876099,
+      "eval_runtime": 24.0943,
+      "eval_samples_per_second": 207.518,
+      "eval_steps_per_second": 3.279,
       "step": 7000
     },
     {
       "epoch": 0.0071,
+      "grad_norm": 1.8206807374954224,
+      "learning_rate": 3.5495e-06,
+      "loss": 1.3962,
       "step": 7100
     },
     {
       "epoch": 0.0072,
+      "grad_norm": 1.8879231214523315,
+      "learning_rate": 3.5995e-06,
+      "loss": 1.3973,
       "step": 7200
     },
     {
       "epoch": 0.0073,
+      "grad_norm": 1.8702274560928345,
+      "learning_rate": 3.6495000000000004e-06,
+      "loss": 1.401,
       "step": 7300
     },
     {
       "epoch": 0.0074,
+      "grad_norm": 1.8962584733963013,
+      "learning_rate": 3.6995000000000003e-06,
+      "loss": 1.4069,
       "step": 7400
     },
     {
       "epoch": 0.0075,
+      "grad_norm": 1.6658656597137451,
+      "learning_rate": 3.7495000000000003e-06,
+      "loss": 1.386,
       "step": 7500
     },
     {
       "epoch": 0.0076,
+      "grad_norm": 1.5985746383666992,
+      "learning_rate": 3.7995000000000002e-06,
+      "loss": 1.387,
       "step": 7600
     },
     {
       "epoch": 0.0077,
+      "grad_norm": 1.8246830701828003,
+      "learning_rate": 3.8495e-06,
+      "loss": 1.3994,
       "step": 7700
     },
     {
       "epoch": 0.0078,
+      "grad_norm": 1.775139570236206,
+      "learning_rate": 3.8995000000000005e-06,
+      "loss": 1.3901,
       "step": 7800
     },
     {
       "epoch": 0.0079,
+      "grad_norm": 1.9915419816970825,
+      "learning_rate": 3.949500000000001e-06,
+      "loss": 1.3979,
       "step": 7900
     },
     {
       "epoch": 0.008,
+      "grad_norm": 1.8422917127609253,
+      "learning_rate": 3.9995e-06,
+      "loss": 1.4017,
       "step": 8000
     },
     {
       "epoch": 0.008,
+      "eval_loss": 1.3942360877990723,
+      "eval_runtime": 24.0764,
+      "eval_samples_per_second": 207.672,
+      "eval_steps_per_second": 3.281,
       "step": 8000
     },
     {
       "epoch": 0.0081,
+      "grad_norm": 1.727967381477356,
+      "learning_rate": 4.0495e-06,
+      "loss": 1.3918,
       "step": 8100
     },
     {
       "epoch": 0.0082,
+      "grad_norm": 1.7638025283813477,
+      "learning_rate": 4.0995e-06,
+      "loss": 1.3989,
       "step": 8200
     },
     {
       "epoch": 0.0083,
+      "grad_norm": 1.8059898614883423,
+      "learning_rate": 4.1495e-06,
+      "loss": 1.3976,
       "step": 8300
     },
     {
       "epoch": 0.0084,
+      "grad_norm": 1.9645200967788696,
+      "learning_rate": 4.1995e-06,
+      "loss": 1.3845,
       "step": 8400
     },
     {
       "epoch": 0.0085,
+      "grad_norm": 1.98493230342865,
+      "learning_rate": 4.2495000000000006e-06,
+      "loss": 1.4068,
       "step": 8500
     },
     {
       "epoch": 0.0086,
+      "grad_norm": 1.8089314699172974,
+      "learning_rate": 4.2995e-06,
+      "loss": 1.4057,
       "step": 8600
     },
     {
       "epoch": 0.0087,
+      "grad_norm": 1.7342643737792969,
+      "learning_rate": 4.3495000000000005e-06,
+      "loss": 1.4077,
       "step": 8700
     },
     {
       "epoch": 0.0088,
+      "grad_norm": 1.7645128965377808,
+      "learning_rate": 4.399500000000001e-06,
+      "loss": 1.3974,
       "step": 8800
     },
     {
       "epoch": 0.0089,
+      "grad_norm": 1.7658684253692627,
+      "learning_rate": 4.4495e-06,
+      "loss": 1.3819,
       "step": 8900
     },
     {
       "epoch": 0.009,
+      "grad_norm": 1.8355954885482788,
+      "learning_rate": 4.499500000000001e-06,
+      "loss": 1.4016,
       "step": 9000
     },
     {
       "epoch": 0.009,
+      "eval_loss": 1.3868581056594849,
+      "eval_runtime": 24.1145,
+      "eval_samples_per_second": 207.344,
+      "eval_steps_per_second": 3.276,
       "step": 9000
     },
     {
       "epoch": 0.0091,
+      "grad_norm": 1.7402777671813965,
+      "learning_rate": 4.5495e-06,
+      "loss": 1.3971,
       "step": 9100
     },
     {
       "epoch": 0.0092,
+      "grad_norm": 2.0781781673431396,
+      "learning_rate": 4.599500000000001e-06,
+      "loss": 1.4013,
       "step": 9200
     },
     {
       "epoch": 0.0093,
+      "grad_norm": 1.8011542558670044,
+      "learning_rate": 4.6495e-06,
+      "loss": 1.406,
       "step": 9300
     },
     {
       "epoch": 0.0094,
+      "grad_norm": 1.9290459156036377,
+      "learning_rate": 4.6995000000000005e-06,
+      "loss": 1.4083,
       "step": 9400
     },
     {
       "epoch": 0.0095,
+      "grad_norm": 1.7394887208938599,
+      "learning_rate": 4.7495e-06,
+      "loss": 1.3898,
       "step": 9500
     },
     {
       "epoch": 0.0096,
+      "grad_norm": 1.845847249031067,
+      "learning_rate": 4.7995e-06,
+      "loss": 1.416,
       "step": 9600
     },
     {
       "epoch": 0.0097,
+      "grad_norm": 1.9406960010528564,
+      "learning_rate": 4.8495e-06,
+      "loss": 1.3893,
       "step": 9700
     },
     {
       "epoch": 0.0098,
+      "grad_norm": 1.7145395278930664,
+      "learning_rate": 4.8995e-06,
+      "loss": 1.394,
       "step": 9800
     },
     {
       "epoch": 0.0099,
+      "grad_norm": 1.748119831085205,
+      "learning_rate": 4.949500000000001e-06,
+      "loss": 1.4016,
       "step": 9900
     },
     {
       "epoch": 0.01,
+      "grad_norm": 1.7311230897903442,
+      "learning_rate": 4.9995e-06,
+      "loss": 1.3965,
       "step": 10000
     },
     {
       "epoch": 0.01,
+      "eval_loss": 1.408516764640808,
+      "eval_runtime": 24.1385,
+      "eval_samples_per_second": 207.138,
+      "eval_steps_per_second": 3.273,
       "step": 10000
     },
     {
       "epoch": 0.0101,
+      "grad_norm": 1.8347134590148926,
+      "learning_rate": 4.999999876629946e-06,
+      "loss": 1.4036,
       "step": 10100
     },
     {
       "epoch": 0.0102,
+      "grad_norm": 1.7432448863983154,
+      "learning_rate": 4.999999501522561e-06,
+      "loss": 1.3914,
       "step": 10200
     },
     {
       "epoch": 0.0103,
+      "grad_norm": 1.796655297279358,
+      "learning_rate": 4.999998874665294e-06,
+      "loss": 1.397,
       "step": 10300
     },
     {
       "epoch": 0.0104,
+      "grad_norm": 1.7895489931106567,
+      "learning_rate": 4.999997996058208e-06,
+      "loss": 1.3876,
       "step": 10400
     },
     {
       "epoch": 0.0105,
+      "grad_norm": 1.865824580192566,
+      "learning_rate": 4.999996865701393e-06,
+      "loss": 1.4007,
       "step": 10500
     },
     {
       "epoch": 0.0106,
+      "grad_norm": 1.9039396047592163,
+      "learning_rate": 4.999995483594962e-06,
+      "loss": 1.3926,
       "step": 10600
     },
     {
       "epoch": 0.0107,
+      "grad_norm": 1.7815436124801636,
+      "learning_rate": 4.9999938497390545e-06,
+      "loss": 1.4058,
       "step": 10700
     },
     {
       "epoch": 0.0108,
+      "grad_norm": 1.8762564659118652,
+      "learning_rate": 4.999991964133834e-06,
+      "loss": 1.4,
       "step": 10800
     },
     {
       "epoch": 0.0109,
+      "grad_norm": 1.8858652114868164,
+      "learning_rate": 4.999989826779491e-06,
+      "loss": 1.3878,
       "step": 10900
     },
     {
       "epoch": 0.011,
+      "grad_norm": 1.6531654596328735,
+      "learning_rate": 4.999987437676241e-06,
+      "loss": 1.3956,
       "step": 11000
     },
     {
       "epoch": 0.011,
+      "eval_loss": 1.3868032693862915,
+      "eval_runtime": 24.1662,
+      "eval_samples_per_second": 206.901,
+      "eval_steps_per_second": 3.269,
       "step": 11000
     },
     {
       "epoch": 0.0111,
+      "grad_norm": 1.8017290830612183,
+      "learning_rate": 4.999984796824326e-06,
+      "loss": 1.3955,
       "step": 11100
     },
     {
       "epoch": 0.0112,
+      "grad_norm": 1.6868501901626587,
+      "learning_rate": 4.999981904224008e-06,
+      "loss": 1.3991,
       "step": 11200
     },
     {
       "epoch": 0.0113,
+      "grad_norm": 1.729891061782837,
+      "learning_rate": 4.999978759875582e-06,
+      "loss": 1.3921,
       "step": 11300
     },
     {
       "epoch": 0.0114,
+      "grad_norm": 1.7264313697814941,
+      "learning_rate": 4.9999753637793636e-06,
+      "loss": 1.4006,
       "step": 11400
     },
     {
       "epoch": 0.0115,
+      "grad_norm": 1.9104857444763184,
+      "learning_rate": 4.999971715935694e-06,
+      "loss": 1.3952,
       "step": 11500
     },
     {
       "epoch": 0.0116,
+      "grad_norm": 1.970773458480835,
+      "learning_rate": 4.99996781634494e-06,
+      "loss": 1.4001,
       "step": 11600
     },
     {
       "epoch": 0.0117,
+      "grad_norm": 1.7760286331176758,
+      "learning_rate": 4.9999636650074965e-06,
+      "loss": 1.3916,
       "step": 11700
     },
     {
       "epoch": 0.0118,
+      "grad_norm": 1.8842190504074097,
+      "learning_rate": 4.99995926192378e-06,
+      "loss": 1.3913,
       "step": 11800
     },
     {
       "epoch": 0.0119,
+      "grad_norm": 1.7593873739242554,
+      "learning_rate": 4.999954607094235e-06,
+      "loss": 1.3962,
       "step": 11900
     },
     {
       "epoch": 0.012,
+      "grad_norm": 1.6571648120880127,
+      "learning_rate": 4.999949700519328e-06,
+      "loss": 1.3931,
       "step": 12000
     },
     {
       "epoch": 0.012,
+      "eval_loss": 1.3871841430664062,
+      "eval_runtime": 24.1968,
+      "eval_samples_per_second": 206.639,
+      "eval_steps_per_second": 3.265,
       "step": 12000
     },
     {
       "epoch": 0.0121,
+      "grad_norm": 1.843850016593933,
+      "learning_rate": 4.999944542199555e-06,
+      "loss": 1.3916,
       "step": 12100
     },
     {
       "epoch": 0.0122,
+      "grad_norm": 1.7486441135406494,
+      "learning_rate": 4.999939132135436e-06,
+      "loss": 1.3958,
       "step": 12200
     },
     {
       "epoch": 0.0123,
+      "grad_norm": 1.875173568725586,
+      "learning_rate": 4.999933470327514e-06,
+      "loss": 1.3944,
       "step": 12300
     },
     {
       "epoch": 0.0124,
+      "grad_norm": 1.9309207201004028,
+      "learning_rate": 4.99992755677636e-06,
+      "loss": 1.4007,
       "step": 12400
     },
     {
       "epoch": 0.0125,
+      "grad_norm": 1.6403874158859253,
+      "learning_rate": 4.9999213914825695e-06,
+      "loss": 1.3969,
       "step": 12500
     },
     {
       "epoch": 0.0126,
+      "grad_norm": 1.9816149473190308,
+      "learning_rate": 4.999914974446764e-06,
+      "loss": 1.3758,
       "step": 12600
     },
     {
       "epoch": 0.0127,
+      "grad_norm": 1.7469569444656372,
+      "learning_rate": 4.999908305669587e-06,
+      "loss": 1.3926,
       "step": 12700
     },
     {
       "epoch": 0.0128,
+      "grad_norm": 1.818030595779419,
+      "learning_rate": 4.999901385151713e-06,
+      "loss": 1.3845,
       "step": 12800
     },
     {
       "epoch": 0.0129,
+      "grad_norm": 1.8535512685775757,
+      "learning_rate": 4.999894212893839e-06,
+      "loss": 1.4021,
       "step": 12900
     },
     {
       "epoch": 0.013,
+      "grad_norm": 1.8381346464157104,
+      "learning_rate": 4.999886788896685e-06,
+      "loss": 1.3958,
       "step": 13000
     },
     {
       "epoch": 0.013,
+      "eval_loss": 1.3848538398742676,
+      "eval_runtime": 24.2052,
+      "eval_samples_per_second": 206.568,
+      "eval_steps_per_second": 3.264,
       "step": 13000
     },
     {
       "epoch": 0.0131,
+      "grad_norm": 1.9233248233795166,
+      "learning_rate": 4.999879113161e-06,
+      "loss": 1.3985,
       "step": 13100
     },
     {
       "epoch": 0.0132,
+      "grad_norm": 1.8300249576568604,
+      "learning_rate": 4.999871185687557e-06,
+      "loss": 1.3855,
       "step": 13200
     },
     {
       "epoch": 0.0133,
+      "grad_norm": 1.789095163345337,
+      "learning_rate": 4.999863006477154e-06,
+      "loss": 1.3905,
       "step": 13300
     },
     {
       "epoch": 0.0134,
+      "grad_norm": 1.7962862253189087,
+      "learning_rate": 4.999854575530614e-06,
+      "loss": 1.4135,
       "step": 13400
     },
     {
       "epoch": 0.0135,
+      "grad_norm": 1.8320449590682983,
+      "learning_rate": 4.999845892848787e-06,
+      "loss": 1.4024,
       "step": 13500
     },
     {
       "epoch": 0.0136,
+      "grad_norm": 1.848564863204956,
+      "learning_rate": 4.999836958432547e-06,
+      "loss": 1.4067,
       "step": 13600
     },
     {
       "epoch": 0.0137,
+      "grad_norm": 1.8709757328033447,
+      "learning_rate": 4.999827772282793e-06,
+      "loss": 1.3817,
       "step": 13700
     },
     {
       "epoch": 0.0138,
+      "grad_norm": 1.8266668319702148,
+      "learning_rate": 4.999818334400452e-06,
+      "loss": 1.3838,
       "step": 13800
     },
     {
       "epoch": 0.0139,
+      "grad_norm": 1.8056071996688843,
+      "learning_rate": 4.999808644786473e-06,
+      "loss": 1.3817,
       "step": 13900
     },
     {
       "epoch": 0.014,
+      "grad_norm": 1.703490138053894,
+      "learning_rate": 4.999798703441832e-06,
+      "loss": 1.3996,
       "step": 14000
     },
     {
       "epoch": 0.014,
+      "eval_loss": 1.3908612728118896,
+      "eval_runtime": 24.2804,
+      "eval_samples_per_second": 205.927,
+      "eval_steps_per_second": 3.254,
       "step": 14000
     },
     {
       "epoch": 0.0141,
+      "grad_norm": 1.6894255876541138,
+      "learning_rate": 4.999788510367529e-06,
+      "loss": 1.4134,
       "step": 14100
     },
     {
       "epoch": 0.0142,
+      "grad_norm": 1.7173762321472168,
+      "learning_rate": 4.999778065564592e-06,
+      "loss": 1.4046,
       "step": 14200
     },
     {
       "epoch": 0.0143,
+      "grad_norm": 1.7411465644836426,
+      "learning_rate": 4.999767369034072e-06,
+      "loss": 1.4058,
       "step": 14300
     },
     {
       "epoch": 0.0144,
+      "grad_norm": 1.9577834606170654,
+      "learning_rate": 4.999756420777047e-06,
+      "loss": 1.4061,
       "step": 14400
     },
     {
       "epoch": 0.0145,
+      "grad_norm": 1.8880082368850708,
+      "learning_rate": 4.999745220794618e-06,
+      "loss": 1.4008,
       "step": 14500
     },
     {
       "epoch": 0.0146,
+      "grad_norm": 1.7882862091064453,
+      "learning_rate": 4.999733769087913e-06,
+      "loss": 1.3991,
       "step": 14600
     },
     {
       "epoch": 0.0147,
+      "grad_norm": 1.8364261388778687,
+      "learning_rate": 4.999722065658087e-06,
+      "loss": 1.3999,
       "step": 14700
     },
     {
       "epoch": 0.0148,
+      "grad_norm": 1.8236931562423706,
+      "learning_rate": 4.9997101105063175e-06,
+      "loss": 1.4065,
       "step": 14800
     },
     {
       "epoch": 0.0149,
+      "grad_norm": 1.8798736333847046,
+      "learning_rate": 4.999697903633808e-06,
+      "loss": 1.4183,
       "step": 14900
     },
     {
       "epoch": 0.015,
+      "grad_norm": 1.9224145412445068,
+      "learning_rate": 4.999685445041788e-06,
+      "loss": 1.4229,
       "step": 15000
     },
     {
       "epoch": 0.015,
+      "eval_loss": 1.3955188989639282,
+      "eval_runtime": 24.2626,
+      "eval_samples_per_second": 206.079,
+      "eval_steps_per_second": 3.256,
       "step": 15000
+    },
+    {
+      "epoch": 0.0151,
+      "grad_norm": 1.8474870920181274,
+      "learning_rate": 4.999672734731511e-06,
+      "loss": 1.4072,
+      "step": 15100
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 1.7506942749023438,
+      "learning_rate": 4.9996597727042596e-06,
+      "loss": 1.4015,
+      "step": 15200
+    },
+    {
+      "epoch": 0.0153,
+      "grad_norm": 1.9011894464492798,
+      "learning_rate": 4.999646558961337e-06,
+      "loss": 1.4073,
+      "step": 15300
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 1.809842586517334,
+      "learning_rate": 4.999633093504074e-06,
+      "loss": 1.4026,
+      "step": 15400
+    },
+    {
+      "epoch": 0.0155,
+      "grad_norm": 1.9759999513626099,
+      "learning_rate": 4.999619376333827e-06,
+      "loss": 1.4043,
+      "step": 15500
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 1.892358422279358,
+      "learning_rate": 4.999605407451977e-06,
+      "loss": 1.4081,
+      "step": 15600
+    },
+    {
+      "epoch": 0.0157,
+      "grad_norm": 1.8095463514328003,
+      "learning_rate": 4.999591186859931e-06,
+      "loss": 1.3961,
+      "step": 15700
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 1.7918623685836792,
+      "learning_rate": 4.999576714559121e-06,
+      "loss": 1.4043,
+      "step": 15800
+    },
+    {
+      "epoch": 0.0159,
+      "grad_norm": 1.7738561630249023,
+      "learning_rate": 4.999561990551004e-06,
+      "loss": 1.3977,
+      "step": 15900
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.9365230798721313,
+      "learning_rate": 4.999547014837064e-06,
+      "loss": 1.3994,
+      "step": 16000
+    },
+    {
+      "epoch": 0.016,
+      "eval_loss": 1.3911411762237549,
+      "eval_runtime": 24.3275,
+      "eval_samples_per_second": 205.529,
+      "eval_steps_per_second": 3.247,
+      "step": 16000
+    },
+    {
+      "epoch": 0.0161,
+      "grad_norm": 1.869052767753601,
+      "learning_rate": 4.9995317874188065e-06,
+      "loss": 1.4078,
+      "step": 16100
+    },
+    {
+      "epoch": 0.0162,
+      "grad_norm": 1.9397051334381104,
+      "learning_rate": 4.999516308297767e-06,
+      "loss": 1.4029,
+      "step": 16200
+    },
+    {
+      "epoch": 0.0163,
+      "grad_norm": 1.6483432054519653,
+      "learning_rate": 4.999500577475504e-06,
+      "loss": 1.3943,
+      "step": 16300
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 2.034252166748047,
+      "learning_rate": 4.999484594953601e-06,
+      "loss": 1.4213,
+      "step": 16400
+    },
+    {
+      "epoch": 0.0165,
+      "grad_norm": 1.737054705619812,
+      "learning_rate": 4.9994683607336675e-06,
+      "loss": 1.397,
+      "step": 16500
+    },
+    {
+      "epoch": 0.0166,
+      "grad_norm": 1.789985179901123,
+      "learning_rate": 4.999451874817338e-06,
+      "loss": 1.3983,
+      "step": 16600
+    },
+    {
+      "epoch": 0.0167,
+      "grad_norm": 1.7947198152542114,
+      "learning_rate": 4.999435137206274e-06,
+      "loss": 1.4063,
+      "step": 16700
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 1.9025126695632935,
+      "learning_rate": 4.999418147902159e-06,
+      "loss": 1.4006,
+      "step": 16800
+    },
+    {
+      "epoch": 0.0169,
+      "grad_norm": 1.7516543865203857,
+      "learning_rate": 4.999400906906707e-06,
+      "loss": 1.4054,
+      "step": 16900
+    },
+    {
+      "epoch": 0.017,
+      "grad_norm": 1.6800352334976196,
+      "learning_rate": 4.99938341422165e-06,
+      "loss": 1.3978,
+      "step": 17000
+    },
+    {
+      "epoch": 0.017,
+      "eval_loss": 1.385777473449707,
+      "eval_runtime": 24.2933,
+      "eval_samples_per_second": 205.818,
+      "eval_steps_per_second": 3.252,
+      "step": 17000
+    },
+    {
+      "epoch": 0.0171,
+      "grad_norm": 1.778942584991455,
+      "learning_rate": 4.999365669848752e-06,
+      "loss": 1.3938,
+      "step": 17100
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 1.8309190273284912,
+      "learning_rate": 4.999347673789801e-06,
+      "loss": 1.4017,
+      "step": 17200
+    },
+    {
+      "epoch": 0.0173,
+      "grad_norm": 1.8281348943710327,
+      "learning_rate": 4.999329426046606e-06,
+      "loss": 1.3959,
+      "step": 17300
+    },
+    {
+      "epoch": 0.0174,
+      "grad_norm": 1.8240395784378052,
+      "learning_rate": 4.999310926621006e-06,
+      "loss": 1.41,
+      "step": 17400
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 2.054227590560913,
+      "learning_rate": 4.9992921755148646e-06,
+      "loss": 1.3994,
+      "step": 17500
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 1.9895967245101929,
+      "learning_rate": 4.99927317273007e-06,
+      "loss": 1.3991,
+      "step": 17600
+    },
+    {
+      "epoch": 0.0177,
+      "grad_norm": 1.7590620517730713,
+      "learning_rate": 4.9992539182685345e-06,
+      "loss": 1.4181,
+      "step": 17700
+    },
+    {
+      "epoch": 0.0178,
+      "grad_norm": 1.8000540733337402,
+      "learning_rate": 4.9992344121321975e-06,
+      "loss": 1.397,
+      "step": 17800
+    },
+    {
+      "epoch": 0.0179,
+      "grad_norm": 1.689989447593689,
+      "learning_rate": 4.999214654323025e-06,
+      "loss": 1.4146,
+      "step": 17900
+    },
+    {
+      "epoch": 0.018,
+      "grad_norm": 1.9092588424682617,
+      "learning_rate": 4.999194644843004e-06,
+      "loss": 1.4044,
+      "step": 18000
+    },
+    {
+      "epoch": 0.018,
+      "eval_loss": 1.3936209678649902,
+      "eval_runtime": 25.2802,
+      "eval_samples_per_second": 197.783,
+      "eval_steps_per_second": 3.125,
+      "step": 18000
+    },
+    {
+      "epoch": 0.0181,
+      "grad_norm": 1.7824491262435913,
+      "learning_rate": 4.999174383694151e-06,
+      "loss": 1.401,
+      "step": 18100
+    },
+    {
+      "epoch": 0.0182,
+      "grad_norm": 1.9961062669754028,
+      "learning_rate": 4.999153870878506e-06,
+      "loss": 1.3982,
+      "step": 18200
+    },
+    {
+      "epoch": 0.0183,
+      "grad_norm": 1.8174325227737427,
+      "learning_rate": 4.999133106398135e-06,
+      "loss": 1.4062,
+      "step": 18300
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 1.807077169418335,
+      "learning_rate": 4.999112090255129e-06,
+      "loss": 1.4022,
+      "step": 18400
+    },
+    {
+      "epoch": 0.0185,
+      "grad_norm": 1.8445507287979126,
+      "learning_rate": 4.9990908224516025e-06,
+      "loss": 1.3893,
+      "step": 18500
+    },
+    {
+      "epoch": 0.0186,
+      "grad_norm": 1.9006763696670532,
+      "learning_rate": 4.999069302989699e-06,
+      "loss": 1.3918,
+      "step": 18600
+    },
+    {
+      "epoch": 0.0187,
+      "grad_norm": 1.981102705001831,
+      "learning_rate": 4.999047531871585e-06,
+      "loss": 1.4038,
+      "step": 18700
+    },
+    {
+      "epoch": 0.0188,
+      "grad_norm": 1.8729362487792969,
+      "learning_rate": 4.9990255090994535e-06,
+      "loss": 1.4021,
+      "step": 18800
+    },
+    {
+      "epoch": 0.0189,
+      "grad_norm": 1.9796510934829712,
+      "learning_rate": 4.999003234675521e-06,
+      "loss": 1.3972,
+      "step": 18900
+    },
+    {
+      "epoch": 0.019,
+      "grad_norm": 1.8760383129119873,
+      "learning_rate": 4.998980708602031e-06,
+      "loss": 1.3998,
+      "step": 19000
+    },
+    {
+      "epoch": 0.019,
+      "eval_loss": 1.3851877450942993,
+      "eval_runtime": 24.3386,
+      "eval_samples_per_second": 205.435,
+      "eval_steps_per_second": 3.246,
+      "step": 19000
+    },
+    {
+      "epoch": 0.0191,
+      "grad_norm": 1.78969144821167,
+      "learning_rate": 4.998957930881253e-06,
+      "loss": 1.3957,
+      "step": 19100
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.8471328020095825,
+      "learning_rate": 4.998934901515479e-06,
+      "loss": 1.4144,
+      "step": 19200
+    },
+    {
+      "epoch": 0.0193,
+      "grad_norm": 1.8064422607421875,
+      "learning_rate": 4.998911620507029e-06,
+      "loss": 1.3871,
+      "step": 19300
+    },
+    {
+      "epoch": 0.0194,
+      "grad_norm": 1.900911569595337,
+      "learning_rate": 4.998888087858246e-06,
+      "loss": 1.3946,
+      "step": 19400
+    },
+    {
+      "epoch": 0.0195,
+      "grad_norm": 1.7808403968811035,
+      "learning_rate": 4.998864303571502e-06,
+      "loss": 1.394,
+      "step": 19500
+    },
+    {
+      "epoch": 0.0196,
+      "grad_norm": 1.7433531284332275,
+      "learning_rate": 4.998840267649191e-06,
+      "loss": 1.3916,
+      "step": 19600
+    },
+    {
+      "epoch": 0.0197,
+      "grad_norm": 1.9526238441467285,
+      "learning_rate": 4.998815980093734e-06,
+      "loss": 1.3901,
+      "step": 19700
+    },
+    {
+      "epoch": 0.0198,
+      "grad_norm": 1.6053208112716675,
+      "learning_rate": 4.998791440907575e-06,
+      "loss": 1.4086,
+      "step": 19800
+    },
+    {
+      "epoch": 0.0199,
+      "grad_norm": 1.9025638103485107,
+      "learning_rate": 4.9987666500931874e-06,
+      "loss": 1.4041,
+      "step": 19900
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.7806073427200317,
+      "learning_rate": 4.998741607653066e-06,
+      "loss": 1.3889,
+      "step": 20000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 1.3910343647003174,
+      "eval_runtime": 24.3678,
+      "eval_samples_per_second": 205.189,
+      "eval_steps_per_second": 3.242,
+      "step": 20000
+    },
+    {
+      "epoch": 0.0201,
+      "grad_norm": 1.8703974485397339,
+      "learning_rate": 4.9987163135897334e-06,
+      "loss": 1.3995,
+      "step": 20100
+    },
+    {
+      "epoch": 0.0202,
+      "grad_norm": 1.8790905475616455,
+      "learning_rate": 4.998690767905736e-06,
+      "loss": 1.4063,
+      "step": 20200
+    },
+    {
+      "epoch": 0.0203,
+      "grad_norm": 1.9134712219238281,
+      "learning_rate": 4.998664970603646e-06,
+      "loss": 1.4038,
+      "step": 20300
+    },
+    {
+      "epoch": 0.0204,
+      "grad_norm": 1.8452473878860474,
+      "learning_rate": 4.998638921686063e-06,
+      "loss": 1.4035,
+      "step": 20400
+    },
+    {
+      "epoch": 0.0205,
+      "grad_norm": 1.8248049020767212,
+      "learning_rate": 4.998612621155608e-06,
+      "loss": 1.4003,
+      "step": 20500
+    },
+    {
+      "epoch": 0.0206,
+      "grad_norm": 1.8242034912109375,
+      "learning_rate": 4.9985860690149316e-06,
+      "loss": 1.3927,
+      "step": 20600
+    },
+    {
+      "epoch": 0.0207,
+      "grad_norm": 1.9893760681152344,
+      "learning_rate": 4.998559265266705e-06,
+      "loss": 1.4023,
+      "step": 20700
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 1.9086755514144897,
+      "learning_rate": 4.99853220991363e-06,
+      "loss": 1.4064,
+      "step": 20800
+    },
+    {
+      "epoch": 0.0209,
+      "grad_norm": 1.9896847009658813,
+      "learning_rate": 4.998504902958429e-06,
+      "loss": 1.3995,
+      "step": 20900
+    },
+    {
+      "epoch": 0.021,
+      "grad_norm": 1.9010740518569946,
+      "learning_rate": 4.998477344403852e-06,
+      "loss": 1.4031,
+      "step": 21000
+    },
+    {
+      "epoch": 0.021,
+      "eval_loss": 1.3773479461669922,
+      "eval_runtime": 24.4357,
+      "eval_samples_per_second": 204.619,
+      "eval_steps_per_second": 3.233,
+      "step": 21000
+    },
+    {
+      "epoch": 0.0211,
+      "grad_norm": 1.8723328113555908,
+      "learning_rate": 4.9984495342526765e-06,
+      "loss": 1.4064,
+      "step": 21100
+    },
+    {
+      "epoch": 0.0212,
+      "grad_norm": 1.9470596313476562,
+      "learning_rate": 4.998421472507701e-06,
+      "loss": 1.3928,
+      "step": 21200
+    },
+    {
+      "epoch": 0.0213,
+      "grad_norm": 1.7989906072616577,
+      "learning_rate": 4.998393159171751e-06,
+      "loss": 1.3981,
+      "step": 21300
+    },
+    {
+      "epoch": 0.0214,
+      "grad_norm": 1.8689730167388916,
+      "learning_rate": 4.998364594247678e-06,
+      "loss": 1.3938,
+      "step": 21400
+    },
+    {
+      "epoch": 0.0215,
+      "grad_norm": 1.9032299518585205,
+      "learning_rate": 4.998335777738359e-06,
+      "loss": 1.3947,
+      "step": 21500
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 1.911832332611084,
+      "learning_rate": 4.998306709646695e-06,
+      "loss": 1.3973,
+      "step": 21600
+    },
+    {
+      "epoch": 0.0217,
+      "grad_norm": 1.9614136219024658,
+      "learning_rate": 4.998277389975615e-06,
+      "loss": 1.3953,
+      "step": 21700
+    },
+    {
+      "epoch": 0.0218,
+      "grad_norm": 1.9598075151443481,
+      "learning_rate": 4.998247818728069e-06,
+      "loss": 1.3869,
+      "step": 21800
+    },
+    {
+      "epoch": 0.0219,
+      "grad_norm": 1.9342235326766968,
+      "learning_rate": 4.998217995907037e-06,
+      "loss": 1.3979,
+      "step": 21900
+    },
+    {
+      "epoch": 0.022,
+      "grad_norm": 1.740506887435913,
+      "learning_rate": 4.998187921515521e-06,
+      "loss": 1.3896,
+      "step": 22000
+    },
+    {
+      "epoch": 0.022,
+      "eval_loss": 1.4018659591674805,
+      "eval_runtime": 24.3975,
+      "eval_samples_per_second": 204.939,
+      "eval_steps_per_second": 3.238,
+      "step": 22000
+    },
+    {
+      "epoch": 0.0221,
+      "grad_norm": 1.7357391119003296,
+      "learning_rate": 4.998157595556548e-06,
+      "loss": 1.3981,
+      "step": 22100
+    },
+    {
+      "epoch": 0.0222,
+      "grad_norm": 1.7457457780838013,
+      "learning_rate": 4.998127018033176e-06,
+      "loss": 1.4057,
+      "step": 22200
+    },
+    {
+      "epoch": 0.0223,
+      "grad_norm": 1.8668625354766846,
+      "learning_rate": 4.998096188948479e-06,
+      "loss": 1.4009,
+      "step": 22300
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.9003961086273193,
+      "learning_rate": 4.998065108305567e-06,
+      "loss": 1.3957,
+      "step": 22400
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 1.7045166492462158,
+      "learning_rate": 4.998033776107565e-06,
+      "loss": 1.3831,
+      "step": 22500
+    },
+    {
+      "epoch": 0.0226,
+      "grad_norm": 1.7204500436782837,
+      "learning_rate": 4.99800219235763e-06,
+      "loss": 1.393,
+      "step": 22600
+    },
+    {
+      "epoch": 0.0227,
+      "grad_norm": 1.8276758193969727,
+      "learning_rate": 4.9979703570589435e-06,
+      "loss": 1.3911,
+      "step": 22700
+    },
+    {
+      "epoch": 0.0228,
+      "grad_norm": 1.851925253868103,
+      "learning_rate": 4.9979382702147095e-06,
+      "loss": 1.3919,
+      "step": 22800
+    },
+    {
+      "epoch": 0.0229,
+      "grad_norm": 1.9704827070236206,
+      "learning_rate": 4.997905931828161e-06,
+      "loss": 1.3913,
+      "step": 22900
+    },
+    {
+      "epoch": 0.023,
+      "grad_norm": 1.9040639400482178,
+      "learning_rate": 4.997873341902552e-06,
+      "loss": 1.3898,
+      "step": 23000
+    },
+    {
+      "epoch": 0.023,
+      "eval_loss": 1.3905709981918335,
+      "eval_runtime": 24.4036,
+      "eval_samples_per_second": 204.888,
+      "eval_steps_per_second": 3.237,
+      "step": 23000
+    },
+    {
+      "epoch": 0.0231,
+      "grad_norm": 1.64853835105896,
+      "learning_rate": 4.9978405004411676e-06,
+      "loss": 1.3999,
+      "step": 23100
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 1.680251955986023,
+      "learning_rate": 4.997807407447312e-06,
+      "loss": 1.3959,
+      "step": 23200
+    },
+    {
+      "epoch": 0.0233,
+      "grad_norm": 1.79887056350708,
+      "learning_rate": 4.99777406292432e-06,
+      "loss": 1.3984,
+      "step": 23300
+    },
+    {
+      "epoch": 0.0234,
+      "grad_norm": 1.9963394403457642,
+      "learning_rate": 4.997740466875547e-06,
+      "loss": 1.3785,
+      "step": 23400
+    },
+    {
+      "epoch": 0.0235,
+      "grad_norm": 1.934024691581726,
+      "learning_rate": 4.997706619304378e-06,
+      "loss": 1.3886,
+      "step": 23500
+    },
+    {
+      "epoch": 0.0236,
+      "grad_norm": 1.8343491554260254,
+      "learning_rate": 4.9976725202142204e-06,
+      "loss": 1.3928,
+      "step": 23600
+    },
+    {
+      "epoch": 0.0237,
+      "grad_norm": 1.7215723991394043,
+      "learning_rate": 4.9976381696085086e-06,
+      "loss": 1.3933,
+      "step": 23700
+    },
+    {
+      "epoch": 0.0238,
+      "grad_norm": 1.8704638481140137,
+      "learning_rate": 4.997603567490702e-06,
+      "loss": 1.39,
+      "step": 23800
+    },
+    {
+      "epoch": 0.0239,
+      "grad_norm": 2.0038602352142334,
+      "learning_rate": 4.997568713864283e-06,
+      "loss": 1.3948,
+      "step": 23900
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.8034144639968872,
+      "learning_rate": 4.997533608732764e-06,
+      "loss": 1.4015,
+      "step": 24000
+    },
+    {
+      "epoch": 0.024,
+      "eval_loss": 1.404091477394104,
+      "eval_runtime": 24.4038,
+      "eval_samples_per_second": 204.886,
+      "eval_steps_per_second": 3.237,
+      "step": 24000
+    },
+    {
+      "epoch": 0.0241,
+      "grad_norm": 1.8736565113067627,
+      "learning_rate": 4.997498252099678e-06,
+      "loss": 1.3984,
+      "step": 24100
+    },
+    {
+      "epoch": 0.0242,
+      "grad_norm": 1.9209901094436646,
+      "learning_rate": 4.997462643968588e-06,
+      "loss": 1.3909,
+      "step": 24200
+    },
+    {
+      "epoch": 0.0243,
+      "grad_norm": 1.8445130586624146,
+      "learning_rate": 4.997426784343077e-06,
+      "loss": 1.403,
+      "step": 24300
+    },
+    {
+      "epoch": 0.0244,
+      "grad_norm": 1.8695659637451172,
+      "learning_rate": 4.997390673226758e-06,
+      "loss": 1.3849,
+      "step": 24400
+    },
+    {
+      "epoch": 0.0245,
+      "grad_norm": 1.918107032775879,
+      "learning_rate": 4.997354310623265e-06,
+      "loss": 1.3781,
+      "step": 24500
+    },
+    {
+      "epoch": 0.0246,
+      "grad_norm": 1.6449661254882812,
+      "learning_rate": 4.997317696536262e-06,
+      "loss": 1.4057,
+      "step": 24600
+    },
+    {
+      "epoch": 0.0247,
+      "grad_norm": 1.8361092805862427,
+      "learning_rate": 4.997280830969436e-06,
+      "loss": 1.3951,
+      "step": 24700
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 2.0154218673706055,
+      "learning_rate": 4.997243713926497e-06,
+      "loss": 1.3975,
+      "step": 24800
+    },
+    {
+      "epoch": 0.0249,
+      "grad_norm": 1.8617228269577026,
+      "learning_rate": 4.997206345411185e-06,
+      "loss": 1.3828,
+      "step": 24900
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 1.8779343366622925,
+      "learning_rate": 4.997168725427263e-06,
+      "loss": 1.3982,
+      "step": 25000
+    },
+    {
+      "epoch": 0.025,
+      "eval_loss": 1.3792742490768433,
+      "eval_runtime": 24.4701,
+      "eval_samples_per_second": 204.331,
+      "eval_steps_per_second": 3.228,
+      "step": 25000
+    },
+    {
+      "epoch": 0.0251,
+      "grad_norm": 1.9437707662582397,
+      "learning_rate": 4.997130853978519e-06,
+      "loss": 1.3881,
+      "step": 25100
+    },
+    {
+      "epoch": 0.0252,
+      "grad_norm": 1.8795158863067627,
+      "learning_rate": 4.9970927310687655e-06,
+      "loss": 1.3975,
+      "step": 25200
+    },
+    {
+      "epoch": 0.0253,
+      "grad_norm": 1.732364535331726,
+      "learning_rate": 4.997054356701842e-06,
+      "loss": 1.3951,
+      "step": 25300
+    },
+    {
+      "epoch": 0.0254,
+      "grad_norm": 1.7434965372085571,
+      "learning_rate": 4.997015730881614e-06,
+      "loss": 1.3864,
+      "step": 25400
+    },
+    {
+      "epoch": 0.0255,
+      "grad_norm": 1.9448522329330444,
+      "learning_rate": 4.99697685361197e-06,
+      "loss": 1.3962,
+      "step": 25500
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.7149606943130493,
+      "learning_rate": 4.9969377248968245e-06,
+      "loss": 1.395,
+      "step": 25600
+    },
+    {
+      "epoch": 0.0257,
+      "grad_norm": 1.8136606216430664,
+      "learning_rate": 4.996898344740119e-06,
+      "loss": 1.3984,
+      "step": 25700
+    },
+    {
+      "epoch": 0.0258,
+      "grad_norm": 1.8528047800064087,
+      "learning_rate": 4.9968587131458184e-06,
+      "loss": 1.4007,
+      "step": 25800
+    },
+    {
+      "epoch": 0.0259,
+      "grad_norm": 1.781982660293579,
+      "learning_rate": 4.996818830117914e-06,
+      "loss": 1.3933,
+      "step": 25900
+    },
+    {
+      "epoch": 0.026,
+      "grad_norm": 1.777457356452942,
+      "learning_rate": 4.996778695660421e-06,
+      "loss": 1.3912,
+      "step": 26000
+    },
+    {
+      "epoch": 0.026,
+      "eval_loss": 1.3937982320785522,
+      "eval_runtime": 24.4424,
+      "eval_samples_per_second": 204.562,
+      "eval_steps_per_second": 3.232,
+      "step": 26000
+    },
+    {
+      "epoch": 0.0261,
+      "grad_norm": 1.9622981548309326,
+      "learning_rate": 4.996738309777382e-06,
+      "loss": 1.4005,
+      "step": 26100
+    },
+    {
+      "epoch": 0.0262,
+      "grad_norm": 1.8607321977615356,
+      "learning_rate": 4.996697672472864e-06,
+      "loss": 1.4014,
+      "step": 26200
+    },
+    {
+      "epoch": 0.0263,
+      "grad_norm": 2.0541224479675293,
+      "learning_rate": 4.996656783750959e-06,
+      "loss": 1.3966,
+      "step": 26300
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 1.9274111986160278,
+      "learning_rate": 4.996615643615783e-06,
+      "loss": 1.3961,
+      "step": 26400
+    },
+    {
+      "epoch": 0.0265,
+      "grad_norm": 1.861632227897644,
+      "learning_rate": 4.99657425207148e-06,
+      "loss": 1.393,
+      "step": 26500
+    },
+    {
+      "epoch": 0.0266,
+      "grad_norm": 1.870938777923584,
+      "learning_rate": 4.996532609122219e-06,
+      "loss": 1.3987,
+      "step": 26600
+    },
+    {
+      "epoch": 0.0267,
+      "grad_norm": 1.949152946472168,
+      "learning_rate": 4.996490714772192e-06,
+      "loss": 1.3972,
+      "step": 26700
+    },
+    {
+      "epoch": 0.0268,
+      "grad_norm": 1.7645965814590454,
+      "learning_rate": 4.996448569025618e-06,
+      "loss": 1.3917,
+      "step": 26800
+    },
+    {
+      "epoch": 0.0269,
+      "grad_norm": 1.955647587776184,
+      "learning_rate": 4.9964061718867425e-06,
+      "loss": 1.406,
+      "step": 26900
+    },
+    {
+      "epoch": 0.027,
+      "grad_norm": 1.886497974395752,
+      "learning_rate": 4.996363523359833e-06,
+      "loss": 1.396,
+      "step": 27000
+    },
+    {
+      "epoch": 0.027,
+      "eval_loss": 1.3594520092010498,
+      "eval_runtime": 24.4577,
+      "eval_samples_per_second": 204.435,
+      "eval_steps_per_second": 3.23,
+      "step": 27000
+    },
+    {
+      "epoch": 0.0271,
+      "grad_norm": 1.9091280698776245,
+      "learning_rate": 4.996320623449186e-06,
+      "loss": 1.4001,
+      "step": 27100
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 1.9556379318237305,
+      "learning_rate": 4.996277472159119e-06,
+      "loss": 1.4045,
+      "step": 27200
+    },
+    {
+      "epoch": 0.0273,
+      "grad_norm": 2.0282106399536133,
+      "learning_rate": 4.99623406949398e-06,
+      "loss": 1.3927,
+      "step": 27300
+    },
+    {
+      "epoch": 0.0274,
+      "grad_norm": 1.8539470434188843,
+      "learning_rate": 4.9961904154581374e-06,
+      "loss": 1.3998,
+      "step": 27400
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 1.7907086610794067,
+      "learning_rate": 4.9961465100559896e-06,
+      "loss": 1.3871,
+      "step": 27500
+    },
+    {
+      "epoch": 0.0276,
+      "grad_norm": 1.8179974555969238,
+      "learning_rate": 4.996102353291956e-06,
+      "loss": 1.3898,
+      "step": 27600
+    },
+    {
+      "epoch": 0.0277,
+      "grad_norm": 1.8779699802398682,
+      "learning_rate": 4.996057945170483e-06,
+      "loss": 1.3911,
+      "step": 27700
+    },
+    {
+      "epoch": 0.0278,
+      "grad_norm": 1.8160297870635986,
+      "learning_rate": 4.996013285696044e-06,
+      "loss": 1.3985,
+      "step": 27800
+    },
+    {
+      "epoch": 0.0279,
+      "grad_norm": 1.80897057056427,
+      "learning_rate": 4.995968374873136e-06,
+      "loss": 1.3956,
+      "step": 27900
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 1.6959145069122314,
+      "learning_rate": 4.99592321270628e-06,
+      "loss": 1.3905,
+      "step": 28000
+    },
+    {
+      "epoch": 0.028,
+      "eval_loss": 1.3936872482299805,
+      "eval_runtime": 24.4635,
+      "eval_samples_per_second": 204.386,
+      "eval_steps_per_second": 3.229,
+      "step": 28000
+    },
+    {
+      "epoch": 0.0281,
+      "grad_norm": 1.7558468580245972,
+      "learning_rate": 4.9958777992000255e-06,
+      "loss": 1.3777,
+      "step": 28100
+    },
+    {
+      "epoch": 0.0282,
+      "grad_norm": 1.8235905170440674,
+      "learning_rate": 4.995832134358943e-06,
+      "loss": 1.3818,
+      "step": 28200
+    },
+    {
+      "epoch": 0.0283,
+      "grad_norm": 1.9419879913330078,
+      "learning_rate": 4.995786218187635e-06,
+      "loss": 1.4081,
+      "step": 28300
+    },
+    {
+      "epoch": 0.0284,
+      "grad_norm": 1.8814060688018799,
+      "learning_rate": 4.995740050690722e-06,
+      "loss": 1.4014,
+      "step": 28400
+    },
+    {
+      "epoch": 0.0285,
+      "grad_norm": 1.9103902578353882,
+      "learning_rate": 4.995693631872855e-06,
+      "loss": 1.3932,
+      "step": 28500
+    },
+    {
+      "epoch": 0.0286,
+      "grad_norm": 1.813238263130188,
+      "learning_rate": 4.995646961738707e-06,
+      "loss": 1.3976,
+      "step": 28600
+    },
+    {
+      "epoch": 0.0287,
+      "grad_norm": 1.9837982654571533,
+      "learning_rate": 4.995600040292978e-06,
+      "loss": 1.3953,
+      "step": 28700
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.8860703706741333,
+      "learning_rate": 4.995552867540394e-06,
+      "loss": 1.4003,
+      "step": 28800
+    },
+    {
+      "epoch": 0.0289,
+      "grad_norm": 2.1086032390594482,
+      "learning_rate": 4.995505443485704e-06,
+      "loss": 1.3941,
+      "step": 28900
+    },
+    {
+      "epoch": 0.029,
+      "grad_norm": 1.6460460424423218,
+      "learning_rate": 4.995457768133685e-06,
+      "loss": 1.4004,
+      "step": 29000
+    },
+    {
+      "epoch": 0.029,
+      "eval_loss": 1.3687199354171753,
+      "eval_runtime": 24.5034,
+      "eval_samples_per_second": 204.053,
+      "eval_steps_per_second": 3.224,
+      "step": 29000
+    },
+    {
+      "epoch": 0.0291,
+      "grad_norm": 1.6839542388916016,
+      "learning_rate": 4.995409841489135e-06,
+      "loss": 1.3888,
+      "step": 29100
+    },
+    {
+      "epoch": 0.0292,
+      "grad_norm": 1.784096121788025,
+      "learning_rate": 4.995361663556884e-06,
+      "loss": 1.3948,
+      "step": 29200
+    },
+    {
+      "epoch": 0.0293,
+      "grad_norm": 1.7756794691085815,
+      "learning_rate": 4.995313234341781e-06,
+      "loss": 1.3915,
+      "step": 29300
+    },
+    {
+      "epoch": 0.0294,
+      "grad_norm": 1.887779951095581,
+      "learning_rate": 4.995264553848704e-06,
+      "loss": 1.3907,
+      "step": 29400
+    },
+    {
+      "epoch": 0.0295,
+      "grad_norm": 1.8146642446517944,
+      "learning_rate": 4.9952156220825545e-06,
+      "loss": 1.3936,
+      "step": 29500
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 1.8816800117492676,
+      "learning_rate": 4.9951664390482605e-06,
+      "loss": 1.4024,
+      "step": 29600
+    },
+    {
+      "epoch": 0.0297,
+      "grad_norm": 1.7906370162963867,
+      "learning_rate": 4.995117004750774e-06,
+      "loss": 1.3899,
+      "step": 29700
+    },
+    {
+      "epoch": 0.0298,
+      "grad_norm": 1.812466025352478,
+      "learning_rate": 4.995067319195073e-06,
+      "loss": 1.3991,
+      "step": 29800
+    },
+    {
+      "epoch": 0.0299,
+      "grad_norm": 1.7394683361053467,
+      "learning_rate": 4.995017382386162e-06,
+      "loss": 1.4003,
+      "step": 29900
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.876086950302124,
+      "learning_rate": 4.994967194329069e-06,
+      "loss": 1.3957,
+      "step": 30000
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 1.3768850564956665,
+      "eval_runtime": 24.5341,
+      "eval_samples_per_second": 203.798,
+      "eval_steps_per_second": 3.22,
+      "step": 30000
+    },
+    {
+      "epoch": 0.0301,
+      "grad_norm": 1.9874565601348877,
+      "learning_rate": 4.994916755028847e-06,
+      "loss": 1.3796,
+      "step": 30100
+    },
+    {
+      "epoch": 0.0302,
+      "grad_norm": 1.8175944089889526,
+      "learning_rate": 4.994866064490577e-06,
+      "loss": 1.3918,
+      "step": 30200
+    },
+    {
+      "epoch": 0.0303,
+      "grad_norm": 1.8819736242294312,
+      "learning_rate": 4.994815122719361e-06,
+      "loss": 1.3847,
+      "step": 30300
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 1.7215774059295654,
+      "learning_rate": 4.994763929720331e-06,
+      "loss": 1.3977,
+      "step": 30400
+    },
+    {
+      "epoch": 0.0305,
+      "grad_norm": 1.7516419887542725,
+      "learning_rate": 4.9947124854986425e-06,
+      "loss": 1.3972,
+      "step": 30500
+    },
+    {
+      "epoch": 0.0306,
+      "grad_norm": 1.7906025648117065,
+      "learning_rate": 4.9946607900594745e-06,
+      "loss": 1.3848,
+      "step": 30600
+    },
+    {
+      "epoch": 0.0307,
+      "grad_norm": 1.7155588865280151,
+      "learning_rate": 4.994608843408033e-06,
+      "loss": 1.3892,
+      "step": 30700
+    },
+    {
+      "epoch": 0.0308,
+      "grad_norm": 1.843208909034729,
+      "learning_rate": 4.994556645549549e-06,
+      "loss": 1.404,
+      "step": 30800
+    },
+    {
+      "epoch": 0.0309,
+      "grad_norm": 1.7222075462341309,
+      "learning_rate": 4.994504196489279e-06,
+      "loss": 1.3807,
+      "step": 30900
+    },
+    {
+      "epoch": 0.031,
+      "grad_norm": 1.783629298210144,
+      "learning_rate": 4.994451496232505e-06,
+      "loss": 1.3677,
+      "step": 31000
+    },
+    {
+      "epoch": 0.031,
+      "eval_loss": 1.3631515502929688,
+      "eval_runtime": 25.4272,
+      "eval_samples_per_second": 196.64,
+      "eval_steps_per_second": 3.107,
+      "step": 31000
+    },
+    {
+      "epoch": 0.0311,
+      "grad_norm": 2.0314207077026367,
+      "learning_rate": 4.9943985447845336e-06,
+      "loss": 1.3947,
+      "step": 31100
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 1.8006716966629028,
+      "learning_rate": 4.9943453421506975e-06,
+      "loss": 1.3883,
+      "step": 31200
+    },
+    {
+      "epoch": 0.0313,
+      "grad_norm": 1.8027557134628296,
+      "learning_rate": 4.9942918883363525e-06,
+      "loss": 1.3893,
+      "step": 31300
+    },
+    {
+      "epoch": 0.0314,
+      "grad_norm": 1.8935391902923584,
+      "learning_rate": 4.994238183346883e-06,
+      "loss": 1.3904,
+      "step": 31400
+    },
+    {
+      "epoch": 0.0315,
+      "grad_norm": 1.7756006717681885,
+      "learning_rate": 4.9941842271876975e-06,
+      "loss": 1.3877,
+      "step": 31500
+    },
+    {
+      "epoch": 0.0316,
+      "grad_norm": 1.7972420454025269,
+      "learning_rate": 4.994130019864228e-06,
+      "loss": 1.4124,
+      "step": 31600
+    },
+    {
+      "epoch": 0.0317,
+      "grad_norm": 1.966475248336792,
+      "learning_rate": 4.994075561381934e-06,
+      "loss": 1.4365,
+      "step": 31700
+    },
+    {
+      "epoch": 0.0318,
+      "grad_norm": 1.8581668138504028,
+      "learning_rate": 4.994020851746298e-06,
+      "loss": 1.4377,
+      "step": 31800
+    },
+    {
+      "epoch": 0.0319,
+      "grad_norm": 2.0146875381469727,
+      "learning_rate": 4.993965890962832e-06,
+      "loss": 1.4372,
+      "step": 31900
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 2.064978837966919,
+      "learning_rate": 4.993910679037069e-06,
+      "loss": 1.4547,
+      "step": 32000
+    },
+    {
+      "epoch": 0.032,
+      "eval_loss": 1.3730239868164062,
+      "eval_runtime": 24.4948,
+      "eval_samples_per_second": 204.125,
+      "eval_steps_per_second": 3.225,
+      "step": 32000
+    },
+    {
+      "epoch": 0.0321,
+      "grad_norm": 2.0712766647338867,
+      "learning_rate": 4.993855215974568e-06,
+      "loss": 1.4353,
+      "step": 32100
+    },
+    {
+      "epoch": 0.0322,
+      "grad_norm": 1.8807464838027954,
+      "learning_rate": 4.9937995017809145e-06,
+      "loss": 1.4403,
+      "step": 32200
+    },
+    {
+      "epoch": 0.0323,
+      "grad_norm": 2.0456089973449707,
+      "learning_rate": 4.993743536461721e-06,
+      "loss": 1.4426,
+      "step": 32300
+    },
+    {
+      "epoch": 0.0324,
+      "grad_norm": 1.8087421655654907,
+      "learning_rate": 4.993687320022621e-06,
+      "loss": 1.4273,
+      "step": 32400
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 1.8107932806015015,
+      "learning_rate": 4.993630852469275e-06,
+      "loss": 1.434,
+      "step": 32500
+    },
+    {
+      "epoch": 0.0326,
+      "grad_norm": 1.9879577159881592,
+      "learning_rate": 4.9935741338073715e-06,
+      "loss": 1.4412,
+      "step": 32600
+    },
+    {
+      "epoch": 0.0327,
+      "grad_norm": 1.936676025390625,
+      "learning_rate": 4.993517164042621e-06,
+      "loss": 1.4545,
+      "step": 32700
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 2.0299065113067627,
+      "learning_rate": 4.99345994318076e-06,
+      "loss": 1.4361,
+      "step": 32800
+    },
+    {
+      "epoch": 0.0329,
+      "grad_norm": 1.900130033493042,
+      "learning_rate": 4.993402471227551e-06,
+      "loss": 1.4438,
+      "step": 32900
+    },
+    {
+      "epoch": 0.033,
+      "grad_norm": 1.991466760635376,
+      "learning_rate": 4.993344748188782e-06,
+      "loss": 1.4412,
+      "step": 33000
+    },
+    {
+      "epoch": 0.033,
+      "eval_loss": 1.3792095184326172,
+      "eval_runtime": 24.4986,
+      "eval_samples_per_second": 204.093,
+      "eval_steps_per_second": 3.225,
+      "step": 33000
+    },
+    {
+      "epoch": 0.0331,
+      "grad_norm": 2.0281729698181152,
+      "learning_rate": 4.993286774070264e-06,
+      "loss": 1.4363,
+      "step": 33100
+    },
+    {
+      "epoch": 0.0332,
+      "grad_norm": 1.8294743299484253,
+      "learning_rate": 4.993228548877837e-06,
+      "loss": 1.4507,
+      "step": 33200
+    },
+    {
+      "epoch": 0.0333,
+      "grad_norm": 2.0411036014556885,
+      "learning_rate": 4.993170072617362e-06,
+      "loss": 1.4462,
+      "step": 33300
+    },
+    {
+      "epoch": 0.0334,
+      "grad_norm": 1.9206632375717163,
+      "learning_rate": 4.99311134529473e-06,
+      "loss": 1.4385,
+      "step": 33400
+    },
+    {
+      "epoch": 0.0335,
+      "grad_norm": 1.8697141408920288,
+      "learning_rate": 4.993052366915853e-06,
+      "loss": 1.4387,
+      "step": 33500
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 1.9599472284317017,
+      "learning_rate": 4.9929931374866715e-06,
+      "loss": 1.4374,
+      "step": 33600
+    },
+    {
+      "epoch": 0.0337,
+      "grad_norm": 1.9199376106262207,
+      "learning_rate": 4.992933657013149e-06,
+      "loss": 1.417,
+      "step": 33700
+    },
+    {
+      "epoch": 0.0338,
+      "grad_norm": 1.742351770401001,
+      "learning_rate": 4.992873925501276e-06,
+      "loss": 1.4296,
+      "step": 33800
+    },
+    {
+      "epoch": 0.0339,
+      "grad_norm": 2.089339017868042,
+      "learning_rate": 4.992813942957067e-06,
+      "loss": 1.4346,
+      "step": 33900
+    },
+    {
+      "epoch": 0.034,
+      "grad_norm": 1.9663175344467163,
+      "learning_rate": 4.992753709386562e-06,
+      "loss": 1.4296,
+      "step": 34000
+    },
+    {
+      "epoch": 0.034,
+      "eval_loss": 1.3703665733337402,
+      "eval_runtime": 24.5247,
+      "eval_samples_per_second": 203.876,
+      "eval_steps_per_second": 3.221,
+      "step": 34000
+    },
+    {
+      "epoch": 0.0341,
+      "grad_norm": 1.81997549533844,
+      "learning_rate": 4.992693224795826e-06,
+      "loss": 1.4413,
+      "step": 34100
+    },
+    {
+      "epoch": 0.0342,
+      "grad_norm": 1.925018310546875,
+      "learning_rate": 4.992632489190951e-06,
+      "loss": 1.4287,
+      "step": 34200
+    },
+    {
+      "epoch": 0.0343,
+      "grad_norm": 1.9179668426513672,
+      "learning_rate": 4.992571502578052e-06,
+      "loss": 1.4394,
+      "step": 34300
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 1.8218015432357788,
+      "learning_rate": 4.992510264963271e-06,
+      "loss": 1.4283,
+      "step": 34400
+    },
+    {
+      "epoch": 0.0345,
+      "grad_norm": 2.0702309608459473,
+      "learning_rate": 4.992448776352775e-06,
+      "loss": 1.4336,
+      "step": 34500
+    },
+    {
+      "epoch": 0.0346,
+      "grad_norm": 1.958945393562317,
+      "learning_rate": 4.992387036752755e-06,
+      "loss": 1.4357,
+      "step": 34600
+    },
+    {
+      "epoch": 0.0347,
+      "grad_norm": 1.986783742904663,
+      "learning_rate": 4.992325046169429e-06,
+      "loss": 1.4514,
+      "step": 34700
+    },
+    {
+      "epoch": 0.0348,
+      "grad_norm": 1.9416749477386475,
+      "learning_rate": 4.9922628046090385e-06,
+      "loss": 1.4339,
+      "step": 34800
+    },
+    {
+      "epoch": 0.0349,
+      "grad_norm": 2.0083744525909424,
+      "learning_rate": 4.992200312077852e-06,
+      "loss": 1.4341,
+      "step": 34900
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 1.904085636138916,
+      "learning_rate": 4.992137568582162e-06,
+      "loss": 1.4381,
+      "step": 35000
+    },
+    {
+      "epoch": 0.035,
+      "eval_loss": 1.3807413578033447,
+      "eval_runtime": 24.5144,
+      "eval_samples_per_second": 203.962,
+      "eval_steps_per_second": 3.223,
+      "step": 35000
+    },
+    {
+      "epoch": 0.0351,
+      "grad_norm": 1.923017978668213,
+      "learning_rate": 4.9920745741282886e-06,
+      "loss": 1.4372,
+      "step": 35100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 2.0158944129943848,
+      "learning_rate": 4.992011328722572e-06,
+      "loss": 1.4409,
+      "step": 35200
+    },
+    {
+      "epoch": 0.0353,
+      "grad_norm": 1.9902615547180176,
+      "learning_rate": 4.991947832371384e-06,
+      "loss": 1.4395,
+      "step": 35300
+    },
+    {
+      "epoch": 0.0354,
+      "grad_norm": 1.8978103399276733,
+      "learning_rate": 4.9918840850811155e-06,
+      "loss": 1.4217,
+      "step": 35400
+    },
+    {
+      "epoch": 0.0355,
+      "grad_norm": 2.14467453956604,
+      "learning_rate": 4.99182008685819e-06,
+      "loss": 1.4306,
+      "step": 35500
+    },
+    {
+      "epoch": 0.0356,
+      "grad_norm": 2.108886241912842,
+      "learning_rate": 4.991755837709049e-06,
+      "loss": 1.429,
+      "step": 35600
+    },
+    {
+      "epoch": 0.0357,
+      "grad_norm": 1.7032333612442017,
+      "learning_rate": 4.991691337640163e-06,
+      "loss": 1.4332,
+      "step": 35700
+    },
+    {
+      "epoch": 0.0358,
+      "grad_norm": 1.8070473670959473,
+      "learning_rate": 4.991626586658028e-06,
+      "loss": 1.432,
+      "step": 35800
+    },
+    {
+      "epoch": 0.0359,
+      "grad_norm": 1.8170287609100342,
+      "learning_rate": 4.991561584769164e-06,
+      "loss": 1.4158,
+      "step": 35900
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 2.1401708126068115,
+      "learning_rate": 4.991496331980116e-06,
+      "loss": 1.4398,
+      "step": 36000
+    },
+    {
+      "epoch": 0.036,
+      "eval_loss": 1.3770599365234375,
+      "eval_runtime": 24.5619,
+      "eval_samples_per_second": 203.567,
+      "eval_steps_per_second": 3.216,
+      "step": 36000
+    },
+    {
+      "epoch": 0.0361,
+      "grad_norm": 1.8093080520629883,
+      "learning_rate": 4.991430828297456e-06,
+      "loss": 1.4422,
+      "step": 36100
+    },
+    {
+      "epoch": 0.0362,
+      "grad_norm": 2.0092973709106445,
+      "learning_rate": 4.99136507372778e-06,
+      "loss": 1.4297,
+      "step": 36200
+    },
+    {
+      "epoch": 0.0363,
+      "grad_norm": 1.928898572921753,
+      "learning_rate": 4.991299068277709e-06,
+      "loss": 1.43,
+      "step": 36300
+    },
+    {
+      "epoch": 0.0364,
+      "grad_norm": 1.8470439910888672,
+      "learning_rate": 4.99123281195389e-06,
+      "loss": 1.4143,
+      "step": 36400
+    },
+    {
+      "epoch": 0.0365,
+      "grad_norm": 1.7525688409805298,
+      "learning_rate": 4.991166304762994e-06,
+      "loss": 1.4316,
+      "step": 36500
+    },
+    {
+      "epoch": 0.0366,
+      "grad_norm": 1.9879366159439087,
+      "learning_rate": 4.9910995467117205e-06,
+      "loss": 1.4501,
+      "step": 36600
+    },
+    {
+      "epoch": 0.0367,
+      "grad_norm": 2.008660316467285,
+      "learning_rate": 4.99103253780679e-06,
+      "loss": 1.4331,
+      "step": 36700
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 1.977774977684021,
+      "learning_rate": 4.990965278054952e-06,
+      "loss": 1.4271,
+      "step": 36800
+    },
+    {
+      "epoch": 0.0369,
+      "grad_norm": 1.8421337604522705,
+      "learning_rate": 4.990897767462978e-06,
+      "loss": 1.4363,
+      "step": 36900
+    },
+    {
+      "epoch": 0.037,
+      "grad_norm": 1.959547519683838,
+      "learning_rate": 4.990830006037667e-06,
+      "loss": 1.4322,
+      "step": 37000
+    },
+    {
+      "epoch": 0.037,
+      "eval_loss": 1.3740063905715942,
+      "eval_runtime": 24.5809,
+      "eval_samples_per_second": 203.41,
+      "eval_steps_per_second": 3.214,
+      "step": 37000
+    },
+    {
+      "epoch": 0.0371,
+      "grad_norm": 1.9228745698928833,
+      "learning_rate": 4.9907619937858435e-06,
+      "loss": 1.4269,
+      "step": 37100
+    },
+    {
+      "epoch": 0.0372,
+      "grad_norm": 1.996996521949768,
+      "learning_rate": 4.990693730714354e-06,
+      "loss": 1.4289,
+      "step": 37200
+    },
+    {
+      "epoch": 0.0373,
+      "grad_norm": 1.9373116493225098,
+      "learning_rate": 4.9906252168300755e-06,
+      "loss": 1.4184,
+      "step": 37300
+    },
+    {
+      "epoch": 0.0374,
+      "grad_norm": 1.7933897972106934,
+      "learning_rate": 4.9905564521399046e-06,
+      "loss": 1.4376,
+      "step": 37400
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 2.018866539001465,
+      "learning_rate": 4.9904874366507674e-06,
+      "loss": 1.4241,
+      "step": 37500
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 2.0614266395568848,
+      "learning_rate": 4.990418170369613e-06,
+      "loss": 1.4441,
+      "step": 37600
+    },
+    {
+      "epoch": 0.0377,
+      "grad_norm": 1.9219199419021606,
+      "learning_rate": 4.990348653303417e-06,
+      "loss": 1.4285,
+      "step": 37700
+    },
+    {
+      "epoch": 0.0378,
+      "grad_norm": 2.0715043544769287,
+      "learning_rate": 4.990278885459181e-06,
+      "loss": 1.4361,
+      "step": 37800
+    },
+    {
+      "epoch": 0.0379,
+      "grad_norm": 1.9379918575286865,
+      "learning_rate": 4.9902088668439284e-06,
+      "loss": 1.4371,
+      "step": 37900
+    },
+    {
+      "epoch": 0.038,
+      "grad_norm": 1.9890401363372803,
+      "learning_rate": 4.990138597464711e-06,
+      "loss": 1.4273,
+      "step": 38000
+    },
+    {
+      "epoch": 0.038,
+      "eval_loss": 1.387099266052246,
+      "eval_runtime": 24.5479,
+      "eval_samples_per_second": 203.683,
+      "eval_steps_per_second": 3.218,
+      "step": 38000
+    },
+    {
+      "epoch": 0.0381,
+      "grad_norm": 2.0055789947509766,
+      "learning_rate": 4.990068077328606e-06,
+      "loss": 1.422,
+      "step": 38100
+    },
+    {
+      "epoch": 0.0382,
+      "grad_norm": 2.1018118858337402,
+      "learning_rate": 4.989997306442712e-06,
+      "loss": 1.4257,
+      "step": 38200
+    },
+    {
+      "epoch": 0.0383,
+      "grad_norm": 1.901002049446106,
+      "learning_rate": 4.989926284814158e-06,
+      "loss": 1.4173,
+      "step": 38300
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.8262403011322021,
+      "learning_rate": 4.989855012450096e-06,
+      "loss": 1.4332,
+      "step": 38400
+    },
+    {
+      "epoch": 0.0385,
+      "grad_norm": 2.0205042362213135,
+      "learning_rate": 4.989783489357703e-06,
+      "loss": 1.4454,
+      "step": 38500
+    },
+    {
+      "epoch": 0.0386,
+      "grad_norm": 2.0293779373168945,
+      "learning_rate": 4.989711715544179e-06,
+      "loss": 1.4315,
+      "step": 38600
+    },
+    {
+      "epoch": 0.0387,
+      "grad_norm": 1.9387352466583252,
+      "learning_rate": 4.989639691016754e-06,
+      "loss": 1.4266,
+      "step": 38700
+    },
+    {
+      "epoch": 0.0388,
+      "grad_norm": 1.9366799592971802,
+      "learning_rate": 4.98956741578268e-06,
+      "loss": 1.425,
+      "step": 38800
+    },
+    {
+      "epoch": 0.0389,
+      "grad_norm": 1.973802089691162,
+      "learning_rate": 4.989494889849236e-06,
+      "loss": 1.4382,
+      "step": 38900
+    },
+    {
+      "epoch": 0.039,
+      "grad_norm": 1.9763784408569336,
+      "learning_rate": 4.989422113223724e-06,
+      "loss": 1.4255,
+      "step": 39000
+    },
+    {
+      "epoch": 0.039,
+      "eval_loss": 1.3631179332733154,
+      "eval_runtime": 24.5541,
+      "eval_samples_per_second": 203.632,
+      "eval_steps_per_second": 3.217,
+      "step": 39000
+    },
+    {
+      "epoch": 0.0391,
+      "grad_norm": 1.9820029735565186,
+      "learning_rate": 4.989349085913474e-06,
+      "loss": 1.4326,
+      "step": 39100
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 1.9469517469406128,
+      "learning_rate": 4.989275807925838e-06,
+      "loss": 1.4299,
+      "step": 39200
+    },
+    {
+      "epoch": 0.0393,
+      "grad_norm": 1.8591476678848267,
+      "learning_rate": 4.989202279268198e-06,
+      "loss": 1.4096,
+      "step": 39300
+    },
+    {
+      "epoch": 0.0394,
+      "grad_norm": 1.9623719453811646,
+      "learning_rate": 4.989128499947956e-06,
+      "loss": 1.4368,
+      "step": 39400
+    },
+    {
+      "epoch": 0.0395,
+      "grad_norm": 1.8734135627746582,
+      "learning_rate": 4.989054469972541e-06,
+      "loss": 1.4442,
+      "step": 39500
+    },
+    {
+      "epoch": 0.0396,
+      "grad_norm": 1.8288686275482178,
+      "learning_rate": 4.98898018934941e-06,
+      "loss": 1.4371,
+      "step": 39600
+    },
+    {
+      "epoch": 0.0397,
+      "grad_norm": 2.0335354804992676,
+      "learning_rate": 4.988905658086041e-06,
+      "loss": 1.4147,
+      "step": 39700
+    },
+    {
+      "epoch": 0.0398,
+      "grad_norm": 1.8424173593521118,
+      "learning_rate": 4.988830876189941e-06,
+      "loss": 1.4347,
+      "step": 39800
+    },
+    {
+      "epoch": 0.0399,
+      "grad_norm": 1.9924767017364502,
+      "learning_rate": 4.9887558436686415e-06,
+      "loss": 1.4396,
+      "step": 39900
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.015306234359741,
+      "learning_rate": 4.988680560529695e-06,
+      "loss": 1.4322,
+      "step": 40000
+    },
+    {
+      "epoch": 0.04,
+      "eval_loss": 1.3669300079345703,
+      "eval_runtime": 24.5711,
+      "eval_samples_per_second": 203.491,
+      "eval_steps_per_second": 3.215,
+      "step": 40000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 5.41544675278848e+18,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5045bb023fc9f9ce18adc5b4d8a1c05111e1d7d6f92b7d0e1888eae4ede00e23
 size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:a1ed4792935f60351e97f9f7d12d441a439016de4467a0443320af5be32108cf
 size 5777