Upload 9 files

Browse files

Files changed (6) hide show

config.json +1 -1
model.safetensors +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +1132 -1132
training_args.bin +1 -1

config.json CHANGED Viewed

@@ -41,6 +41,6 @@
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
-  "transformers_version": "4.56.0",
   "vocab_size": 50368
 }

   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
+  "transformers_version": "4.56.1",
   "vocab_size": 50368
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:954170b1fad48a196e31d3546782ff96db097725c8af15848d7f16d81276ef73
 size 1583544840

 version https://git-lfs.github.com/spec/v1
+oid sha256:4c513c87136b7061f89a0058cf57e10feabc8eaa6dc84ac77ff0f5a223c2f19c
 size 1583544840

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d98adafc4abe7b7680c12defd6f5a399e2070f42e277fc2f67f1547179234cd
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:449e44f9adf4d083aec6625b9110f6a9a09baba982e3a32de94ff0c135c00f4d
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ed749c4fee6e9346a27fe219e7901c7d0d1eadfb8abff3040bfb1e1b1961b12
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:2d0df6d48ac6c8d2a3fe965d9b7a645f9b425ec23c31765b3bbc57f64cf0fee9
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.7142857142857143,
   "eval_steps": 1000,
   "global_step": 25000,
   "is_hyper_param_search": false,
@@ -10,1965 +10,1965 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 2.857142857142857e-05,
-      "grad_norm": 1.1264785528182983,
       "learning_rate": 0.0,
-      "loss": 1.4622,
       "step": 1
     },
     {
-      "epoch": 0.002857142857142857,
-      "grad_norm": 1.0415701866149902,
-      "learning_rate": 1.4142857142857144e-06,
-      "loss": 1.4319,
       "step": 100
     },
     {
-      "epoch": 0.005714285714285714,
-      "grad_norm": 1.1746091842651367,
-      "learning_rate": 2.8428571428571432e-06,
-      "loss": 1.4189,
       "step": 200
     },
     {
-      "epoch": 0.008571428571428572,
-      "grad_norm": 1.1301525831222534,
-      "learning_rate": 4.271428571428572e-06,
-      "loss": 1.4293,
       "step": 300
     },
     {
-      "epoch": 0.011428571428571429,
-      "grad_norm": 1.0607796907424927,
-      "learning_rate": 4.9999753285470756e-06,
-      "loss": 1.4205,
       "step": 400
     },
     {
-      "epoch": 0.014285714285714285,
-      "grad_norm": 1.1491715908050537,
-      "learning_rate": 4.999771876927458e-06,
-      "loss": 1.4197,
       "step": 500
     },
     {
-      "epoch": 0.017142857142857144,
-      "grad_norm": 1.0873078107833862,
-      "learning_rate": 4.999362935318198e-06,
-      "loss": 1.4364,
       "step": 600
     },
     {
-      "epoch": 0.02,
-      "grad_norm": 1.0659881830215454,
-      "learning_rate": 4.998748537335728e-06,
-      "loss": 1.4507,
       "step": 700
     },
     {
-      "epoch": 0.022857142857142857,
-      "grad_norm": 1.1764490604400635,
-      "learning_rate": 4.99792873348571e-06,
-      "loss": 1.4398,
       "step": 800
     },
     {
-      "epoch": 0.025714285714285714,
-      "grad_norm": 1.0576765537261963,
-      "learning_rate": 4.996903591158886e-06,
-      "loss": 1.4203,
       "step": 900
     },
     {
-      "epoch": 0.02857142857142857,
-      "grad_norm": 1.111843228340149,
-      "learning_rate": 4.995673194625541e-06,
-      "loss": 1.4203,
       "step": 1000
     },
     {
-      "epoch": 0.02857142857142857,
-      "eval_loss": 1.4505008459091187,
-      "eval_runtime": 103.009,
-      "eval_samples_per_second": 132.95,
-      "eval_steps_per_second": 2.077,
       "step": 1000
     },
     {
-      "epoch": 0.03142857142857143,
-      "grad_norm": 1.037828803062439,
-      "learning_rate": 4.994237645028573e-06,
-      "loss": 1.443,
       "step": 1100
     },
     {
-      "epoch": 0.03428571428571429,
-      "grad_norm": 1.1225452423095703,
-      "learning_rate": 4.992597060375177e-06,
-      "loss": 1.4519,
       "step": 1200
     },
     {
-      "epoch": 0.037142857142857144,
-      "grad_norm": 1.032313346862793,
-      "learning_rate": 4.990751575527151e-06,
-      "loss": 1.4358,
       "step": 1300
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 1.1252490282058716,
-      "learning_rate": 4.988701342189802e-06,
-      "loss": 1.4102,
       "step": 1400
     },
     {
-      "epoch": 0.04285714285714286,
-      "grad_norm": 1.0545426607131958,
-      "learning_rate": 4.986446528899478e-06,
-      "loss": 1.4142,
       "step": 1500
     },
     {
-      "epoch": 0.045714285714285714,
-      "grad_norm": 1.08208429813385,
-      "learning_rate": 4.983987321009718e-06,
-      "loss": 1.4247,
       "step": 1600
     },
     {
-      "epoch": 0.04857142857142857,
-      "grad_norm": 1.042827844619751,
-      "learning_rate": 4.98132392067601e-06,
-      "loss": 1.4078,
       "step": 1700
     },
     {
-      "epoch": 0.05142857142857143,
-      "grad_norm": 1.029168725013733,
-      "learning_rate": 4.978456546839175e-06,
-      "loss": 1.4255,
       "step": 1800
     },
     {
-      "epoch": 0.054285714285714284,
-      "grad_norm": 1.1674017906188965,
-      "learning_rate": 4.975385435207367e-06,
-      "loss": 1.4428,
       "step": 1900
     },
     {
-      "epoch": 0.05714285714285714,
-      "grad_norm": 1.2838454246520996,
-      "learning_rate": 4.972110838236704e-06,
-      "loss": 1.4327,
       "step": 2000
     },
     {
-      "epoch": 0.05714285714285714,
-      "eval_loss": 1.4425562620162964,
-      "eval_runtime": 97.037,
-      "eval_samples_per_second": 141.132,
-      "eval_steps_per_second": 2.205,
       "step": 2000
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 1.0720206499099731,
-      "learning_rate": 4.968633025110507e-06,
-      "loss": 1.4312,
       "step": 2100
     },
     {
-      "epoch": 0.06285714285714286,
-      "grad_norm": 1.0312304496765137,
-      "learning_rate": 4.964952281717177e-06,
-      "loss": 1.4405,
       "step": 2200
     },
     {
-      "epoch": 0.06571428571428571,
-      "grad_norm": 1.0791317224502563,
-      "learning_rate": 4.961068910626692e-06,
-      "loss": 1.4407,
       "step": 2300
     },
     {
-      "epoch": 0.06857142857142857,
-      "grad_norm": 1.0809016227722168,
-      "learning_rate": 4.956983231065733e-06,
-      "loss": 1.434,
       "step": 2400
     },
     {
-      "epoch": 0.07142857142857142,
-      "grad_norm": 1.059635043144226,
-      "learning_rate": 4.952695578891449e-06,
-      "loss": 1.4114,
       "step": 2500
     },
     {
-      "epoch": 0.07428571428571429,
-      "grad_norm": 1.0659129619598389,
-      "learning_rate": 4.948206306563842e-06,
-      "loss": 1.4374,
       "step": 2600
     },
     {
-      "epoch": 0.07714285714285714,
-      "grad_norm": 1.0818511247634888,
-      "learning_rate": 4.943515783116794e-06,
-      "loss": 1.4196,
       "step": 2700
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 1.1003646850585938,
-      "learning_rate": 4.9386243941277374e-06,
-      "loss": 1.4508,
       "step": 2800
     },
     {
-      "epoch": 0.08285714285714285,
-      "grad_norm": 1.086207628250122,
-      "learning_rate": 4.933532541685949e-06,
-      "loss": 1.4354,
       "step": 2900
     },
     {
-      "epoch": 0.08571428571428572,
-      "grad_norm": 1.0702838897705078,
-      "learning_rate": 4.928240644359507e-06,
-      "loss": 1.4262,
       "step": 3000
     },
     {
-      "epoch": 0.08571428571428572,
-      "eval_loss": 1.438844919204712,
-      "eval_runtime": 96.9421,
-      "eval_samples_per_second": 141.27,
-      "eval_steps_per_second": 2.208,
       "step": 3000
     },
     {
-      "epoch": 0.08857142857142856,
-      "grad_norm": 1.1206424236297607,
-      "learning_rate": 4.922749137160875e-06,
-      "loss": 1.4445,
       "step": 3100
     },
     {
-      "epoch": 0.09142857142857143,
-      "grad_norm": 1.0971518754959106,
-      "learning_rate": 4.917058471511149e-06,
-      "loss": 1.4117,
       "step": 3200
     },
     {
-      "epoch": 0.09428571428571429,
-      "grad_norm": 1.1263982057571411,
-      "learning_rate": 4.9111691152029436e-06,
-      "loss": 1.4294,
       "step": 3300
     },
     {
-      "epoch": 0.09714285714285714,
-      "grad_norm": 1.0150455236434937,
-      "learning_rate": 4.905081552361943e-06,
-      "loss": 1.4357,
       "step": 3400
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 1.0511361360549927,
-      "learning_rate": 4.898796283407099e-06,
-      "loss": 1.438,
       "step": 3500
     },
     {
-      "epoch": 0.10285714285714286,
-      "grad_norm": 1.1033008098602295,
-      "learning_rate": 4.892313825009499e-06,
-      "loss": 1.4162,
       "step": 3600
     },
     {
-      "epoch": 0.10571428571428572,
-      "grad_norm": 1.1107470989227295,
-      "learning_rate": 4.885634710049891e-06,
-      "loss": 1.4267,
       "step": 3700
     },
     {
-      "epoch": 0.10857142857142857,
-      "grad_norm": 1.0580041408538818,
-      "learning_rate": 4.878759487574882e-06,
-      "loss": 1.4287,
       "step": 3800
     },
     {
-      "epoch": 0.11142857142857143,
-      "grad_norm": 1.0198274850845337,
-      "learning_rate": 4.871688722751799e-06,
-      "loss": 1.438,
       "step": 3900
     },
     {
-      "epoch": 0.11428571428571428,
-      "grad_norm": 1.1063220500946045,
-      "learning_rate": 4.864422996822239e-06,
-      "loss": 1.4078,
       "step": 4000
     },
     {
-      "epoch": 0.11428571428571428,
-      "eval_loss": 1.4405826330184937,
-      "eval_runtime": 97.1575,
-      "eval_samples_per_second": 140.957,
-      "eval_steps_per_second": 2.203,
       "step": 4000
     },
     {
-      "epoch": 0.11714285714285715,
-      "grad_norm": 1.0360065698623657,
-      "learning_rate": 4.8569629070542775e-06,
-      "loss": 1.414,
       "step": 4100
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 1.0539647340774536,
-      "learning_rate": 4.849309066693382e-06,
-      "loss": 1.3992,
       "step": 4200
     },
     {
-      "epoch": 0.12285714285714286,
-      "grad_norm": 1.0678602457046509,
-      "learning_rate": 4.8414621049119935e-06,
-      "loss": 1.4226,
       "step": 4300
     },
     {
-      "epoch": 0.12571428571428572,
-      "grad_norm": 1.1174051761627197,
-      "learning_rate": 4.833422666757811e-06,
-      "loss": 1.4149,
       "step": 4400
     },
     {
-      "epoch": 0.12857142857142856,
-      "grad_norm": 1.1076269149780273,
-      "learning_rate": 4.825191413100764e-06,
-      "loss": 1.4219,
       "step": 4500
     },
     {
-      "epoch": 0.13142857142857142,
-      "grad_norm": 1.0237882137298584,
-      "learning_rate": 4.816769020578685e-06,
-      "loss": 1.4063,
       "step": 4600
     },
     {
-      "epoch": 0.13428571428571429,
-      "grad_norm": 1.0634537935256958,
-      "learning_rate": 4.808156181541694e-06,
-      "loss": 1.4077,
       "step": 4700
     },
     {
-      "epoch": 0.13714285714285715,
-      "grad_norm": 1.1134625673294067,
-      "learning_rate": 4.799353603995275e-06,
-      "loss": 1.4589,
       "step": 4800
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 1.069698691368103,
-      "learning_rate": 4.790362011542085e-06,
-      "loss": 1.4063,
       "step": 4900
     },
     {
-      "epoch": 0.14285714285714285,
-      "grad_norm": 1.1093010902404785,
-      "learning_rate": 4.7811821433224665e-06,
-      "loss": 1.4225,
       "step": 5000
     },
     {
-      "epoch": 0.14285714285714285,
-      "eval_loss": 1.4438061714172363,
-      "eval_runtime": 98.2976,
-      "eval_samples_per_second": 139.322,
-      "eval_steps_per_second": 2.177,
       "step": 5000
     },
     {
-      "epoch": 0.1457142857142857,
-      "grad_norm": 1.0884599685668945,
-      "learning_rate": 4.7718147539536865e-06,
-      "loss": 1.4347,
       "step": 5100
     },
     {
-      "epoch": 0.14857142857142858,
-      "grad_norm": 1.0088622570037842,
-      "learning_rate": 4.762260613467909e-06,
-      "loss": 1.4254,
       "step": 5200
     },
     {
-      "epoch": 0.15142857142857144,
-      "grad_norm": 1.1340473890304565,
-      "learning_rate": 4.75252050724889e-06,
-      "loss": 1.4101,
       "step": 5300
     },
     {
-      "epoch": 0.15428571428571428,
-      "grad_norm": 1.093491554260254,
-      "learning_rate": 4.7425952359674225e-06,
-      "loss": 1.4256,
       "step": 5400
     },
     {
-      "epoch": 0.15714285714285714,
-      "grad_norm": 1.0808088779449463,
-      "learning_rate": 4.732485615515511e-06,
-      "loss": 1.4093,
       "step": 5500
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 1.100080132484436,
-      "learning_rate": 4.722192476939309e-06,
-      "loss": 1.4263,
       "step": 5600
     },
     {
-      "epoch": 0.16285714285714287,
-      "grad_norm": 1.096901774406433,
-      "learning_rate": 4.7117166663708025e-06,
-      "loss": 1.4084,
       "step": 5700
     },
     {
-      "epoch": 0.1657142857142857,
-      "grad_norm": 1.1885929107666016,
-      "learning_rate": 4.7010590449582525e-06,
-      "loss": 1.4146,
       "step": 5800
     },
     {
-      "epoch": 0.16857142857142857,
-      "grad_norm": 1.082043170928955,
-      "learning_rate": 4.690220488795406e-06,
-      "loss": 1.4201,
       "step": 5900
     },
     {
-      "epoch": 0.17142857142857143,
-      "grad_norm": 1.0647767782211304,
-      "learning_rate": 4.679201888849481e-06,
-      "loss": 1.436,
       "step": 6000
     },
     {
-      "epoch": 0.17142857142857143,
-      "eval_loss": 1.4336808919906616,
-      "eval_runtime": 97.8921,
-      "eval_samples_per_second": 139.899,
-      "eval_steps_per_second": 2.186,
       "step": 6000
     },
     {
-      "epoch": 0.1742857142857143,
-      "grad_norm": 1.1146217584609985,
-      "learning_rate": 4.668004150887924e-06,
-      "loss": 1.4132,
       "step": 6100
     },
     {
-      "epoch": 0.17714285714285713,
-      "grad_norm": 1.0890520811080933,
-      "learning_rate": 4.656628195403952e-06,
-      "loss": 1.4047,
       "step": 6200
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 1.033389687538147,
-      "learning_rate": 4.645074957540887e-06,
-      "loss": 1.4272,
       "step": 6300
     },
     {
-      "epoch": 0.18285714285714286,
-      "grad_norm": 1.1013028621673584,
-      "learning_rate": 4.63334538701528e-06,
-      "loss": 1.4402,
       "step": 6400
     },
     {
-      "epoch": 0.18571428571428572,
-      "grad_norm": 1.0814400911331177,
-      "learning_rate": 4.6214404480388455e-06,
-      "loss": 1.4031,
       "step": 6500
     },
     {
-      "epoch": 0.18857142857142858,
-      "grad_norm": 1.0447463989257812,
-      "learning_rate": 4.609361119239197e-06,
-      "loss": 1.4453,
       "step": 6600
     },
     {
-      "epoch": 0.19142857142857142,
-      "grad_norm": 1.1220800876617432,
-      "learning_rate": 4.5971083935794026e-06,
-      "loss": 1.4148,
       "step": 6700
     },
     {
-      "epoch": 0.19428571428571428,
-      "grad_norm": 1.107762098312378,
-      "learning_rate": 4.584683278276356e-06,
-      "loss": 1.4285,
       "step": 6800
     },
     {
-      "epoch": 0.19714285714285715,
-      "grad_norm": 1.1005544662475586,
-      "learning_rate": 4.572086794717985e-06,
-      "loss": 1.4328,
       "step": 6900
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 1.033148169517517,
-      "learning_rate": 4.559319978379287e-06,
-      "loss": 1.4111,
       "step": 7000
     },
     {
-      "epoch": 0.2,
-      "eval_loss": 1.4392390251159668,
-      "eval_runtime": 97.5914,
-      "eval_samples_per_second": 140.33,
-      "eval_steps_per_second": 2.193,
       "step": 7000
     },
     {
-      "epoch": 0.20285714285714285,
-      "grad_norm": 1.052509069442749,
-      "learning_rate": 4.546383878737207e-06,
-      "loss": 1.4113,
       "step": 7100
     },
     {
-      "epoch": 0.2057142857142857,
-      "grad_norm": 1.0561904907226562,
-      "learning_rate": 4.533279559184373e-06,
-      "loss": 1.4275,
       "step": 7200
     },
     {
-      "epoch": 0.20857142857142857,
-      "grad_norm": 1.0787992477416992,
-      "learning_rate": 4.520008096941676e-06,
-      "loss": 1.4084,
       "step": 7300
     },
     {
-      "epoch": 0.21142857142857144,
-      "grad_norm": 1.0198429822921753,
-      "learning_rate": 4.506570582969719e-06,
-      "loss": 1.4029,
       "step": 7400
     },
     {
-      "epoch": 0.21428571428571427,
-      "grad_norm": 1.0664575099945068,
-      "learning_rate": 4.492968121879142e-06,
-      "loss": 1.4049,
       "step": 7500
     },
     {
-      "epoch": 0.21714285714285714,
-      "grad_norm": 1.0929675102233887,
-      "learning_rate": 4.479201831839812e-06,
-      "loss": 1.4169,
       "step": 7600
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 1.1445673704147339,
-      "learning_rate": 4.465272844488908e-06,
-      "loss": 1.4033,
       "step": 7700
     },
     {
-      "epoch": 0.22285714285714286,
-      "grad_norm": 1.064433217048645,
-      "learning_rate": 4.4511823048378986e-06,
-      "loss": 1.43,
       "step": 7800
     },
     {
-      "epoch": 0.2257142857142857,
-      "grad_norm": 1.0845831632614136,
-      "learning_rate": 4.436931371178416e-06,
-      "loss": 1.4441,
       "step": 7900
     },
     {
-      "epoch": 0.22857142857142856,
-      "grad_norm": 1.0980095863342285,
-      "learning_rate": 4.42252121498704e-06,
-      "loss": 1.4015,
       "step": 8000
     },
     {
-      "epoch": 0.22857142857142856,
-      "eval_loss": 1.4309405088424683,
-      "eval_runtime": 97.7381,
-      "eval_samples_per_second": 140.119,
-      "eval_steps_per_second": 2.19,
       "step": 8000
     },
     {
-      "epoch": 0.23142857142857143,
-      "grad_norm": 1.1431641578674316,
-      "learning_rate": 4.407953020829001e-06,
-      "loss": 1.4249,
       "step": 8100
     },
     {
-      "epoch": 0.2342857142857143,
-      "grad_norm": 1.0139048099517822,
-      "learning_rate": 4.393227986260801e-06,
-      "loss": 1.3958,
       "step": 8200
     },
     {
-      "epoch": 0.23714285714285716,
-      "grad_norm": 1.0676871538162231,
-      "learning_rate": 4.378347321731773e-06,
-      "loss": 1.4204,
       "step": 8300
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 1.1097986698150635,
-      "learning_rate": 4.363312250484577e-06,
-      "loss": 1.4335,
       "step": 8400
     },
     {
-      "epoch": 0.24285714285714285,
-      "grad_norm": 1.083742380142212,
-      "learning_rate": 4.348124008454644e-06,
-      "loss": 1.436,
       "step": 8500
     },
     {
-      "epoch": 0.24571428571428572,
-      "grad_norm": 1.072716474533081,
-      "learning_rate": 4.332783844168581e-06,
-      "loss": 1.424,
       "step": 8600
     },
     {
-      "epoch": 0.24857142857142858,
-      "grad_norm": 1.1168031692504883,
-      "learning_rate": 4.317293018641536e-06,
-      "loss": 1.4262,
       "step": 8700
     },
     {
-      "epoch": 0.25142857142857145,
-      "grad_norm": 1.1102938652038574,
-      "learning_rate": 4.301652805273535e-06,
-      "loss": 1.4141,
       "step": 8800
     },
     {
-      "epoch": 0.2542857142857143,
-      "grad_norm": 1.1052049398422241,
-      "learning_rate": 4.285864489744809e-06,
-      "loss": 1.4221,
       "step": 8900
     },
     {
-      "epoch": 0.2571428571428571,
-      "grad_norm": 1.0475815534591675,
-      "learning_rate": 4.269929369910103e-06,
-      "loss": 1.4145,
       "step": 9000
     },
     {
-      "epoch": 0.2571428571428571,
-      "eval_loss": 1.428357481956482,
-      "eval_runtime": 98.1292,
-      "eval_samples_per_second": 139.561,
-      "eval_steps_per_second": 2.181,
       "step": 9000
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 1.0066262483596802,
-      "learning_rate": 4.253848755691992e-06,
-      "loss": 1.4049,
       "step": 9100
     },
     {
-      "epoch": 0.26285714285714284,
-      "grad_norm": 1.131996512413025,
-      "learning_rate": 4.2376239689731955e-06,
-      "loss": 1.3991,
       "step": 9200
     },
     {
-      "epoch": 0.26571428571428574,
-      "grad_norm": 1.1413109302520752,
-      "learning_rate": 4.2212563434879175e-06,
-      "loss": 1.3744,
       "step": 9300
     },
     {
-      "epoch": 0.26857142857142857,
-      "grad_norm": 1.073792576789856,
-      "learning_rate": 4.204747224712209e-06,
-      "loss": 1.422,
       "step": 9400
     },
     {
-      "epoch": 0.2714285714285714,
-      "grad_norm": 1.0397651195526123,
-      "learning_rate": 4.188097969753363e-06,
-      "loss": 1.4064,
       "step": 9500
     },
     {
-      "epoch": 0.2742857142857143,
-      "grad_norm": 1.1306557655334473,
-      "learning_rate": 4.171309947238357e-06,
-      "loss": 1.4408,
       "step": 9600
     },
     {
-      "epoch": 0.27714285714285714,
-      "grad_norm": 1.1982935667037964,
-      "learning_rate": 4.154384537201347e-06,
-      "loss": 1.4151,
       "step": 9700
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 1.1465263366699219,
-      "learning_rate": 4.137323130970225e-06,
-      "loss": 1.4211,
       "step": 9800
     },
     {
-      "epoch": 0.28285714285714286,
-      "grad_norm": 0.9817516803741455,
-      "learning_rate": 4.120127131052244e-06,
-      "loss": 1.4089,
       "step": 9900
     },
     {
-      "epoch": 0.2857142857142857,
-      "grad_norm": 1.150546908378601,
-      "learning_rate": 4.1027979510187285e-06,
-      "loss": 1.4191,
       "step": 10000
     },
     {
-      "epoch": 0.2857142857142857,
-      "eval_loss": 1.429203748703003,
-      "eval_runtime": 98.6571,
-      "eval_samples_per_second": 138.814,
-      "eval_steps_per_second": 2.169,
       "step": 10000
     },
     {
-      "epoch": 0.2885714285714286,
-      "grad_norm": 1.0928316116333008,
-      "learning_rate": 4.085337015388876e-06,
-      "loss": 1.4155,
       "step": 10100
     },
     {
-      "epoch": 0.2914285714285714,
-      "grad_norm": 1.1372452974319458,
-      "learning_rate": 4.067745759512654e-06,
-      "loss": 1.4229,
       "step": 10200
     },
     {
-      "epoch": 0.29428571428571426,
-      "grad_norm": 1.1249101161956787,
-      "learning_rate": 4.0500256294528084e-06,
-      "loss": 1.4178,
       "step": 10300
     },
     {
-      "epoch": 0.29714285714285715,
-      "grad_norm": 1.1340339183807373,
-      "learning_rate": 4.032178081865995e-06,
-      "loss": 1.4125,
       "step": 10400
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 1.0652027130126953,
-      "learning_rate": 4.014204583883038e-06,
-      "loss": 1.4283,
       "step": 10500
     },
     {
-      "epoch": 0.3028571428571429,
-      "grad_norm": 1.1057724952697754,
-      "learning_rate": 3.996106612988321e-06,
-      "loss": 1.4046,
       "step": 10600
     },
     {
-      "epoch": 0.3057142857142857,
-      "grad_norm": 1.089181661605835,
-      "learning_rate": 3.977885656898337e-06,
-      "loss": 1.4199,
       "step": 10700
     },
     {
-      "epoch": 0.30857142857142855,
-      "grad_norm": 1.0804879665374756,
-      "learning_rate": 3.959543213439393e-06,
-      "loss": 1.4259,
       "step": 10800
     },
     {
-      "epoch": 0.31142857142857144,
-      "grad_norm": 1.0948872566223145,
-      "learning_rate": 3.941080790424483e-06,
-      "loss": 1.4143,
       "step": 10900
     },
     {
-      "epoch": 0.3142857142857143,
-      "grad_norm": 1.1653496026992798,
-      "learning_rate": 3.92249990552934e-06,
-      "loss": 1.4343,
       "step": 11000
     },
     {
-      "epoch": 0.3142857142857143,
-      "eval_loss": 1.4226573705673218,
-      "eval_runtime": 98.9066,
-      "eval_samples_per_second": 138.464,
-      "eval_steps_per_second": 2.164,
       "step": 11000
     },
     {
-      "epoch": 0.3171428571428571,
-      "grad_norm": 1.0654685497283936,
-      "learning_rate": 3.903802086167676e-06,
-      "loss": 1.4102,
       "step": 11100
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 1.01749849319458,
-      "learning_rate": 3.884988869365626e-06,
-      "loss": 1.408,
       "step": 11200
     },
     {
-      "epoch": 0.32285714285714284,
-      "grad_norm": 1.1105825901031494,
-      "learning_rate": 3.866061801635399e-06,
-      "loss": 1.4276,
       "step": 11300
     },
     {
-      "epoch": 0.32571428571428573,
-      "grad_norm": 1.0666981935501099,
-      "learning_rate": 3.8470224388481485e-06,
-      "loss": 1.3964,
       "step": 11400
     },
     {
-      "epoch": 0.32857142857142857,
-      "grad_norm": 1.090728759765625,
-      "learning_rate": 3.827872346106073e-06,
-      "loss": 1.3981,
       "step": 11500
     },
     {
-      "epoch": 0.3314285714285714,
-      "grad_norm": 1.069846272468567,
-      "learning_rate": 3.808613097613759e-06,
-      "loss": 1.4053,
       "step": 11600
     },
     {
-      "epoch": 0.3342857142857143,
-      "grad_norm": 1.1465699672698975,
-      "learning_rate": 3.7892462765487836e-06,
-      "loss": 1.3944,
       "step": 11700
     },
     {
-      "epoch": 0.33714285714285713,
-      "grad_norm": 1.068352222442627,
-      "learning_rate": 3.769773474931558e-06,
-      "loss": 1.4284,
       "step": 11800
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 1.1487313508987427,
-      "learning_rate": 3.7501962934944704e-06,
-      "loss": 1.3894,
       "step": 11900
     },
     {
-      "epoch": 0.34285714285714286,
-      "grad_norm": 1.1034648418426514,
-      "learning_rate": 3.7305163415502936e-06,
-      "loss": 1.4184,
       "step": 12000
     },
     {
-      "epoch": 0.34285714285714286,
-      "eval_loss": 1.4204550981521606,
-      "eval_runtime": 99.0133,
-      "eval_samples_per_second": 138.315,
-      "eval_steps_per_second": 2.161,
       "step": 12000
     },
     {
-      "epoch": 0.3457142857142857,
-      "grad_norm": 1.08807373046875,
-      "learning_rate": 3.710735236859895e-06,
-      "loss": 1.4208,
       "step": 12100
     },
     {
-      "epoch": 0.3485714285714286,
-      "grad_norm": 1.142823338508606,
-      "learning_rate": 3.6908546054992523e-06,
-      "loss": 1.4292,
       "step": 12200
     },
     {
-      "epoch": 0.3514285714285714,
-      "grad_norm": 1.0997464656829834,
-      "learning_rate": 3.670876081725784e-06,
-      "loss": 1.4058,
       "step": 12300
     },
     {
-      "epoch": 0.35428571428571426,
-      "grad_norm": 1.1083920001983643,
-      "learning_rate": 3.650801307844004e-06,
-      "loss": 1.4152,
       "step": 12400
     },
     {
-      "epoch": 0.35714285714285715,
-      "grad_norm": 1.1371042728424072,
-      "learning_rate": 3.630631934070527e-06,
-      "loss": 1.4259,
       "step": 12500
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 1.0470432043075562,
-      "learning_rate": 3.610369618398404e-06,
-      "loss": 1.3952,
       "step": 12600
     },
     {
-      "epoch": 0.3628571428571429,
-      "grad_norm": 1.0853626728057861,
-      "learning_rate": 3.5900160264608395e-06,
-      "loss": 1.4005,
       "step": 12700
     },
     {
-      "epoch": 0.3657142857142857,
-      "grad_norm": 1.0409729480743408,
-      "learning_rate": 3.569572831394265e-06,
-      "loss": 1.431,
       "step": 12800
     },
     {
-      "epoch": 0.36857142857142855,
-      "grad_norm": 1.1226378679275513,
-      "learning_rate": 3.5490417137007997e-06,
-      "loss": 1.4112,
       "step": 12900
     },
     {
-      "epoch": 0.37142857142857144,
-      "grad_norm": 1.0430322885513306,
-      "learning_rate": 3.528424361110115e-06,
-      "loss": 1.3999,
       "step": 13000
     },
     {
-      "epoch": 0.37142857142857144,
-      "eval_loss": 1.423007845878601,
-      "eval_runtime": 99.2113,
-      "eval_samples_per_second": 138.039,
-      "eval_steps_per_second": 2.157,
       "step": 13000
     },
     {
-      "epoch": 0.3742857142857143,
-      "grad_norm": 1.1154820919036865,
-      "learning_rate": 3.507722468440688e-06,
-      "loss": 1.4097,
       "step": 13100
     },
     {
-      "epoch": 0.37714285714285717,
-      "grad_norm": 1.1299182176589966,
-      "learning_rate": 3.4869377374604886e-06,
-      "loss": 1.4064,
       "step": 13200
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 1.1046215295791626,
-      "learning_rate": 3.4660718767470854e-06,
-      "loss": 1.4234,
       "step": 13300
     },
     {
-      "epoch": 0.38285714285714284,
-      "grad_norm": 1.0251668691635132,
-      "learning_rate": 3.445126601547193e-06,
-      "loss": 1.4097,
       "step": 13400
     },
     {
-      "epoch": 0.38571428571428573,
-      "grad_norm": 1.0839489698410034,
-      "learning_rate": 3.4241036336356757e-06,
-      "loss": 1.401,
       "step": 13500
     },
     {
-      "epoch": 0.38857142857142857,
-      "grad_norm": 1.0709606409072876,
-      "learning_rate": 3.40300470117401e-06,
-      "loss": 1.4164,
       "step": 13600
     },
     {
-      "epoch": 0.3914285714285714,
-      "grad_norm": 1.0628767013549805,
-      "learning_rate": 3.3818315385682255e-06,
-      "loss": 1.409,
       "step": 13700
     },
     {
-      "epoch": 0.3942857142857143,
-      "grad_norm": 1.0831209421157837,
-      "learning_rate": 3.3605858863263274e-06,
-      "loss": 1.4073,
       "step": 13800
     },
     {
-      "epoch": 0.39714285714285713,
-      "grad_norm": 1.1459494829177856,
-      "learning_rate": 3.339269490915223e-06,
-      "loss": 1.4147,
       "step": 13900
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 1.0614882707595825,
-      "learning_rate": 3.317884104617155e-06,
-      "loss": 1.4089,
       "step": 14000
     },
     {
-      "epoch": 0.4,
-      "eval_loss": 1.4181102514266968,
-      "eval_runtime": 99.8701,
-      "eval_samples_per_second": 137.128,
-      "eval_steps_per_second": 2.143,
       "step": 14000
     },
     {
-      "epoch": 0.40285714285714286,
-      "grad_norm": 1.0587329864501953,
-      "learning_rate": 3.2964314853856593e-06,
-      "loss": 1.3895,
       "step": 14100
     },
     {
-      "epoch": 0.4057142857142857,
-      "grad_norm": 1.1020365953445435,
-      "learning_rate": 3.2749133967010545e-06,
-      "loss": 1.4037,
       "step": 14200
     },
     {
-      "epoch": 0.4085714285714286,
-      "grad_norm": 1.1230683326721191,
-      "learning_rate": 3.253331607425475e-06,
-      "loss": 1.4018,
       "step": 14300
     },
     {
-      "epoch": 0.4114285714285714,
-      "grad_norm": 1.0774966478347778,
-      "learning_rate": 3.231687891657469e-06,
-      "loss": 1.4087,
       "step": 14400
     },
     {
-      "epoch": 0.4142857142857143,
-      "grad_norm": 1.0514012575149536,
-      "learning_rate": 3.209984028586157e-06,
-      "loss": 1.3861,
       "step": 14500
     },
     {
-      "epoch": 0.41714285714285715,
-      "grad_norm": 1.1025465726852417,
-      "learning_rate": 3.188221802344978e-06,
-      "loss": 1.4038,
       "step": 14600
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 1.139419436454773,
-      "learning_rate": 3.16640300186503e-06,
-      "loss": 1.4033,
       "step": 14700
     },
     {
-      "epoch": 0.4228571428571429,
-      "grad_norm": 1.043289303779602,
-      "learning_rate": 3.1445294207280093e-06,
-      "loss": 1.3867,
       "step": 14800
     },
     {
-      "epoch": 0.4257142857142857,
-      "grad_norm": 1.101967453956604,
-      "learning_rate": 3.1226028570187737e-06,
-      "loss": 1.391,
       "step": 14900
     },
     {
-      "epoch": 0.42857142857142855,
-      "grad_norm": 1.0626415014266968,
-      "learning_rate": 3.1006251131775342e-06,
-      "loss": 1.3949,
       "step": 15000
     },
     {
-      "epoch": 0.42857142857142855,
-      "eval_loss": 1.4195818901062012,
-      "eval_runtime": 99.5817,
-      "eval_samples_per_second": 137.525,
-      "eval_steps_per_second": 2.149,
       "step": 15000
     },
     {
-      "epoch": 0.43142857142857144,
-      "grad_norm": 1.1212193965911865,
-      "learning_rate": 3.078597995851689e-06,
-      "loss": 1.4007,
       "step": 15100
     },
     {
-      "epoch": 0.4342857142857143,
-      "grad_norm": 1.0601767301559448,
-      "learning_rate": 3.056523315747308e-06,
-      "loss": 1.4098,
       "step": 15200
     },
     {
-      "epoch": 0.43714285714285717,
-      "grad_norm": 1.0668915510177612,
-      "learning_rate": 3.034402887480287e-06,
-      "loss": 1.3885,
       "step": 15300
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 1.0714190006256104,
-      "learning_rate": 3.012238529427181e-06,
-      "loss": 1.4018,
       "step": 15400
     },
     {
-      "epoch": 0.44285714285714284,
-      "grad_norm": 1.1230597496032715,
-      "learning_rate": 2.9900320635757293e-06,
-      "loss": 1.4086,
       "step": 15500
     },
     {
-      "epoch": 0.44571428571428573,
-      "grad_norm": 1.0094853639602661,
-      "learning_rate": 2.9677853153750763e-06,
-      "loss": 1.3801,
       "step": 15600
     },
     {
-      "epoch": 0.44857142857142857,
-      "grad_norm": 1.0972274541854858,
-      "learning_rate": 2.9455001135857194e-06,
-      "loss": 1.3985,
       "step": 15700
     },
     {
-      "epoch": 0.4514285714285714,
-      "grad_norm": 1.0266581773757935,
-      "learning_rate": 2.9231782901291726e-06,
-      "loss": 1.4124,
       "step": 15800
     },
     {
-      "epoch": 0.4542857142857143,
-      "grad_norm": 1.138675332069397,
-      "learning_rate": 2.900821679937382e-06,
-      "loss": 1.4173,
       "step": 15900
     },
     {
-      "epoch": 0.45714285714285713,
-      "grad_norm": 1.1691060066223145,
-      "learning_rate": 2.8784321208018817e-06,
-      "loss": 1.4123,
       "step": 16000
     },
     {
-      "epoch": 0.45714285714285713,
-      "eval_loss": 1.4248454570770264,
-      "eval_runtime": 99.8569,
-      "eval_samples_per_second": 137.146,
-      "eval_steps_per_second": 2.143,
       "step": 16000
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 1.1149132251739502,
-      "learning_rate": 2.8560114532227262e-06,
-      "loss": 1.4171,
       "step": 16100
     },
     {
-      "epoch": 0.46285714285714286,
-      "grad_norm": 1.0276226997375488,
-      "learning_rate": 2.8335615202571927e-06,
-      "loss": 1.4177,
       "step": 16200
     },
     {
-      "epoch": 0.4657142857142857,
-      "grad_norm": 1.0828535556793213,
-      "learning_rate": 2.811084167368276e-06,
-      "loss": 1.3762,
       "step": 16300
     },
     {
-      "epoch": 0.4685714285714286,
-      "grad_norm": 1.171616554260254,
-      "learning_rate": 2.788581242272983e-06,
-      "loss": 1.3965,
       "step": 16400
     },
     {
-      "epoch": 0.4714285714285714,
-      "grad_norm": 1.0692201852798462,
-      "learning_rate": 2.7660545947904464e-06,
-      "loss": 1.4066,
       "step": 16500
     },
     {
-      "epoch": 0.4742857142857143,
-      "grad_norm": 1.1563397645950317,
-      "learning_rate": 2.7435060766898614e-06,
-      "loss": 1.4008,
       "step": 16600
     },
     {
-      "epoch": 0.47714285714285715,
-      "grad_norm": 1.1032534837722778,
-      "learning_rate": 2.7209375415382655e-06,
-      "loss": 1.3905,
       "step": 16700
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 1.1357022523880005,
-      "learning_rate": 2.698350844548168e-06,
-      "loss": 1.406,
       "step": 16800
     },
     {
-      "epoch": 0.4828571428571429,
-      "grad_norm": 1.0574637651443481,
-      "learning_rate": 2.6757478424250417e-06,
-      "loss": 1.4049,
       "step": 16900
     },
     {
-      "epoch": 0.4857142857142857,
-      "grad_norm": 1.0180025100708008,
-      "learning_rate": 2.653130393214702e-06,
-      "loss": 1.3979,
       "step": 17000
     },
     {
-      "epoch": 0.4857142857142857,
-      "eval_loss": 1.4195657968521118,
-      "eval_runtime": 99.9485,
-      "eval_samples_per_second": 137.021,
-      "eval_steps_per_second": 2.141,
       "step": 17000
     },
     {
-      "epoch": 0.48857142857142855,
-      "grad_norm": 1.0153673887252808,
-      "learning_rate": 2.630500356150565e-06,
-      "loss": 1.4138,
       "step": 17100
     },
     {
-      "epoch": 0.49142857142857144,
-      "grad_norm": 1.0832693576812744,
-      "learning_rate": 2.6078595915008096e-06,
-      "loss": 1.3934,
       "step": 17200
     },
     {
-      "epoch": 0.4942857142857143,
-      "grad_norm": 1.1552319526672363,
-      "learning_rate": 2.585209960415464e-06,
-      "loss": 1.414,
       "step": 17300
     },
     {
-      "epoch": 0.49714285714285716,
-      "grad_norm": 1.1260509490966797,
-      "learning_rate": 2.562553324773404e-06,
-      "loss": 1.3988,
       "step": 17400
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 1.1187398433685303,
-      "learning_rate": 2.5398915470293077e-06,
-      "loss": 1.4048,
       "step": 17500
     },
     {
-      "epoch": 0.5028571428571429,
-      "grad_norm": 1.0673401355743408,
-      "learning_rate": 2.5172264900605497e-06,
-      "loss": 1.4012,
       "step": 17600
     },
     {
-      "epoch": 0.5057142857142857,
-      "grad_norm": 1.098514199256897,
-      "learning_rate": 2.49456001701407e-06,
-      "loss": 1.4021,
       "step": 17700
     },
     {
-      "epoch": 0.5085714285714286,
-      "grad_norm": 1.1217247247695923,
-      "learning_rate": 2.471893991153216e-06,
-      "loss": 1.4041,
       "step": 17800
     },
     {
-      "epoch": 0.5114285714285715,
-      "grad_norm": 1.1324173212051392,
-      "learning_rate": 2.4492302757045705e-06,
-      "loss": 1.3942,
       "step": 17900
     },
     {
-      "epoch": 0.5142857142857142,
-      "grad_norm": 1.1281129121780396,
-      "learning_rate": 2.426570733704798e-06,
-      "loss": 1.4046,
       "step": 18000
     },
     {
-      "epoch": 0.5142857142857142,
-      "eval_loss": 1.4171615839004517,
-      "eval_runtime": 99.7442,
-      "eval_samples_per_second": 137.301,
-      "eval_steps_per_second": 2.145,
       "step": 18000
     },
     {
-      "epoch": 0.5171428571428571,
-      "grad_norm": 1.084283471107483,
-      "learning_rate": 2.4039172278474864e-06,
-      "loss": 1.4183,
       "step": 18100
     },
     {
-      "epoch": 0.52,
-      "grad_norm": 1.0714788436889648,
-      "learning_rate": 2.381271620330034e-06,
-      "loss": 1.3793,
       "step": 18200
     },
     {
-      "epoch": 0.5228571428571429,
-      "grad_norm": 1.1440812349319458,
-      "learning_rate": 2.358635772700567e-06,
-      "loss": 1.3765,
       "step": 18300
     },
     {
-      "epoch": 0.5257142857142857,
-      "grad_norm": 1.0656503438949585,
-      "learning_rate": 2.336011545704916e-06,
-      "loss": 1.4153,
       "step": 18400
     },
     {
-      "epoch": 0.5285714285714286,
-      "grad_norm": 1.1328638792037964,
-      "learning_rate": 2.3134007991336523e-06,
-      "loss": 1.3873,
       "step": 18500
     },
     {
-      "epoch": 0.5314285714285715,
-      "grad_norm": 1.0806158781051636,
-      "learning_rate": 2.290805391669212e-06,
-      "loss": 1.3774,
       "step": 18600
     },
     {
-      "epoch": 0.5342857142857143,
-      "grad_norm": 1.069150686264038,
-      "learning_rate": 2.2682271807331003e-06,
-      "loss": 1.3918,
       "step": 18700
     },
     {
-      "epoch": 0.5371428571428571,
-      "grad_norm": 1.1267215013504028,
-      "learning_rate": 2.2456680223332103e-06,
-      "loss": 1.3845,
       "step": 18800
     },
     {
-      "epoch": 0.54,
-      "grad_norm": 1.142121434211731,
-      "learning_rate": 2.2231297709112496e-06,
-      "loss": 1.4109,
       "step": 18900
     },
     {
-      "epoch": 0.5428571428571428,
-      "grad_norm": 1.0814783573150635,
-      "learning_rate": 2.2006142791902957e-06,
-      "loss": 1.4098,
       "step": 19000
     },
     {
-      "epoch": 0.5428571428571428,
-      "eval_loss": 1.416707158088684,
-      "eval_runtime": 100.0528,
-      "eval_samples_per_second": 136.878,
-      "eval_steps_per_second": 2.139,
       "step": 19000
     },
     {
-      "epoch": 0.5457142857142857,
-      "grad_norm": 1.0706247091293335,
-      "learning_rate": 2.1781233980225035e-06,
-      "loss": 1.4188,
       "step": 19100
     },
     {
-      "epoch": 0.5485714285714286,
-      "grad_norm": 1.021273136138916,
-      "learning_rate": 2.1556589762369518e-06,
-      "loss": 1.3989,
       "step": 19200
     },
     {
-      "epoch": 0.5514285714285714,
-      "grad_norm": 1.1904112100601196,
-      "learning_rate": 2.133222860487667e-06,
-      "loss": 1.4393,
       "step": 19300
     },
     {
-      "epoch": 0.5542857142857143,
-      "grad_norm": 1.1062791347503662,
-      "learning_rate": 2.1108168951018186e-06,
-      "loss": 1.4045,
       "step": 19400
     },
     {
-      "epoch": 0.5571428571428572,
-      "grad_norm": 1.1809172630310059,
-      "learning_rate": 2.088442921928113e-06,
-      "loss": 1.3958,
       "step": 19500
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 1.0156745910644531,
-      "learning_rate": 2.066102780185383e-06,
-      "loss": 1.398,
       "step": 19600
     },
     {
-      "epoch": 0.5628571428571428,
-      "grad_norm": 1.1121779680252075,
-      "learning_rate": 2.0437983063114013e-06,
-      "loss": 1.4122,
       "step": 19700
     },
     {
-      "epoch": 0.5657142857142857,
-      "grad_norm": 1.0523419380187988,
-      "learning_rate": 2.021531333811914e-06,
-      "loss": 1.4063,
       "step": 19800
     },
     {
-      "epoch": 0.5685714285714286,
-      "grad_norm": 1.099584698677063,
-      "learning_rate": 1.9993036931099265e-06,
-      "loss": 1.409,
       "step": 19900
     },
     {
-      "epoch": 0.5714285714285714,
-      "grad_norm": 1.1999467611312866,
-      "learning_rate": 1.9771172113952327e-06,
-      "loss": 1.4,
       "step": 20000
     },
     {
-      "epoch": 0.5714285714285714,
-      "eval_loss": 1.415099024772644,
-      "eval_runtime": 99.755,
-      "eval_samples_per_second": 137.286,
-      "eval_steps_per_second": 2.145,
       "step": 20000
     },
     {
-      "epoch": 0.5742857142857143,
-      "grad_norm": 1.0494403839111328,
-      "learning_rate": 1.9549737124742104e-06,
-      "loss": 1.4095,
       "step": 20100
     },
     {
-      "epoch": 0.5771428571428572,
-      "grad_norm": 1.1081063747406006,
-      "learning_rate": 1.9328750166199046e-06,
-      "loss": 1.3992,
       "step": 20200
     },
     {
-      "epoch": 0.58,
-      "grad_norm": 1.1197865009307861,
-      "learning_rate": 1.91082294042239e-06,
-      "loss": 1.3917,
       "step": 20300
     },
     {
-      "epoch": 0.5828571428571429,
-      "grad_norm": 1.140148639678955,
-      "learning_rate": 1.8888192966394448e-06,
-      "loss": 1.3907,
       "step": 20400
     },
     {
-      "epoch": 0.5857142857142857,
-      "grad_norm": 1.0425162315368652,
-      "learning_rate": 1.8668658940475298e-06,
-      "loss": 1.4006,
       "step": 20500
     },
     {
-      "epoch": 0.5885714285714285,
-      "grad_norm": 1.1035826206207275,
-      "learning_rate": 1.8449645372931068e-06,
-      "loss": 1.4033,
       "step": 20600
     },
     {
-      "epoch": 0.5914285714285714,
-      "grad_norm": 1.1139192581176758,
-      "learning_rate": 1.823117026744287e-06,
-      "loss": 1.3964,
       "step": 20700
     },
     {
-      "epoch": 0.5942857142857143,
-      "grad_norm": 1.1130657196044922,
-      "learning_rate": 1.8013251583428366e-06,
-      "loss": 1.3972,
       "step": 20800
     },
     {
-      "epoch": 0.5971428571428572,
-      "grad_norm": 1.0860106945037842,
-      "learning_rate": 1.7795907234565385e-06,
-      "loss": 1.3931,
       "step": 20900
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 1.05580472946167,
-      "learning_rate": 1.7579155087319443e-06,
-      "loss": 1.3874,
       "step": 21000
     },
     {
-      "epoch": 0.6,
-      "eval_loss": 1.4096276760101318,
-      "eval_runtime": 99.8984,
-      "eval_samples_per_second": 137.089,
-      "eval_steps_per_second": 2.142,
       "step": 21000
     },
     {
-      "epoch": 0.6028571428571429,
-      "grad_norm": 1.1223632097244263,
-      "learning_rate": 1.7363012959475e-06,
-      "loss": 1.3793,
       "step": 21100
     },
     {
-      "epoch": 0.6057142857142858,
-      "grad_norm": 1.115355372428894,
-      "learning_rate": 1.7147498618670778e-06,
-      "loss": 1.4093,
       "step": 21200
     },
     {
-      "epoch": 0.6085714285714285,
-      "grad_norm": 1.0437370538711548,
-      "learning_rate": 1.6932629780939225e-06,
-      "loss": 1.3875,
       "step": 21300
     },
     {
-      "epoch": 0.6114285714285714,
-      "grad_norm": 1.0260958671569824,
-      "learning_rate": 1.6718424109250154e-06,
-      "loss": 1.4035,
       "step": 21400
     },
     {
-      "epoch": 0.6142857142857143,
-      "grad_norm": 0.9281340837478638,
-      "learning_rate": 1.6504899212058837e-06,
-      "loss": 1.3853,
       "step": 21500
     },
     {
-      "epoch": 0.6171428571428571,
-      "grad_norm": 1.1064680814743042,
-      "learning_rate": 1.6292072641858478e-06,
-      "loss": 1.4016,
       "step": 21600
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 1.0704963207244873,
-      "learning_rate": 1.6079961893737384e-06,
-      "loss": 1.3848,
       "step": 21700
     },
     {
-      "epoch": 0.6228571428571429,
-      "grad_norm": 1.0652328729629517,
-      "learning_rate": 1.5868584403940768e-06,
-      "loss": 1.3749,
       "step": 21800
     },
     {
-      "epoch": 0.6257142857142857,
-      "grad_norm": 1.0742926597595215,
-      "learning_rate": 1.5657957548437447e-06,
-      "loss": 1.404,
       "step": 21900
     },
     {
-      "epoch": 0.6285714285714286,
-      "grad_norm": 1.0579770803451538,
-      "learning_rate": 1.5448098641491487e-06,
-      "loss": 1.4036,
       "step": 22000
     },
     {
-      "epoch": 0.6285714285714286,
-      "eval_loss": 1.4158315658569336,
-      "eval_runtime": 100.3868,
-      "eval_samples_per_second": 136.422,
-      "eval_steps_per_second": 2.132,
       "step": 22000
     },
     {
-      "epoch": 0.6314285714285715,
-      "grad_norm": 1.101526141166687,
-      "learning_rate": 1.5239024934238874e-06,
-      "loss": 1.4188,
       "step": 22100
     },
     {
-      "epoch": 0.6342857142857142,
-      "grad_norm": 1.0752556324005127,
-      "learning_rate": 1.5030753613269455e-06,
-      "loss": 1.3847,
       "step": 22200
     },
     {
-      "epoch": 0.6371428571428571,
-      "grad_norm": 1.0786316394805908,
-      "learning_rate": 1.4823301799214101e-06,
-      "loss": 1.3867,
       "step": 22300
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 1.0339590311050415,
-      "learning_rate": 1.4616686545337374e-06,
-      "loss": 1.3662,
       "step": 22400
     },
     {
-      "epoch": 0.6428571428571429,
-      "grad_norm": 1.0901203155517578,
-      "learning_rate": 1.4410924836135625e-06,
-      "loss": 1.3836,
       "step": 22500
     },
     {
-      "epoch": 0.6457142857142857,
-      "grad_norm": 1.0557289123535156,
-      "learning_rate": 1.4206033585940895e-06,
-      "loss": 1.375,
       "step": 22600
     },
     {
-      "epoch": 0.6485714285714286,
-      "grad_norm": 1.049706220626831,
-      "learning_rate": 1.40020296375304e-06,
-      "loss": 1.3977,
       "step": 22700
     },
     {
-      "epoch": 0.6514285714285715,
-      "grad_norm": 1.170900821685791,
-      "learning_rate": 1.379892976074209e-06,
-      "loss": 1.384,
       "step": 22800
     },
     {
-      "epoch": 0.6542857142857142,
-      "grad_norm": 1.10288667678833,
-      "learning_rate": 1.3596750651096047e-06,
-      "loss": 1.4045,
       "step": 22900
     },
     {
-      "epoch": 0.6571428571428571,
-      "grad_norm": 1.0909626483917236,
-      "learning_rate": 1.3395508928422074e-06,
-      "loss": 1.4018,
       "step": 23000
     },
     {
-      "epoch": 0.6571428571428571,
-      "eval_loss": 1.4156948328018188,
-      "eval_runtime": 100.678,
-      "eval_samples_per_second": 136.028,
-      "eval_steps_per_second": 2.126,
       "step": 23000
     },
     {
-      "epoch": 0.66,
-      "grad_norm": 1.1391985416412354,
-      "learning_rate": 1.3195221135493503e-06,
-      "loss": 1.372,
       "step": 23100
     },
     {
-      "epoch": 0.6628571428571428,
-      "grad_norm": 1.124377965927124,
-      "learning_rate": 1.2995903736667267e-06,
-      "loss": 1.3998,
       "step": 23200
     },
     {
-      "epoch": 0.6657142857142857,
-      "grad_norm": 1.1056832075119019,
-      "learning_rate": 1.279757311653056e-06,
-      "loss": 1.3677,
       "step": 23300
     },
     {
-      "epoch": 0.6685714285714286,
-      "grad_norm": 1.0959793329238892,
-      "learning_rate": 1.2600245578553866e-06,
-      "loss": 1.3801,
       "step": 23400
     },
     {
-      "epoch": 0.6714285714285714,
-      "grad_norm": 1.0466543436050415,
-      "learning_rate": 1.240393734375086e-06,
-      "loss": 1.3866,
       "step": 23500
     },
     {
-      "epoch": 0.6742857142857143,
-      "grad_norm": 1.0811994075775146,
-      "learning_rate": 1.2208664549344884e-06,
-      "loss": 1.3885,
       "step": 23600
     },
     {
-      "epoch": 0.6771428571428572,
-      "grad_norm": 1.1533517837524414,
-      "learning_rate": 1.2014443247442498e-06,
-      "loss": 1.3684,
       "step": 23700
     },
     {
-      "epoch": 0.68,
-      "grad_norm": 1.0400276184082031,
-      "learning_rate": 1.1821289403713865e-06,
-      "loss": 1.3733,
       "step": 23800
     },
     {
-      "epoch": 0.6828571428571428,
-      "grad_norm": 1.0742322206497192,
-      "learning_rate": 1.1629218896080382e-06,
-      "loss": 1.3884,
       "step": 23900
     },
     {
-      "epoch": 0.6857142857142857,
-      "grad_norm": 1.0781975984573364,
-      "learning_rate": 1.1438247513409423e-06,
-      "loss": 1.3611,
       "step": 24000
     },
     {
-      "epoch": 0.6857142857142857,
-      "eval_loss": 1.4142818450927734,
-      "eval_runtime": 100.8413,
-      "eval_samples_per_second": 135.807,
-      "eval_steps_per_second": 2.122,
       "step": 24000
     },
     {
-      "epoch": 0.6885714285714286,
-      "grad_norm": 1.1411370038986206,
-      "learning_rate": 1.1248390954216437e-06,
-      "loss": 1.3838,
       "step": 24100
     },
     {
-      "epoch": 0.6914285714285714,
-      "grad_norm": 1.0584548711776733,
-      "learning_rate": 1.1059664825374511e-06,
-      "loss": 1.3482,
       "step": 24200
     },
     {
-      "epoch": 0.6942857142857143,
-      "grad_norm": 1.096170425415039,
-      "learning_rate": 1.0872084640831356e-06,
-      "loss": 1.3704,
       "step": 24300
     },
     {
-      "epoch": 0.6971428571428572,
-      "grad_norm": 1.0241279602050781,
-      "learning_rate": 1.068566582033411e-06,
-      "loss": 1.3735,
       "step": 24400
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 1.0666210651397705,
-      "learning_rate": 1.050042368816168e-06,
-      "loss": 1.3893,
       "step": 24500
     },
     {
-      "epoch": 0.7028571428571428,
-      "grad_norm": 1.0070935487747192,
-      "learning_rate": 1.0316373471865108e-06,
-      "loss": 1.3807,
       "step": 24600
     },
     {
-      "epoch": 0.7057142857142857,
-      "grad_norm": 1.0485628843307495,
-      "learning_rate": 1.013353030101576e-06,
-      "loss": 1.3817,
       "step": 24700
     },
     {
-      "epoch": 0.7085714285714285,
-      "grad_norm": 0.9520274996757507,
-      "learning_rate": 9.951909205961665e-07,
-      "loss": 1.3201,
       "step": 24800
     },
     {
-      "epoch": 0.7114285714285714,
-      "grad_norm": 1.0479100942611694,
-      "learning_rate": 9.77152511659194e-07,
-      "loss": 1.2627,
       "step": 24900
     },
     {
-      "epoch": 0.7142857142857143,
-      "grad_norm": 1.0204826593399048,
-      "learning_rate": 9.59239286110952e-07,
-      "loss": 1.2352,
       "step": 25000
     },
     {
-      "epoch": 0.7142857142857143,
-      "eval_loss": 1.4112086296081543,
-      "eval_runtime": 101.0868,
-      "eval_samples_per_second": 135.478,
-      "eval_steps_per_second": 2.117,
       "step": 25000
     }
   ],
   "logging_steps": 100,
-  "max_steps": 35000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 9223372036854775807,
   "save_steps": 5000,
@@ -1979,12 +1979,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 3.3846542204928e+18,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.08784,
   "eval_steps": 1000,
   "global_step": 25000,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 4e-05,
+      "grad_norm": 0.911555290222168,
       "learning_rate": 0.0,
+      "loss": 0.7505,
       "step": 1
     },
     {
+      "epoch": 0.004,
+      "grad_norm": 1.2557882070541382,
+      "learning_rate": 9.9e-07,
+      "loss": 0.831,
       "step": 100
     },
     {
+      "epoch": 0.008,
+      "grad_norm": 0.9086900353431702,
+      "learning_rate": 1.9900000000000004e-06,
+      "loss": 0.8295,
       "step": 200
     },
     {
+      "epoch": 0.012,
+      "grad_norm": 0.9221948385238647,
+      "learning_rate": 2.4999758220143106e-06,
+      "loss": 0.8411,
       "step": 300
     },
     {
+      "epoch": 0.016,
+      "grad_norm": 0.8809811472892761,
+      "learning_rate": 2.4997764426529066e-06,
+      "loss": 0.8288,
       "step": 400
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 1.3145067691802979,
+      "learning_rate": 2.499375702067717e-06,
+      "loss": 0.8312,
       "step": 500
     },
     {
+      "epoch": 0.024,
+      "grad_norm": 0.9034631252288818,
+      "learning_rate": 2.4987736648251815e-06,
+      "loss": 0.8385,
       "step": 600
     },
     {
+      "epoch": 0.028,
+      "grad_norm": 0.8681179881095886,
+      "learning_rate": 2.497970427924213e-06,
+      "loss": 0.8175,
       "step": 700
     },
     {
+      "epoch": 0.032,
+      "grad_norm": 0.9303165674209595,
+      "learning_rate": 2.496966120780569e-06,
+      "loss": 0.8281,
       "step": 800
     },
     {
+      "epoch": 0.036,
+      "grad_norm": 0.9573058485984802,
+      "learning_rate": 2.4957609052060012e-06,
+      "loss": 0.8326,
       "step": 900
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 0.9730055928230286,
+      "learning_rate": 2.4943549753821847e-06,
+      "loss": 0.8391,
       "step": 1000
     },
     {
+      "epoch": 0.04,
+      "eval_loss": 1.5264503955841064,
+      "eval_runtime": 104.8997,
+      "eval_samples_per_second": 130.553,
+      "eval_steps_per_second": 2.04,
       "step": 1000
     },
     {
+      "epoch": 0.044,
+      "grad_norm": 0.8237825632095337,
+      "learning_rate": 2.4927485578294313e-06,
+      "loss": 0.8176,
       "step": 1100
     },
     {
+      "epoch": 0.048,
+      "grad_norm": 0.9133234620094299,
+      "learning_rate": 2.4909419113701947e-06,
+      "loss": 0.8303,
       "step": 1200
     },
     {
+      "epoch": 0.052,
+      "grad_norm": 0.9377557635307312,
+      "learning_rate": 2.4889353270873663e-06,
+      "loss": 0.8159,
       "step": 1300
     },
     {
+      "epoch": 0.056,
+      "grad_norm": 0.9034435749053955,
+      "learning_rate": 2.4867291282773805e-06,
+      "loss": 0.8145,
       "step": 1400
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 1.0601003170013428,
+      "learning_rate": 2.4843236703981235e-06,
+      "loss": 0.8317,
       "step": 1500
     },
     {
+      "epoch": 0.064,
+      "grad_norm": 0.9157763719558716,
+      "learning_rate": 2.481719341011662e-06,
+      "loss": 0.8355,
       "step": 1600
     },
     {
+      "epoch": 0.068,
+      "grad_norm": 0.9011576175689697,
+      "learning_rate": 2.4789165597218035e-06,
+      "loss": 0.8319,
       "step": 1700
     },
     {
+      "epoch": 0.072,
+      "grad_norm": 0.8954268097877502,
+      "learning_rate": 2.475915778106486e-06,
+      "loss": 0.8156,
       "step": 1800
     },
     {
+      "epoch": 0.076,
+      "grad_norm": 0.8911709189414978,
+      "learning_rate": 2.4727174796450266e-06,
+      "loss": 0.8365,
       "step": 1900
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 0.9407449960708618,
+      "learning_rate": 2.4693221796402166e-06,
+      "loss": 0.8288,
       "step": 2000
     },
     {
+      "epoch": 0.08,
+      "eval_loss": 1.5217734575271606,
+      "eval_runtime": 98.2235,
+      "eval_samples_per_second": 139.427,
+      "eval_steps_per_second": 2.179,
       "step": 2000
     },
     {
+      "epoch": 0.084,
+      "grad_norm": 0.8769101500511169,
+      "learning_rate": 2.4657304251353047e-06,
+      "loss": 0.8131,
       "step": 2100
     },
     {
+      "epoch": 0.088,
+      "grad_norm": 0.8608514070510864,
+      "learning_rate": 2.4619427948258547e-06,
+      "loss": 0.8088,
       "step": 2200
     },
     {
+      "epoch": 0.092,
+      "grad_norm": 0.9365686178207397,
+      "learning_rate": 2.4579598989665065e-06,
+      "loss": 0.8286,
       "step": 2300
     },
     {
+      "epoch": 0.096,
+      "grad_norm": 0.928945779800415,
+      "learning_rate": 2.453782379272657e-06,
+      "loss": 0.8109,
       "step": 2400
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 0.9162323474884033,
+      "learning_rate": 2.449410908817064e-06,
+      "loss": 0.806,
       "step": 2500
     },
     {
+      "epoch": 0.104,
+      "grad_norm": 0.9436105489730835,
+      "learning_rate": 2.444846191921406e-06,
+      "loss": 0.7969,
       "step": 2600
     },
     {
+      "epoch": 0.108,
+      "grad_norm": 0.9459385871887207,
+      "learning_rate": 2.4400889640427992e-06,
+      "loss": 0.8315,
       "step": 2700
     },
     {
+      "epoch": 0.112,
+      "grad_norm": 0.9575082063674927,
+      "learning_rate": 2.435139991655308e-06,
+      "loss": 0.8324,
       "step": 2800
     },
     {
+      "epoch": 0.116,
+      "grad_norm": 0.927148163318634,
+      "learning_rate": 2.4300000721264466e-06,
+      "loss": 0.8267,
       "step": 2900
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 0.9774505496025085,
+      "learning_rate": 2.4246700335887123e-06,
+      "loss": 0.8262,
       "step": 3000
     },
     {
+      "epoch": 0.12,
+      "eval_loss": 1.5202959775924683,
+      "eval_runtime": 98.5199,
+      "eval_samples_per_second": 139.007,
+      "eval_steps_per_second": 2.172,
       "step": 3000
     },
     {
+      "epoch": 0.124,
+      "grad_norm": 0.9433075785636902,
+      "learning_rate": 2.4191507348061575e-06,
+      "loss": 0.803,
       "step": 3100
     },
     {
+      "epoch": 0.128,
+      "grad_norm": 0.9418466091156006,
+      "learning_rate": 2.4134430650360284e-06,
+      "loss": 0.8088,
       "step": 3200
     },
     {
+      "epoch": 0.132,
+      "grad_norm": 0.9223436713218689,
+      "learning_rate": 2.407547943885489e-06,
+      "loss": 0.8116,
       "step": 3300
     },
     {
+      "epoch": 0.136,
+      "grad_norm": 0.9359924793243408,
+      "learning_rate": 2.4014663211634552e-06,
+      "loss": 0.8232,
       "step": 3400
     },
     {
+      "epoch": 0.14,
+      "grad_norm": 0.9347231388092041,
+      "learning_rate": 2.395199176727567e-06,
+      "loss": 0.8131,
       "step": 3500
     },
     {
+      "epoch": 0.144,
+      "grad_norm": 0.9255951046943665,
+      "learning_rate": 2.388747520326311e-06,
+      "loss": 0.8064,
       "step": 3600
     },
     {
+      "epoch": 0.148,
+      "grad_norm": 0.8580342531204224,
+      "learning_rate": 2.3821123914363374e-06,
+      "loss": 0.8247,
       "step": 3700
     },
     {
+      "epoch": 0.152,
+      "grad_norm": 0.8920683860778809,
+      "learning_rate": 2.3752948590949766e-06,
+      "loss": 0.8058,
       "step": 3800
     },
     {
+      "epoch": 0.156,
+      "grad_norm": 0.8848472237586975,
+      "learning_rate": 2.368296021728002e-06,
+      "loss": 0.8209,
       "step": 3900
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.9708815217018127,
+      "learning_rate": 2.3611170069726532e-06,
+      "loss": 0.8216,
       "step": 4000
     },
     {
+      "epoch": 0.16,
+      "eval_loss": 1.5283503532409668,
+      "eval_runtime": 98.9755,
+      "eval_samples_per_second": 138.368,
+      "eval_steps_per_second": 2.162,
       "step": 4000
     },
     {
+      "epoch": 0.164,
+      "grad_norm": 0.8715313673019409,
+      "learning_rate": 2.3537589714959523e-06,
+      "loss": 0.8185,
       "step": 4100
     },
     {
+      "epoch": 0.168,
+      "grad_norm": 0.9748795032501221,
+      "learning_rate": 2.346223100808346e-06,
+      "loss": 0.8172,
       "step": 4200
     },
     {
+      "epoch": 0.172,
+      "grad_norm": 0.900182843208313,
+      "learning_rate": 2.3385106090726974e-06,
+      "loss": 0.8101,
       "step": 4300
     },
     {
+      "epoch": 0.176,
+      "grad_norm": 0.8882376551628113,
+      "learning_rate": 2.330622738908663e-06,
+      "loss": 0.8004,
       "step": 4400
     },
     {
+      "epoch": 0.18,
+      "grad_norm": 0.9087768793106079,
+      "learning_rate": 2.322560761192485e-06,
+      "loss": 0.8028,
       "step": 4500
     },
     {
+      "epoch": 0.184,
+      "grad_norm": 0.9928045868873596,
+      "learning_rate": 2.3143259748522308e-06,
+      "loss": 0.8257,
       "step": 4600
     },
     {
+      "epoch": 0.188,
+      "grad_norm": 0.9519675970077515,
+      "learning_rate": 2.3059197066585126e-06,
+      "loss": 0.817,
       "step": 4700
     },
     {
+      "epoch": 0.192,
+      "grad_norm": 0.970738410949707,
+      "learning_rate": 2.297343311010719e-06,
+      "loss": 0.8109,
       "step": 4800
     },
     {
+      "epoch": 0.196,
+      "grad_norm": 0.9740980267524719,
+      "learning_rate": 2.2885981697188002e-06,
+      "loss": 0.8168,
       "step": 4900
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.9454805850982666,
+      "learning_rate": 2.2796856917806313e-06,
+      "loss": 0.8305,
       "step": 5000
     },
     {
+      "epoch": 0.2,
+      "eval_loss": 1.5317082405090332,
+      "eval_runtime": 98.9715,
+      "eval_samples_per_second": 138.373,
+      "eval_steps_per_second": 2.162,
       "step": 5000
     },
     {
+      "epoch": 0.204,
+      "grad_norm": 0.9181498289108276,
+      "learning_rate": 2.270607313155e-06,
+      "loss": 0.807,
       "step": 5100
     },
     {
+      "epoch": 0.208,
+      "grad_norm": 0.8452897071838379,
+      "learning_rate": 2.2613644965302456e-06,
+      "loss": 0.802,
       "step": 5200
     },
     {
+      "epoch": 0.212,
+      "grad_norm": 0.8827036619186401,
+      "learning_rate": 2.251958731088596e-06,
+      "loss": 0.8001,
       "step": 5300
     },
     {
+      "epoch": 0.216,
+      "grad_norm": 0.8728039264678955,
+      "learning_rate": 2.242391532266232e-06,
+      "loss": 0.8211,
       "step": 5400
     },
     {
+      "epoch": 0.22,
+      "grad_norm": 0.9410618543624878,
+      "learning_rate": 2.2326644415091264e-06,
+      "loss": 0.7996,
       "step": 5500
     },
     {
+      "epoch": 0.224,
+      "grad_norm": 0.9829330444335938,
+      "learning_rate": 2.2227790260246856e-06,
+      "loss": 0.7971,
       "step": 5600
     },
     {
+      "epoch": 0.228,
+      "grad_norm": 0.9688398241996765,
+      "learning_rate": 2.2127368785292484e-06,
+      "loss": 0.7854,
       "step": 5700
     },
     {
+      "epoch": 0.232,
+      "grad_norm": 0.864470362663269,
+      "learning_rate": 2.2025396169914697e-06,
+      "loss": 0.8192,
       "step": 5800
     },
     {
+      "epoch": 0.236,
+      "grad_norm": 0.9038395881652832,
+      "learning_rate": 2.1921888843716356e-06,
+      "loss": 0.8005,
       "step": 5900
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.8807651996612549,
+      "learning_rate": 2.181686348356955e-06,
+      "loss": 0.806,
       "step": 6000
     },
     {
+      "epoch": 0.24,
+      "eval_loss": 1.524116039276123,
+      "eval_runtime": 99.2477,
+      "eval_samples_per_second": 137.988,
+      "eval_steps_per_second": 2.156,
       "step": 6000
     },
     {
+      "epoch": 0.244,
+      "grad_norm": 1.0644515752792358,
+      "learning_rate": 2.1710337010928655e-06,
+      "loss": 0.8232,
       "step": 6100
     },
     {
+      "epoch": 0.248,
+      "grad_norm": 0.9187564253807068,
+      "learning_rate": 2.1602326589103967e-06,
+      "loss": 0.8036,
       "step": 6200
     },
     {
+      "epoch": 0.252,
+      "grad_norm": 0.9233301877975464,
+      "learning_rate": 2.1492849620496414e-06,
+      "loss": 0.8118,
       "step": 6300
     },
     {
+      "epoch": 0.256,
+      "grad_norm": 0.9559895396232605,
+      "learning_rate": 2.13819237437937e-06,
+      "loss": 0.7959,
       "step": 6400
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 0.8455320000648499,
+      "learning_rate": 2.126956683112842e-06,
+      "loss": 0.8254,
       "step": 6500
     },
     {
+      "epoch": 0.264,
+      "grad_norm": 0.942471444606781,
+      "learning_rate": 2.1155796985198495e-06,
+      "loss": 0.808,
       "step": 6600
     },
     {
+      "epoch": 0.268,
+      "grad_norm": 0.8535305261611938,
+      "learning_rate": 2.1040632536350573e-06,
+      "loss": 0.8182,
       "step": 6700
     },
     {
+      "epoch": 0.272,
+      "grad_norm": 0.8879380226135254,
+      "learning_rate": 2.092409203962663e-06,
+      "loss": 0.8177,
       "step": 6800
     },
     {
+      "epoch": 0.276,
+      "grad_norm": 0.8684147000312805,
+      "learning_rate": 2.080619427177443e-06,
+      "loss": 0.7982,
       "step": 6900
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.9437069892883301,
+      "learning_rate": 2.0686958228222298e-06,
+      "loss": 0.7984,
       "step": 7000
     },
     {
+      "epoch": 0.28,
+      "eval_loss": 1.530232548713684,
+      "eval_runtime": 99.3518,
+      "eval_samples_per_second": 137.844,
+      "eval_steps_per_second": 2.154,
       "step": 7000
     },
     {
+      "epoch": 0.284,
+      "grad_norm": 0.9226755499839783,
+      "learning_rate": 2.056640312001856e-06,
+      "loss": 0.8072,
       "step": 7100
     },
     {
+      "epoch": 0.288,
+      "grad_norm": 0.9192745685577393,
+      "learning_rate": 2.0444548370736335e-06,
+      "loss": 0.8081,
       "step": 7200
     },
     {
+      "epoch": 0.292,
+      "grad_norm": 1.026985764503479,
+      "learning_rate": 2.032141361334406e-06,
+      "loss": 0.8074,
       "step": 7300
     },
     {
+      "epoch": 0.296,
+      "grad_norm": 0.8428290486335754,
+      "learning_rate": 2.019701868704224e-06,
+      "loss": 0.8081,
       "step": 7400
     },
     {
+      "epoch": 0.3,
+      "grad_norm": 0.9866459369659424,
+      "learning_rate": 2.007138363406702e-06,
+      "loss": 0.8241,
       "step": 7500
     },
     {
+      "epoch": 0.304,
+      "grad_norm": 0.9240759015083313,
+      "learning_rate": 1.9944528696461016e-06,
+      "loss": 0.8089,
       "step": 7600
     },
     {
+      "epoch": 0.308,
+      "grad_norm": 0.8980386853218079,
+      "learning_rate": 1.9816474312811984e-06,
+      "loss": 0.7995,
       "step": 7700
     },
     {
+      "epoch": 0.312,
+      "grad_norm": 0.9766695499420166,
+      "learning_rate": 1.9687241114959753e-06,
+      "loss": 0.7969,
       "step": 7800
     },
     {
+      "epoch": 0.316,
+      "grad_norm": 0.8739997148513794,
+      "learning_rate": 1.955684992467211e-06,
+      "loss": 0.8053,
       "step": 7900
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.9071422219276428,
+      "learning_rate": 1.942532175029003e-06,
+      "loss": 0.7896,
       "step": 8000
     },
     {
+      "epoch": 0.32,
+      "eval_loss": 1.5243619680404663,
+      "eval_runtime": 99.5243,
+      "eval_samples_per_second": 137.605,
+      "eval_steps_per_second": 2.15,
       "step": 8000
     },
     {
+      "epoch": 0.324,
+      "grad_norm": 0.9778127670288086,
+      "learning_rate": 1.929267778334285e-06,
+      "loss": 0.7878,
       "step": 8100
     },
     {
+      "epoch": 0.328,
+      "grad_norm": 0.9122934937477112,
+      "learning_rate": 1.915893939513396e-06,
+      "loss": 0.7967,
       "step": 8200
     },
     {
+      "epoch": 0.332,
+      "grad_norm": 0.90513676404953,
+      "learning_rate": 1.9024128133297467e-06,
+      "loss": 0.8048,
       "step": 8300
     },
     {
+      "epoch": 0.336,
+      "grad_norm": 0.9107154607772827,
+      "learning_rate": 1.8888265718326532e-06,
+      "loss": 0.7944,
       "step": 8400
     },
     {
+      "epoch": 0.34,
+      "grad_norm": 0.8964477777481079,
+      "learning_rate": 1.8751374040073774e-06,
+      "loss": 0.7958,
       "step": 8500
     },
     {
+      "epoch": 0.344,
+      "grad_norm": 0.9018213152885437,
+      "learning_rate": 1.8613475154224456e-06,
+      "loss": 0.8065,
       "step": 8600
     },
     {
+      "epoch": 0.348,
+      "grad_norm": 0.9653429985046387,
+      "learning_rate": 1.8474591278742894e-06,
+      "loss": 0.8194,
       "step": 8700
     },
     {
+      "epoch": 0.352,
+      "grad_norm": 0.9324017763137817,
+      "learning_rate": 1.8334744790292766e-06,
+      "loss": 0.796,
       "step": 8800
     },
     {
+      "epoch": 0.356,
+      "grad_norm": 1.0298709869384766,
+      "learning_rate": 1.8193958220631833e-06,
+      "loss": 0.8268,
       "step": 8900
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.8846196532249451,
+      "learning_rate": 1.805225425298166e-06,
+      "loss": 0.825,
       "step": 9000
     },
     {
+      "epoch": 0.36,
+      "eval_loss": 1.5243308544158936,
+      "eval_runtime": 100.5198,
+      "eval_samples_per_second": 136.242,
+      "eval_steps_per_second": 2.129,
       "step": 9000
     },
     {
+      "epoch": 0.364,
+      "grad_norm": 0.8830705881118774,
+      "learning_rate": 1.790965571837296e-06,
+      "loss": 0.8233,
       "step": 9100
     },
     {
+      "epoch": 0.368,
+      "grad_norm": 0.9197975993156433,
+      "learning_rate": 1.7766185591967092e-06,
+      "loss": 0.8299,
       "step": 9200
     },
     {
+      "epoch": 0.372,
+      "grad_norm": 1.0428673028945923,
+      "learning_rate": 1.762186698935437e-06,
+      "loss": 0.8182,
       "step": 9300
     },
     {
+      "epoch": 0.376,
+      "grad_norm": 0.9466006755828857,
+      "learning_rate": 1.7476723162829723e-06,
+      "loss": 0.8255,
       "step": 9400
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 0.9237021803855896,
+      "learning_rate": 1.7330777497646328e-06,
+      "loss": 0.7672,
       "step": 9500
     },
     {
+      "epoch": 0.384,
+      "grad_norm": 0.917202353477478,
+      "learning_rate": 1.7184053508247853e-06,
+      "loss": 0.8427,
       "step": 9600
     },
     {
+      "epoch": 0.388,
+      "grad_norm": 0.9462612271308899,
+      "learning_rate": 1.703657483447983e-06,
+      "loss": 0.8409,
       "step": 9700
     },
     {
+      "epoch": 0.392,
+      "grad_norm": 0.8924245834350586,
+      "learning_rate": 1.6888365237780886e-06,
+      "loss": 0.8335,
       "step": 9800
     },
     {
+      "epoch": 0.396,
+      "grad_norm": 0.9719087481498718,
+      "learning_rate": 1.6739448597354327e-06,
+      "loss": 0.826,
       "step": 9900
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.8893173336982727,
+      "learning_rate": 1.6589848906320794e-06,
+      "loss": 0.8326,
       "step": 10000
     },
     {
+      "epoch": 0.4,
+      "eval_loss": 1.5264792442321777,
+      "eval_runtime": 101.5699,
+      "eval_samples_per_second": 134.833,
+      "eval_steps_per_second": 2.107,
       "step": 10000
     },
     {
+      "epoch": 0.404,
+      "grad_norm": 0.8719335198402405,
+      "learning_rate": 1.6439590267852528e-06,
+      "loss": 0.8198,
       "step": 10100
     },
     {
+      "epoch": 0.408,
+      "grad_norm": 0.8997857570648193,
+      "learning_rate": 1.6288696891289938e-06,
+      "loss": 0.8103,
       "step": 10200
     },
     {
+      "epoch": 0.412,
+      "grad_norm": 0.9756138920783997,
+      "learning_rate": 1.6137193088241021e-06,
+      "loss": 0.8245,
       "step": 10300
     },
     {
+      "epoch": 0.416,
+      "grad_norm": 1.009027123451233,
+      "learning_rate": 1.598510326866435e-06,
+      "loss": 0.8226,
       "step": 10400
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 0.9941139221191406,
+      "learning_rate": 1.583245193693619e-06,
+      "loss": 0.8154,
       "step": 10500
     },
     {
+      "epoch": 0.424,
+      "grad_norm": 0.9156614542007446,
+      "learning_rate": 1.5679263687902402e-06,
+      "loss": 0.8194,
       "step": 10600
     },
     {
+      "epoch": 0.428,
+      "grad_norm": 0.9270005226135254,
+      "learning_rate": 1.552556320291578e-06,
+      "loss": 0.8144,
       "step": 10700
     },
     {
+      "epoch": 0.432,
+      "grad_norm": 0.9664807915687561,
+      "learning_rate": 1.5371375245859446e-06,
+      "loss": 0.823,
       "step": 10800
     },
     {
+      "epoch": 0.436,
+      "grad_norm": 0.9909628629684448,
+      "learning_rate": 1.5216724659156944e-06,
+      "loss": 0.8319,
       "step": 10900
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 1.0144808292388916,
+      "learning_rate": 1.506163635976969e-06,
+      "loss": 0.8272,
       "step": 11000
     },
     {
+      "epoch": 0.44,
+      "eval_loss": 1.5209919214248657,
+      "eval_runtime": 101.3638,
+      "eval_samples_per_second": 135.107,
+      "eval_steps_per_second": 2.111,
       "step": 11000
     },
     {
+      "epoch": 0.444,
+      "grad_norm": 0.9689117074012756,
+      "learning_rate": 1.49061353351824e-06,
+      "loss": 0.8408,
       "step": 11100
     },
     {
+      "epoch": 0.448,
+      "grad_norm": 1.0267921686172485,
+      "learning_rate": 1.4750246639377161e-06,
+      "loss": 0.8362,
       "step": 11200
     },
     {
+      "epoch": 0.452,
+      "grad_norm": 0.920600175857544,
+      "learning_rate": 1.4593995388796797e-06,
+      "loss": 0.8343,
       "step": 11300
     },
     {
+      "epoch": 0.456,
+      "grad_norm": 1.025995135307312,
+      "learning_rate": 1.4437406758298156e-06,
+      "loss": 0.8255,
       "step": 11400
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 0.889402449131012,
+      "learning_rate": 1.428050597709599e-06,
+      "loss": 0.839,
       "step": 11500
     },
     {
+      "epoch": 0.464,
+      "grad_norm": 0.8957056999206543,
+      "learning_rate": 1.412331832469809e-06,
+      "loss": 0.8304,
       "step": 11600
     },
     {
+      "epoch": 0.468,
+      "grad_norm": 0.9389684796333313,
+      "learning_rate": 1.39658691268323e-06,
+      "loss": 0.8523,
       "step": 11700
     },
     {
+      "epoch": 0.472,
+      "grad_norm": 0.9115435481071472,
+      "learning_rate": 1.3808183751366089e-06,
+      "loss": 0.8421,
       "step": 11800
     },
     {
+      "epoch": 0.476,
+      "grad_norm": 0.9521908164024353,
+      "learning_rate": 1.3650287604219342e-06,
+      "loss": 0.8704,
       "step": 11900
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.9166862964630127,
+      "learning_rate": 1.3492206125271016e-06,
+      "loss": 0.8527,
       "step": 12000
     },
     {
+      "epoch": 0.48,
+      "eval_loss": 1.5229912996292114,
+      "eval_runtime": 101.6086,
+      "eval_samples_per_second": 134.782,
+      "eval_steps_per_second": 2.106,
       "step": 12000
     },
     {
+      "epoch": 0.484,
+      "grad_norm": 0.9557492733001709,
+      "learning_rate": 1.333396478426031e-06,
+      "loss": 0.8499,
       "step": 12100
     },
     {
+      "epoch": 0.488,
+      "grad_norm": 0.9957550764083862,
+      "learning_rate": 1.317558907668306e-06,
+      "loss": 0.8534,
       "step": 12200
     },
     {
+      "epoch": 0.492,
+      "grad_norm": 1.1370068788528442,
+      "learning_rate": 1.3017104519683932e-06,
+      "loss": 0.8336,
       "step": 12300
     },
     {
+      "epoch": 0.496,
+      "grad_norm": 0.9006808400154114,
+      "learning_rate": 1.285853664794518e-06,
+      "loss": 0.8196,
       "step": 12400
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 0.9441719651222229,
+      "learning_rate": 1.269991100957254e-06,
+      "loss": 0.844,
       "step": 12500
     },
     {
+      "epoch": 0.504,
+      "grad_norm": 0.8616164922714233,
+      "learning_rate": 1.2541253161978986e-06,
+      "loss": 0.8319,
       "step": 12600
     },
     {
+      "epoch": 0.508,
+      "grad_norm": 0.9243165850639343,
+      "learning_rate": 1.238258866776697e-06,
+      "loss": 0.8307,
       "step": 12700
     },
     {
+      "epoch": 0.512,
+      "grad_norm": 0.9617546796798706,
+      "learning_rate": 1.222394309060982e-06,
+      "loss": 0.8562,
       "step": 12800
     },
     {
+      "epoch": 0.516,
+      "grad_norm": 0.8897221684455872,
+      "learning_rate": 1.2065341991133013e-06,
+      "loss": 0.8344,
       "step": 12900
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.8364721536636353,
+      "learning_rate": 1.1906810922795864e-06,
+      "loss": 0.8389,
       "step": 13000
     },
     {
+      "epoch": 0.52,
+      "eval_loss": 1.5288289785385132,
+      "eval_runtime": 101.5647,
+      "eval_samples_per_second": 134.84,
+      "eval_steps_per_second": 2.107,
       "step": 13000
     },
     {
+      "epoch": 0.524,
+      "grad_norm": 1.0084967613220215,
+      "learning_rate": 1.1748375427774422e-06,
+      "loss": 0.8498,
       "step": 13100
     },
     {
+      "epoch": 0.528,
+      "grad_norm": 0.9439749717712402,
+      "learning_rate": 1.1590061032846182e-06,
+      "loss": 0.8509,
       "step": 13200
     },
     {
+      "epoch": 0.532,
+      "grad_norm": 0.8930461406707764,
+      "learning_rate": 1.1431893245277262e-06,
+      "loss": 0.8384,
       "step": 13300
     },
     {
+      "epoch": 0.536,
+      "grad_norm": 1.0605283975601196,
+      "learning_rate": 1.1273897548712726e-06,
+      "loss": 0.8557,
       "step": 13400
     },
     {
+      "epoch": 0.54,
+      "grad_norm": 0.8892098069190979,
+      "learning_rate": 1.11160993990707e-06,
+      "loss": 0.8378,
       "step": 13500
     },
     {
+      "epoch": 0.544,
+      "grad_norm": 0.9008782505989075,
+      "learning_rate": 1.0958524220440999e-06,
+      "loss": 0.8437,
       "step": 13600
     },
     {
+      "epoch": 0.548,
+      "grad_norm": 0.8771668672561646,
+      "learning_rate": 1.0801197400988838e-06,
+      "loss": 0.8512,
       "step": 13700
     },
     {
+      "epoch": 0.552,
+      "grad_norm": 0.9245998859405518,
+      "learning_rate": 1.0644144288864352e-06,
+      "loss": 0.8671,
       "step": 13800
     },
     {
+      "epoch": 0.556,
+      "grad_norm": 0.9122968912124634,
+      "learning_rate": 1.048739018811855e-06,
+      "loss": 0.8328,
       "step": 13900
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.9968782067298889,
+      "learning_rate": 1.0330960354626384e-06,
+      "loss": 0.851,
       "step": 14000
     },
     {
+      "epoch": 0.56,
+      "eval_loss": 1.5260618925094604,
+      "eval_runtime": 101.9042,
+      "eval_samples_per_second": 134.391,
+      "eval_steps_per_second": 2.1,
       "step": 14000
     },
     {
+      "epoch": 0.564,
+      "grad_norm": 1.0338596105575562,
+      "learning_rate": 1.0174879992017586e-06,
+      "loss": 0.8374,
       "step": 14100
     },
     {
+      "epoch": 0.568,
+      "grad_norm": 0.9291728734970093,
+      "learning_rate": 1.0019174247615919e-06,
+      "loss": 0.8356,
       "step": 14200
     },
     {
+      "epoch": 0.572,
+      "grad_norm": 0.8955647945404053,
+      "learning_rate": 9.863868208387473e-07,
+      "loss": 0.839,
       "step": 14300
     },
     {
+      "epoch": 0.576,
+      "grad_norm": 0.9726178050041199,
+      "learning_rate": 9.708986896898727e-07,
+      "loss": 0.8396,
       "step": 14400
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 0.9720205068588257,
+      "learning_rate": 9.554555267284956e-07,
+      "loss": 0.8334,
       "step": 14500
     },
     {
+      "epoch": 0.584,
+      "grad_norm": 0.9503899216651917,
+      "learning_rate": 9.400598201229705e-07,
+      "loss": 0.8165,
       "step": 14600
     },
     {
+      "epoch": 0.588,
+      "grad_norm": 0.8789735436439514,
+      "learning_rate": 9.247140503955863e-07,
+      "loss": 0.8262,
       "step": 14700
     },
     {
+      "epoch": 0.592,
+      "grad_norm": 1.4387589693069458,
+      "learning_rate": 9.09420690022913e-07,
+      "loss": 0.8378,
       "step": 14800
     },
     {
+      "epoch": 0.596,
+      "grad_norm": 1.1762765645980835,
+      "learning_rate": 8.941822030374405e-07,
+      "loss": 0.8428,
       "step": 14900
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.880807638168335,
+      "learning_rate": 8.790010446305814e-07,
+      "loss": 0.8254,
       "step": 15000
     },
     {
+      "epoch": 0.6,
+      "eval_loss": 1.5283499956130981,
+      "eval_runtime": 103.2419,
+      "eval_samples_per_second": 132.65,
+      "eval_steps_per_second": 2.073,
       "step": 15000
     },
     {
+      "epoch": 0.604,
+      "grad_norm": 0.9635188579559326,
+      "learning_rate": 8.63879660757092e-07,
+      "loss": 0.798,
       "step": 15100
     },
     {
+      "epoch": 0.608,
+      "grad_norm": 0.9472705721855164,
+      "learning_rate": 8.488204877409884e-07,
+      "loss": 0.8033,
       "step": 15200
     },
     {
+      "epoch": 0.612,
+      "grad_norm": 0.8378113508224487,
+      "learning_rate": 8.338259518830106e-07,
+      "loss": 0.8012,
       "step": 15300
     },
     {
+      "epoch": 0.616,
+      "grad_norm": 0.9451029300689697,
+      "learning_rate": 8.18898469069703e-07,
+      "loss": 0.8047,
       "step": 15400
     },
     {
+      "epoch": 0.62,
+      "grad_norm": 0.9609344005584717,
+      "learning_rate": 8.040404443841701e-07,
+      "loss": 0.7927,
       "step": 15500
     },
     {
+      "epoch": 0.624,
+      "grad_norm": 0.8947242498397827,
+      "learning_rate": 7.892542717185766e-07,
+      "loss": 0.7885,
       "step": 15600
     },
     {
+      "epoch": 0.628,
+      "grad_norm": 0.9105751514434814,
+      "learning_rate": 7.745423333884483e-07,
+      "loss": 0.801,
       "step": 15700
     },
     {
+      "epoch": 0.632,
+      "grad_norm": 0.899936854839325,
+      "learning_rate": 7.599069997488386e-07,
+      "loss": 0.8005,
       "step": 15800
     },
     {
+      "epoch": 0.636,
+      "grad_norm": 1.0273375511169434,
+      "learning_rate": 7.453506288124224e-07,
+      "loss": 0.8015,
       "step": 15900
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.8960332274436951,
+      "learning_rate": 7.308755658695775e-07,
+      "loss": 0.8074,
       "step": 16000
     },
     {
+      "epoch": 0.64,
+      "eval_loss": 1.5343570709228516,
+      "eval_runtime": 102.3372,
+      "eval_samples_per_second": 133.822,
+      "eval_steps_per_second": 2.091,
       "step": 16000
     },
     {
+      "epoch": 0.644,
+      "grad_norm": 0.8942509293556213,
+      "learning_rate": 7.164841431105172e-07,
+      "loss": 0.796,
       "step": 16100
     },
     {
+      "epoch": 0.648,
+      "grad_norm": 0.9353269934654236,
+      "learning_rate": 7.021786792495325e-07,
+      "loss": 0.8196,
       "step": 16200
     },
     {
+      "epoch": 0.652,
+      "grad_norm": 0.985683262348175,
+      "learning_rate": 6.879614791514075e-07,
+      "loss": 0.808,
       "step": 16300
     },
     {
+      "epoch": 0.656,
+      "grad_norm": 0.8981220722198486,
+      "learning_rate": 6.738348334600634e-07,
+      "loss": 0.8015,
       "step": 16400
     },
     {
+      "epoch": 0.66,
+      "grad_norm": 0.9412031173706055,
+      "learning_rate": 6.598010182294938e-07,
+      "loss": 0.8009,
       "step": 16500
     },
     {
+      "epoch": 0.664,
+      "grad_norm": 0.8926331996917725,
+      "learning_rate": 6.458622945570538e-07,
+      "loss": 0.783,
       "step": 16600
     },
     {
+      "epoch": 0.668,
+      "grad_norm": 0.8715830445289612,
+      "learning_rate": 6.320209082191569e-07,
+      "loss": 0.8127,
       "step": 16700
     },
     {
+      "epoch": 0.672,
+      "grad_norm": 0.8215272426605225,
+      "learning_rate": 6.182790893094402e-07,
+      "loss": 0.7958,
       "step": 16800
     },
     {
+      "epoch": 0.676,
+      "grad_norm": 0.9258244037628174,
+      "learning_rate": 6.046390518794556e-07,
+      "loss": 0.7931,
       "step": 16900
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 0.8930866122245789,
+      "learning_rate": 5.911029935819468e-07,
+      "loss": 0.7811,
       "step": 17000
     },
     {
+      "epoch": 0.68,
+      "eval_loss": 1.5324440002441406,
+      "eval_runtime": 102.3251,
+      "eval_samples_per_second": 133.838,
+      "eval_steps_per_second": 2.091,
       "step": 17000
     },
     {
+      "epoch": 0.684,
+      "grad_norm": 0.9415869116783142,
+      "learning_rate": 5.776730953167705e-07,
+      "loss": 0.8003,
       "step": 17100
     },
     {
+      "epoch": 0.688,
+      "grad_norm": 0.892819344997406,
+      "learning_rate": 5.643515208795141e-07,
+      "loss": 0.7943,
       "step": 17200
     },
     {
+      "epoch": 0.692,
+      "grad_norm": 0.9383297562599182,
+      "learning_rate": 5.511404166128647e-07,
+      "loss": 0.7998,
       "step": 17300
     },
     {
+      "epoch": 0.696,
+      "grad_norm": 0.8630228638648987,
+      "learning_rate": 5.380419110608033e-07,
+      "loss": 0.7949,
       "step": 17400
     },
     {
+      "epoch": 0.7,
+      "grad_norm": 0.9032106995582581,
+      "learning_rate": 5.250581146256524e-07,
+      "loss": 0.7928,
       "step": 17500
     },
     {
+      "epoch": 0.704,
+      "grad_norm": 0.9039574265480042,
+      "learning_rate": 5.121911192280557e-07,
+      "loss": 0.8012,
       "step": 17600
     },
     {
+      "epoch": 0.708,
+      "grad_norm": 0.9616802334785461,
+      "learning_rate": 4.994429979699302e-07,
+      "loss": 0.7964,
       "step": 17700
     },
     {
+      "epoch": 0.712,
+      "grad_norm": 0.9427072405815125,
+      "learning_rate": 4.868158048004537e-07,
+      "loss": 0.805,
       "step": 17800
     },
     {
+      "epoch": 0.716,
+      "grad_norm": 0.9399961829185486,
+      "learning_rate": 4.743115741851383e-07,
+      "loss": 0.7913,
       "step": 17900
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 0.8884769678115845,
+      "learning_rate": 4.6193232077804006e-07,
+      "loss": 0.7985,
       "step": 18000
     },
     {
+      "epoch": 0.72,
+      "eval_loss": 1.5309633016586304,
+      "eval_runtime": 102.8962,
+      "eval_samples_per_second": 133.095,
+      "eval_steps_per_second": 2.08,
       "step": 18000
     },
     {
+      "epoch": 0.724,
+      "grad_norm": 0.9725548028945923,
+      "learning_rate": 4.4968003909716243e-07,
+      "loss": 0.8162,
       "step": 18100
     },
     {
+      "epoch": 0.728,
+      "grad_norm": 1.0075186491012573,
+      "learning_rate": 4.3755670320310443e-07,
+      "loss": 0.8054,
       "step": 18200
     },
     {
+      "epoch": 0.732,
+      "grad_norm": 0.8749048113822937,
+      "learning_rate": 4.2556426638100555e-07,
+      "loss": 0.8056,
       "step": 18300
     },
     {
+      "epoch": 0.736,
+      "grad_norm": 0.9941290616989136,
+      "learning_rate": 4.1370466082583353e-07,
+      "loss": 0.8052,
       "step": 18400
     },
     {
+      "epoch": 0.74,
+      "grad_norm": 0.8676705956459045,
+      "learning_rate": 4.0197979733107755e-07,
+      "loss": 0.7861,
       "step": 18500
     },
     {
+      "epoch": 0.744,
+      "grad_norm": 0.9036993980407715,
+      "learning_rate": 3.903915649808812e-07,
+      "loss": 0.8081,
       "step": 18600
     },
     {
+      "epoch": 0.748,
+      "grad_norm": 0.9067134261131287,
+      "learning_rate": 3.789418308456812e-07,
+      "loss": 0.7956,
       "step": 18700
     },
     {
+      "epoch": 0.752,
+      "grad_norm": 0.8146563768386841,
+      "learning_rate": 3.676324396813856e-07,
+      "loss": 0.8031,
       "step": 18800
     },
     {
+      "epoch": 0.756,
+      "grad_norm": 0.9973321557044983,
+      "learning_rate": 3.5646521363215447e-07,
+      "loss": 0.794,
       "step": 18900
     },
     {
+      "epoch": 0.76,
+      "grad_norm": 0.9761902689933777,
+      "learning_rate": 3.4544195193681615e-07,
+      "loss": 0.7816,
       "step": 19000
     },
     {
+      "epoch": 0.76,
+      "eval_loss": 1.5294893980026245,
+      "eval_runtime": 102.4113,
+      "eval_samples_per_second": 133.726,
+      "eval_steps_per_second": 2.09,
       "step": 19000
     },
     {
+      "epoch": 0.764,
+      "grad_norm": 0.8643273115158081,
+      "learning_rate": 3.3456443063898157e-07,
+      "loss": 0.7917,
       "step": 19100
     },
     {
+      "epoch": 0.768,
+      "grad_norm": 0.9306071400642395,
+      "learning_rate": 3.238344023008888e-07,
+      "loss": 0.8012,
       "step": 19200
     },
     {
+      "epoch": 0.772,
+      "grad_norm": 0.9324482083320618,
+      "learning_rate": 3.132535957210366e-07,
+      "loss": 0.7929,
       "step": 19300
     },
     {
+      "epoch": 0.776,
+      "grad_norm": 0.8625467419624329,
+      "learning_rate": 3.0282371565564324e-07,
+      "loss": 0.7815,
       "step": 19400
     },
     {
+      "epoch": 0.78,
+      "grad_norm": 0.8669098019599915,
+      "learning_rate": 2.925464425439789e-07,
+      "loss": 0.8214,
       "step": 19500
     },
     {
+      "epoch": 0.784,
+      "grad_norm": 0.8781657814979553,
+      "learning_rate": 2.824234322376185e-07,
+      "loss": 0.7941,
       "step": 19600
     },
     {
+      "epoch": 0.788,
+      "grad_norm": 0.8899013996124268,
+      "learning_rate": 2.724563157336542e-07,
+      "loss": 0.7966,
       "step": 19700
     },
     {
+      "epoch": 0.792,
+      "grad_norm": 0.9773925542831421,
+      "learning_rate": 2.626466989119131e-07,
+      "loss": 0.8009,
       "step": 19800
     },
     {
+      "epoch": 0.796,
+      "grad_norm": 0.912438690662384,
+      "learning_rate": 2.5299616227621946e-07,
+      "loss": 0.7902,
       "step": 19900
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 0.9557161927223206,
+      "learning_rate": 2.435062606997499e-07,
+      "loss": 0.7889,
       "step": 20000
     },
     {
+      "epoch": 0.8,
+      "eval_loss": 1.5292094945907593,
+      "eval_runtime": 102.5763,
+      "eval_samples_per_second": 133.51,
+      "eval_steps_per_second": 2.086,
       "step": 20000
     },
     {
+      "epoch": 0.804,
+      "grad_norm": 0.8561129570007324,
+      "learning_rate": 2.3417852317451418e-07,
+      "loss": 0.8033,
       "step": 20100
     },
     {
+      "epoch": 0.808,
+      "grad_norm": 0.9422599673271179,
+      "learning_rate": 2.250144525650086e-07,
+      "loss": 0.7985,
       "step": 20200
     },
     {
+      "epoch": 0.812,
+      "grad_norm": 0.8980026245117188,
+      "learning_rate": 2.160155253660759e-07,
+      "loss": 0.7951,
       "step": 20300
     },
     {
+      "epoch": 0.816,
+      "grad_norm": 0.8675551414489746,
+      "learning_rate": 2.071831914650173e-07,
+      "loss": 0.7994,
       "step": 20400
     },
     {
+      "epoch": 0.82,
+      "grad_norm": 0.8988806009292603,
+      "learning_rate": 1.9851887390798922e-07,
+      "loss": 0.7875,
       "step": 20500
     },
     {
+      "epoch": 0.824,
+      "grad_norm": 0.9102202653884888,
+      "learning_rate": 1.9002396867072587e-07,
+      "loss": 0.7993,
       "step": 20600
     },
     {
+      "epoch": 0.828,
+      "grad_norm": 0.9096868634223938,
+      "learning_rate": 1.816998444336214e-07,
+      "loss": 0.7704,
       "step": 20700
     },
     {
+      "epoch": 0.832,
+      "grad_norm": 0.9461880922317505,
+      "learning_rate": 1.7354784236121206e-07,
+      "loss": 0.7853,
       "step": 20800
     },
     {
+      "epoch": 0.836,
+      "grad_norm": 0.9219881296157837,
+      "learning_rate": 1.6556927588609078e-07,
+      "loss": 0.7857,
       "step": 20900
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 0.8964762687683105,
+      "learning_rate": 1.577654304972899e-07,
+      "loss": 0.7872,
       "step": 21000
     },
     {
+      "epoch": 0.84,
+      "eval_loss": 1.524131178855896,
+      "eval_runtime": 102.4749,
+      "eval_samples_per_second": 133.642,
+      "eval_steps_per_second": 2.088,
       "step": 21000
     },
     {
+      "epoch": 0.844,
+      "grad_norm": 0.9355736970901489,
+      "learning_rate": 1.501375635331652e-07,
+      "loss": 0.7957,
       "step": 21100
     },
     {
+      "epoch": 0.848,
+      "grad_norm": 0.8686819076538086,
+      "learning_rate": 1.4268690397881675e-07,
+      "loss": 0.793,
       "step": 21200
     },
     {
+      "epoch": 0.852,
+      "grad_norm": 0.874756395816803,
+      "learning_rate": 1.3541465226807813e-07,
+      "loss": 0.7878,
       "step": 21300
     },
     {
+      "epoch": 0.856,
+      "grad_norm": 0.9285154342651367,
+      "learning_rate": 1.283219800901045e-07,
+      "loss": 0.7547,
       "step": 21400
     },
     {
+      "epoch": 0.86,
+      "grad_norm": 0.9496791958808899,
+      "learning_rate": 1.2141003020059273e-07,
+      "loss": 0.7885,
       "step": 21500
     },
     {
+      "epoch": 0.864,
+      "grad_norm": 0.879410445690155,
+      "learning_rate": 1.1467991623766287e-07,
+      "loss": 0.8123,
       "step": 21600
     },
     {
+      "epoch": 0.868,
+      "grad_norm": 0.942361056804657,
+      "learning_rate": 1.081327225424321e-07,
+      "loss": 0.817,
       "step": 21700
     },
     {
+      "epoch": 0.872,
+      "grad_norm": 0.9548047184944153,
+      "learning_rate": 1.0176950398430752e-07,
+      "loss": 0.7925,
       "step": 21800
     },
     {
+      "epoch": 0.876,
+      "grad_norm": 0.8643764853477478,
+      "learning_rate": 9.559128579102767e-08,
+      "loss": 0.7985,
       "step": 21900
     },
     {
+      "epoch": 0.88,
+      "grad_norm": 0.9450801014900208,
+      "learning_rate": 8.959906338348007e-08,
+      "loss": 0.7975,
       "step": 22000
     },
     {
+      "epoch": 0.88,
+      "eval_loss": 1.5321519374847412,
+      "eval_runtime": 103.5374,
+      "eval_samples_per_second": 132.271,
+      "eval_steps_per_second": 2.067,
       "step": 22000
     },
     {
+      "epoch": 0.884,
+      "grad_norm": 0.9130359292030334,
+      "learning_rate": 8.37938022153223e-08,
+      "loss": 0.8005,
       "step": 22100
     },
     {
+      "epoch": 0.888,
+      "grad_norm": 0.8732690215110779,
+      "learning_rate": 7.817643761742891e-08,
+      "loss": 0.7857,
       "step": 22200
     },
     {
+      "epoch": 0.892,
+      "grad_norm": 0.9094323515892029,
+      "learning_rate": 7.274787464719338e-08,
+      "loss": 0.8096,
       "step": 22300
     },
     {
+      "epoch": 0.896,
+      "grad_norm": 0.8987523913383484,
+      "learning_rate": 6.75089879427078e-08,
+      "loss": 0.8072,
       "step": 22400
     },
     {
+      "epoch": 0.9,
+      "grad_norm": 0.9105306267738342,
+      "learning_rate": 6.246062158184241e-08,
+      "loss": 0.7968,
       "step": 22500
     },
     {
+      "epoch": 0.904,
+      "grad_norm": 0.8889061808586121,
+      "learning_rate": 5.7603588946250064e-08,
+      "loss": 0.7971,
       "step": 22600
     },
     {
+      "epoch": 0.908,
+      "grad_norm": 0.9296440482139587,
+      "learning_rate": 5.293867259031568e-08,
+      "loss": 0.7896,
       "step": 22700
     },
     {
+      "epoch": 0.912,
+      "grad_norm": 1.0374181270599365,
+      "learning_rate": 4.8466624115073164e-08,
+      "loss": 0.808,
       "step": 22800
     },
     {
+      "epoch": 1.00384,
+      "grad_norm": 0.8791893124580383,
+      "learning_rate": 4.4188164047108403e-08,
+      "loss": 0.7835,
       "step": 22900
     },
     {
+      "epoch": 1.00784,
+      "grad_norm": 0.8789498209953308,
+      "learning_rate": 4.010398172247104e-08,
+      "loss": 0.7987,
       "step": 23000
     },
     {
+      "epoch": 1.00784,
+      "eval_loss": 1.5310029983520508,
+      "eval_runtime": 101.8479,
+      "eval_samples_per_second": 134.465,
+      "eval_steps_per_second": 2.101,
       "step": 23000
     },
     {
+      "epoch": 1.01184,
+      "grad_norm": 0.9262071847915649,
+      "learning_rate": 3.6214735175608004e-08,
+      "loss": 0.7966,
       "step": 23100
     },
     {
+      "epoch": 1.01584,
+      "grad_norm": 0.8986383676528931,
+      "learning_rate": 3.252105103334499e-08,
+      "loss": 0.7954,
       "step": 23200
     },
     {
+      "epoch": 1.01984,
+      "grad_norm": 0.9548205733299255,
+      "learning_rate": 2.9023524413923365e-08,
+      "loss": 0.7934,
       "step": 23300
     },
     {
+      "epoch": 1.02384,
+      "grad_norm": 0.9211428165435791,
+      "learning_rate": 2.5722718831117656e-08,
+      "loss": 0.8068,
       "step": 23400
     },
     {
+      "epoch": 1.02784,
+      "grad_norm": 0.9240931272506714,
+      "learning_rate": 2.26191661034425e-08,
+      "loss": 0.787,
       "step": 23500
     },
     {
+      "epoch": 1.03184,
+      "grad_norm": 0.9866804480552673,
+      "learning_rate": 1.9713366268468148e-08,
+      "loss": 0.7929,
       "step": 23600
     },
     {
+      "epoch": 1.03584,
+      "grad_norm": 0.9947385787963867,
+      "learning_rate": 1.700578750225432e-08,
+      "loss": 0.7973,
       "step": 23700
     },
     {
+      "epoch": 1.03984,
+      "grad_norm": 0.8872534036636353,
+      "learning_rate": 1.4496866043919865e-08,
+      "loss": 0.7995,
       "step": 23800
     },
     {
+      "epoch": 1.04384,
+      "grad_norm": 0.8726480007171631,
+      "learning_rate": 1.2187006125356087e-08,
+      "loss": 0.7929,
       "step": 23900
     },
     {
+      "epoch": 1.04784,
+      "grad_norm": 0.881963849067688,
+      "learning_rate": 1.0076579906098255e-08,
+      "loss": 0.8044,
       "step": 24000
     },
     {
+      "epoch": 1.04784,
+      "eval_loss": 1.5276943445205688,
+      "eval_runtime": 99.4561,
+      "eval_samples_per_second": 137.699,
+      "eval_steps_per_second": 2.152,
       "step": 24000
     },
     {
+      "epoch": 1.0518399999999999,
+      "grad_norm": 0.8809722065925598,
+      "learning_rate": 8.16592741336386e-09,
+      "loss": 0.7832,
       "step": 24100
     },
     {
+      "epoch": 1.05584,
+      "grad_norm": 0.8471363186836243,
+      "learning_rate": 6.455356487267833e-09,
+      "loss": 0.7815,
       "step": 24200
     },
     {
+      "epoch": 1.05984,
+      "grad_norm": 0.9595879912376404,
+      "learning_rate": 4.9451427312251224e-09,
+      "loss": 0.7943,
       "step": 24300
     },
     {
+      "epoch": 1.06384,
+      "grad_norm": 0.8937146663665771,
+      "learning_rate": 3.635529467544696e-09,
+      "loss": 0.8066,
       "step": 24400
     },
     {
+      "epoch": 1.06784,
+      "grad_norm": 0.9749945998191833,
+      "learning_rate": 2.526727698227288e-09,
+      "loss": 0.802,
       "step": 24500
     },
     {
+      "epoch": 1.07184,
+      "grad_norm": 0.919170081615448,
+      "learning_rate": 1.6189160709680074e-09,
+      "loss": 0.79,
       "step": 24600
     },
     {
+      "epoch": 1.07584,
+      "grad_norm": 0.9579231142997742,
+      "learning_rate": 9.122408503739466e-10,
+      "loss": 0.8092,
       "step": 24700
     },
     {
+      "epoch": 1.07984,
+      "grad_norm": 0.8257275223731995,
+      "learning_rate": 4.0681589439789395e-10,
+      "loss": 0.8028,
       "step": 24800
     },
     {
+      "epoch": 1.08384,
+      "grad_norm": 0.8641030788421631,
+      "learning_rate": 1.0272263599411803e-10,
+      "loss": 0.7852,
       "step": 24900
     },
     {
+      "epoch": 1.08784,
+      "grad_norm": 0.929093062877655,
+      "learning_rate": 1.006999733599301e-14,
+      "loss": 0.7867,
       "step": 25000
     },
     {
+      "epoch": 1.08784,
+      "eval_loss": 1.5288399457931519,
+      "eval_runtime": 99.9402,
+      "eval_samples_per_second": 137.032,
+      "eval_steps_per_second": 2.141,
       "step": 25000
     }
   ],
   "logging_steps": 100,
+  "max_steps": 25000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 9223372036854775807,
   "save_steps": 5000,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 3.3846277778817024e+18,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:399a4e8079b62b554066d23eba050b8905a3c51a8603b79dbe178069ff2eff81
 size 5841

 version https://git-lfs.github.com/spec/v1
+oid sha256:8468e002e14d69abcb2f7de8e401f9fa2561c9e9f59ee528d9d623ec438f38ae
 size 5841