Upload 10 files

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +3358 -4
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4e557f28dc70179a12b755e5e60b628849ed5aef82d5494c23999f1e52f3551
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:20566ce2cded3cc02fb93eee498468296e195bde2f327717d82d2153bf039a5c
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:93bba69e56e6b384a3969e59f495cf9a1964ef6b3dd15dc118184bc7ad1cbb79
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:c4bb4ca88b8d2d3ea2dfd7303f13e3a8cd59c49a916572576e2dc64da5c07512
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:119a4626b2861d53d5e22a804e127273c20e502df505ffeabd204f28a1b0f1bb
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:50c7b18601d8312eab9dd312837f003a894f9f32c0a047b958e34fe83b5149bb
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1273113bfb573ac8637333edb62886abe994c2a9319c15312bcea1286ef43c5b
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:9f734a52ddcea7feef7729d4ad9d1d723abcc8fb15cbcedadde156471860e8d2
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0446530735268672,
   "eval_steps": 1000,
-  "global_step": 270000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -21075,10 +21075,3364 @@
       "eval_samples_per_second": 195.474,
       "eval_steps_per_second": 1.534,
       "step": 270000
     }
   ],
   "logging_steps": 100,
-  "max_steps": 358318,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 9223372036854775807,
   "save_steps": 1000,
@@ -21094,7 +24448,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.356355071475712e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.02,
   "eval_steps": 1000,
+  "global_step": 313000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 195.474,
       "eval_steps_per_second": 1.534,
       "step": 270000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 0.9838644862174988,
+      "learning_rate": 7.406365143716071e-06,
+      "loss": 1.9287,
+      "step": 270100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 1.0142942667007446,
+      "learning_rate": 7.390642289947644e-06,
+      "loss": 1.9146,
+      "step": 270200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 1.0048279762268066,
+      "learning_rate": 7.374933247575938e-06,
+      "loss": 1.921,
+      "step": 270300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 1.061614751815796,
+      "learning_rate": 7.359238028921914e-06,
+      "loss": 1.9098,
+      "step": 270400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 1.0184416770935059,
+      "learning_rate": 7.343556646295647e-06,
+      "loss": 1.9307,
+      "step": 270500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 1.0567371845245361,
+      "learning_rate": 7.327889111996397e-06,
+      "loss": 1.9093,
+      "step": 270600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 1.0020757913589478,
+      "learning_rate": 7.312235438312537e-06,
+      "loss": 1.9089,
+      "step": 270700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 0.9947327375411987,
+      "learning_rate": 7.296595637521581e-06,
+      "loss": 1.9175,
+      "step": 270800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 0.9927939176559448,
+      "learning_rate": 7.280969721890163e-06,
+      "loss": 1.9116,
+      "step": 270900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 0.994209885597229,
+      "learning_rate": 7.26535770367403e-06,
+      "loss": 1.9031,
+      "step": 271000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.1892189979553223,
+      "eval_runtime": 53.5994,
+      "eval_samples_per_second": 190.189,
+      "eval_steps_per_second": 1.493,
+      "step": 271000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 0.9945200085639954,
+      "learning_rate": 7.249759595118011e-06,
+      "loss": 1.9045,
+      "step": 271100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 0.9387146234512329,
+      "learning_rate": 7.234175408456037e-06,
+      "loss": 1.9048,
+      "step": 271200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 0.9996144771575928,
+      "learning_rate": 7.218605155911126e-06,
+      "loss": 1.9089,
+      "step": 271300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 0.9891520142555237,
+      "learning_rate": 7.203048849695357e-06,
+      "loss": 1.9093,
+      "step": 271400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 1.0603066682815552,
+      "learning_rate": 7.187506502009886e-06,
+      "loss": 1.8988,
+      "step": 271500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 1.0593341588974,
+      "learning_rate": 7.17197812504489e-06,
+      "loss": 1.9138,
+      "step": 271600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 1.0183734893798828,
+      "learning_rate": 7.156463730979626e-06,
+      "loss": 1.9011,
+      "step": 271700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 0.9992024302482605,
+      "learning_rate": 7.140963331982351e-06,
+      "loss": 1.9059,
+      "step": 271800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 0.9801898002624512,
+      "learning_rate": 7.125476940210371e-06,
+      "loss": 1.905,
+      "step": 271900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 0.965479850769043,
+      "learning_rate": 7.110004567809986e-06,
+      "loss": 1.9043,
+      "step": 272000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.1842684745788574,
+      "eval_runtime": 51.7157,
+      "eval_samples_per_second": 197.116,
+      "eval_steps_per_second": 1.547,
+      "step": 272000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 1.0330137014389038,
+      "learning_rate": 7.094546226916513e-06,
+      "loss": 1.9144,
+      "step": 272100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 0.9688111543655396,
+      "learning_rate": 7.079101929654261e-06,
+      "loss": 1.9102,
+      "step": 272200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 0.9989941120147705,
+      "learning_rate": 7.06367168813653e-06,
+      "loss": 1.9074,
+      "step": 272300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 1.0278581380844116,
+      "learning_rate": 7.048255514465577e-06,
+      "loss": 1.8924,
+      "step": 272400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 0.9955400228500366,
+      "learning_rate": 7.032853420732644e-06,
+      "loss": 1.8814,
+      "step": 272500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 0.9963505864143372,
+      "learning_rate": 7.017465419017921e-06,
+      "loss": 1.8934,
+      "step": 272600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 1.0569164752960205,
+      "learning_rate": 7.002091521390555e-06,
+      "loss": 1.8939,
+      "step": 272700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 0.9949243068695068,
+      "learning_rate": 6.986731739908611e-06,
+      "loss": 1.9021,
+      "step": 272800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 1.0075616836547852,
+      "learning_rate": 6.971386086619103e-06,
+      "loss": 1.8978,
+      "step": 272900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 0.9863401651382446,
+      "learning_rate": 6.9560545735579606e-06,
+      "loss": 1.9168,
+      "step": 273000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.190558433532715,
+      "eval_runtime": 51.5702,
+      "eval_samples_per_second": 197.672,
+      "eval_steps_per_second": 1.551,
+      "step": 273000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 0.9959931969642639,
+      "learning_rate": 6.940737212750012e-06,
+      "loss": 1.8908,
+      "step": 273100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 1.0437434911727905,
+      "learning_rate": 6.9254340162089846e-06,
+      "loss": 1.892,
+      "step": 273200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 0.9680078625679016,
+      "learning_rate": 6.91014499593751e-06,
+      "loss": 1.8859,
+      "step": 273300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 0.9896761775016785,
+      "learning_rate": 6.894870163927095e-06,
+      "loss": 1.8885,
+      "step": 273400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 1.0668361186981201,
+      "learning_rate": 6.879609532158124e-06,
+      "loss": 1.9031,
+      "step": 273500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 0.9838683605194092,
+      "learning_rate": 6.864363112599823e-06,
+      "loss": 1.9065,
+      "step": 273600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 1.0146870613098145,
+      "learning_rate": 6.849130917210295e-06,
+      "loss": 1.8873,
+      "step": 273700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 0.959338366985321,
+      "learning_rate": 6.833912957936478e-06,
+      "loss": 1.8851,
+      "step": 273800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 1.032836675643921,
+      "learning_rate": 6.818709246714147e-06,
+      "loss": 1.8971,
+      "step": 273900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 0.9915603399276733,
+      "learning_rate": 6.803519795467888e-06,
+      "loss": 1.8906,
+      "step": 274000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.1938321590423584,
+      "eval_runtime": 51.6041,
+      "eval_samples_per_second": 197.543,
+      "eval_steps_per_second": 1.55,
+      "step": 274000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 0.9873210787773132,
+      "learning_rate": 6.788344616111117e-06,
+      "loss": 1.879,
+      "step": 274100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 0.9958903193473816,
+      "learning_rate": 6.773183720546056e-06,
+      "loss": 1.8915,
+      "step": 274200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 0.9812116026878357,
+      "learning_rate": 6.758037120663727e-06,
+      "loss": 1.8922,
+      "step": 274300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 1.0199834108352661,
+      "learning_rate": 6.742904828343921e-06,
+      "loss": 1.8928,
+      "step": 274400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 0.9892932772636414,
+      "learning_rate": 6.727786855455218e-06,
+      "loss": 1.8689,
+      "step": 274500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 0.9794331789016724,
+      "learning_rate": 6.712683213854973e-06,
+      "loss": 1.8766,
+      "step": 274600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 0.9654126763343811,
+      "learning_rate": 6.697593915389297e-06,
+      "loss": 1.8887,
+      "step": 274700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 0.9861681461334229,
+      "learning_rate": 6.682518971893053e-06,
+      "loss": 1.8936,
+      "step": 274800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 1.0138262510299683,
+      "learning_rate": 6.667458395189835e-06,
+      "loss": 1.8718,
+      "step": 274900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 0.9910663962364197,
+      "learning_rate": 6.652412197091979e-06,
+      "loss": 1.8931,
+      "step": 275000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.1973860263824463,
+      "eval_runtime": 51.7316,
+      "eval_samples_per_second": 197.056,
+      "eval_steps_per_second": 1.546,
+      "step": 275000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 0.9887643456459045,
+      "learning_rate": 6.637380389400538e-06,
+      "loss": 1.8915,
+      "step": 275100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 1.0442452430725098,
+      "learning_rate": 6.622362983905295e-06,
+      "loss": 1.8866,
+      "step": 275200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 1.025341272354126,
+      "learning_rate": 6.607359992384704e-06,
+      "loss": 1.8727,
+      "step": 275300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 1.0826934576034546,
+      "learning_rate": 6.592371426605942e-06,
+      "loss": 1.878,
+      "step": 275400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 0.9907537698745728,
+      "learning_rate": 6.5773972983248635e-06,
+      "loss": 1.8876,
+      "step": 275500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 1.0108195543289185,
+      "learning_rate": 6.562437619286002e-06,
+      "loss": 1.8791,
+      "step": 275600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 0.9989004731178284,
+      "learning_rate": 6.547492401222549e-06,
+      "loss": 1.8747,
+      "step": 275700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 1.0045630931854248,
+      "learning_rate": 6.532561655856351e-06,
+      "loss": 1.8863,
+      "step": 275800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 0.9753278493881226,
+      "learning_rate": 6.517645394897923e-06,
+      "loss": 1.8804,
+      "step": 275900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 0.9882794618606567,
+      "learning_rate": 6.5027436300464095e-06,
+      "loss": 1.8751,
+      "step": 276000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.195430040359497,
+      "eval_runtime": 51.7195,
+      "eval_samples_per_second": 197.102,
+      "eval_steps_per_second": 1.547,
+      "step": 276000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 1.0431910753250122,
+      "learning_rate": 6.487856372989573e-06,
+      "loss": 1.8739,
+      "step": 276100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 1.0198723077774048,
+      "learning_rate": 6.472983635403818e-06,
+      "loss": 1.8622,
+      "step": 276200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 1.0333479642868042,
+      "learning_rate": 6.458125428954146e-06,
+      "loss": 1.871,
+      "step": 276300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 0.9855126738548279,
+      "learning_rate": 6.443281765294177e-06,
+      "loss": 1.8632,
+      "step": 276400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 1.05318284034729,
+      "learning_rate": 6.4284526560661005e-06,
+      "loss": 1.8804,
+      "step": 276500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 1.0296765565872192,
+      "learning_rate": 6.41363811290071e-06,
+      "loss": 1.8752,
+      "step": 276600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 1.0334811210632324,
+      "learning_rate": 6.398838147417374e-06,
+      "loss": 1.8768,
+      "step": 276700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 0.9788868427276611,
+      "learning_rate": 6.384052771224022e-06,
+      "loss": 1.867,
+      "step": 276800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 1.0330471992492676,
+      "learning_rate": 6.369281995917134e-06,
+      "loss": 1.8668,
+      "step": 276900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 0.9711721539497375,
+      "learning_rate": 6.354525833081759e-06,
+      "loss": 1.8703,
+      "step": 277000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.19480037689209,
+      "eval_runtime": 51.7555,
+      "eval_samples_per_second": 196.964,
+      "eval_steps_per_second": 1.546,
+      "step": 277000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 1.0206748247146606,
+      "learning_rate": 6.339784294291454e-06,
+      "loss": 1.8639,
+      "step": 277100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 1.019838571548462,
+      "learning_rate": 6.325057391108341e-06,
+      "loss": 1.8703,
+      "step": 277200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 0.9485549330711365,
+      "learning_rate": 6.3103451350830316e-06,
+      "loss": 1.8753,
+      "step": 277300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 0.9893754124641418,
+      "learning_rate": 6.295647537754668e-06,
+      "loss": 1.8808,
+      "step": 277400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 0.9906275868415833,
+      "learning_rate": 6.280964610650894e-06,
+      "loss": 1.875,
+      "step": 277500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 1.0166252851486206,
+      "learning_rate": 6.266296365287844e-06,
+      "loss": 1.8808,
+      "step": 277600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 1.0124883651733398,
+      "learning_rate": 6.251642813170142e-06,
+      "loss": 1.8795,
+      "step": 277700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 1.0064804553985596,
+      "learning_rate": 6.237003965790872e-06,
+      "loss": 1.8692,
+      "step": 277800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 1.0103724002838135,
+      "learning_rate": 6.222379834631598e-06,
+      "loss": 1.8863,
+      "step": 277900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 1.0621378421783447,
+      "learning_rate": 6.207770431162343e-06,
+      "loss": 1.8616,
+      "step": 278000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.2066683769226074,
+      "eval_runtime": 51.8686,
+      "eval_samples_per_second": 196.535,
+      "eval_steps_per_second": 1.542,
+      "step": 278000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 1.0321729183197021,
+      "learning_rate": 6.1931757668415855e-06,
+      "loss": 1.8622,
+      "step": 278100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 1.0194209814071655,
+      "learning_rate": 6.178595853116212e-06,
+      "loss": 1.8701,
+      "step": 278200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 1.0299484729766846,
+      "learning_rate": 6.164030701421583e-06,
+      "loss": 1.8809,
+      "step": 278300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 1.0566426515579224,
+      "learning_rate": 6.149480323181439e-06,
+      "loss": 1.871,
+      "step": 278400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 1.0435720682144165,
+      "learning_rate": 6.134944729807971e-06,
+      "loss": 1.8587,
+      "step": 278500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 0.9985933303833008,
+      "learning_rate": 6.120423932701741e-06,
+      "loss": 1.8571,
+      "step": 278600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 1.0292407274246216,
+      "learning_rate": 6.1059179432517295e-06,
+      "loss": 1.8612,
+      "step": 278700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 1.0021073818206787,
+      "learning_rate": 6.091426772835293e-06,
+      "loss": 1.8695,
+      "step": 278800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 1.0160890817642212,
+      "learning_rate": 6.076950432818176e-06,
+      "loss": 1.8865,
+      "step": 278900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 1.002803921699524,
+      "learning_rate": 6.062488934554469e-06,
+      "loss": 1.8657,
+      "step": 279000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.2028017044067383,
+      "eval_runtime": 51.9876,
+      "eval_samples_per_second": 196.085,
+      "eval_steps_per_second": 1.539,
+      "step": 279000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 1.0246683359146118,
+      "learning_rate": 6.048042289386643e-06,
+      "loss": 1.8605,
+      "step": 279100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 1.0214561223983765,
+      "learning_rate": 6.033610508645507e-06,
+      "loss": 1.869,
+      "step": 279200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 1.0367177724838257,
+      "learning_rate": 6.019193603650225e-06,
+      "loss": 1.8564,
+      "step": 279300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 1.0507416725158691,
+      "learning_rate": 6.004791585708272e-06,
+      "loss": 1.8819,
+      "step": 279400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 1.0242899656295776,
+      "learning_rate": 5.990404466115465e-06,
+      "loss": 1.8804,
+      "step": 279500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 1.0185377597808838,
+      "learning_rate": 5.976032256155939e-06,
+      "loss": 1.873,
+      "step": 279600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 1.0047358274459839,
+      "learning_rate": 5.961674967102113e-06,
+      "loss": 1.8726,
+      "step": 279700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 1.0124741792678833,
+      "learning_rate": 5.9473326102147255e-06,
+      "loss": 1.8711,
+      "step": 279800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 1.0482338666915894,
+      "learning_rate": 5.933005196742783e-06,
+      "loss": 1.8599,
+      "step": 279900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 0.9833710193634033,
+      "learning_rate": 5.918692737923592e-06,
+      "loss": 1.8488,
+      "step": 280000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.2023353576660156,
+      "eval_runtime": 51.8909,
+      "eval_samples_per_second": 196.451,
+      "eval_steps_per_second": 1.542,
+      "step": 280000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 1.0626851320266724,
+      "learning_rate": 5.9043952449827275e-06,
+      "loss": 1.8484,
+      "step": 280100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 1.0383021831512451,
+      "learning_rate": 5.890112729134004e-06,
+      "loss": 1.8728,
+      "step": 280200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 1.0421721935272217,
+      "learning_rate": 5.875845201579513e-06,
+      "loss": 1.8676,
+      "step": 280300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 1.0611158609390259,
+      "learning_rate": 5.861592673509581e-06,
+      "loss": 1.8549,
+      "step": 280400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 1.0284640789031982,
+      "learning_rate": 5.847355156102771e-06,
+      "loss": 1.8523,
+      "step": 280500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 0.9916824698448181,
+      "learning_rate": 5.833132660525883e-06,
+      "loss": 1.864,
+      "step": 280600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 1.0592784881591797,
+      "learning_rate": 5.818925197933911e-06,
+      "loss": 1.8686,
+      "step": 280700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 1.014319896697998,
+      "learning_rate": 5.804732779470074e-06,
+      "loss": 1.8572,
+      "step": 280800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 1.017314076423645,
+      "learning_rate": 5.7905554162658025e-06,
+      "loss": 1.8666,
+      "step": 280900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 1.0180898904800415,
+      "learning_rate": 5.77639311944069e-06,
+      "loss": 1.8735,
+      "step": 281000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.203965187072754,
+      "eval_runtime": 52.0422,
+      "eval_samples_per_second": 195.879,
+      "eval_steps_per_second": 1.537,
+      "step": 281000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 0.9797289371490479,
+      "learning_rate": 5.762245900102545e-06,
+      "loss": 1.8685,
+      "step": 281100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 1.0343824625015259,
+      "learning_rate": 5.748113769347319e-06,
+      "loss": 1.836,
+      "step": 281200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 1.0038707256317139,
+      "learning_rate": 5.7339967382591534e-06,
+      "loss": 1.8629,
+      "step": 281300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 1.0676428079605103,
+      "learning_rate": 5.7198948179103455e-06,
+      "loss": 1.8547,
+      "step": 281400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 1.0127052068710327,
+      "learning_rate": 5.70580801936132e-06,
+      "loss": 1.8591,
+      "step": 281500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 0.9936082363128662,
+      "learning_rate": 5.6917363536606596e-06,
+      "loss": 1.827,
+      "step": 281600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 1.0413488149642944,
+      "learning_rate": 5.6776798318450755e-06,
+      "loss": 1.8518,
+      "step": 281700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 1.0689826011657715,
+      "learning_rate": 5.663638464939405e-06,
+      "loss": 1.8469,
+      "step": 281800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 1.0071433782577515,
+      "learning_rate": 5.64961226395658e-06,
+      "loss": 1.8499,
+      "step": 281900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 1.0158226490020752,
+      "learning_rate": 5.635601239897659e-06,
+      "loss": 1.8408,
+      "step": 282000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.204533338546753,
+      "eval_runtime": 52.0127,
+      "eval_samples_per_second": 195.991,
+      "eval_steps_per_second": 1.538,
+      "step": 282000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 1.0187814235687256,
+      "learning_rate": 5.6216054037517865e-06,
+      "loss": 1.8592,
+      "step": 282100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 1.0497030019760132,
+      "learning_rate": 5.607624766496203e-06,
+      "loss": 1.866,
+      "step": 282200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 1.0341991186141968,
+      "learning_rate": 5.5936593390962165e-06,
+      "loss": 1.8463,
+      "step": 282300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 1.0514994859695435,
+      "learning_rate": 5.579709132505203e-06,
+      "loss": 1.8384,
+      "step": 282400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 1.0009490251541138,
+      "learning_rate": 5.565774157664616e-06,
+      "loss": 1.8544,
+      "step": 282500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 1.033133625984192,
+      "learning_rate": 5.551854425503964e-06,
+      "loss": 1.871,
+      "step": 282600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 1.0105431079864502,
+      "learning_rate": 5.537949946940774e-06,
+      "loss": 1.8499,
+      "step": 282700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 1.0701624155044556,
+      "learning_rate": 5.524060732880637e-06,
+      "loss": 1.8447,
+      "step": 282800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 1.0628591775894165,
+      "learning_rate": 5.510186794217157e-06,
+      "loss": 1.8413,
+      "step": 282900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 1.0623475313186646,
+      "learning_rate": 5.4963281418319716e-06,
+      "loss": 1.8549,
+      "step": 283000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.2079403400421143,
+      "eval_runtime": 52.0695,
+      "eval_samples_per_second": 195.777,
+      "eval_steps_per_second": 1.536,
+      "step": 283000
+    },
+    {
+      "epoch": 0.036559703950122514,
+      "grad_norm": 1.002023696899414,
+      "learning_rate": 5.4824847865947045e-06,
+      "loss": 1.8312,
+      "step": 283100
+    },
+    {
+      "epoch": 0.03683878565966544,
+      "grad_norm": 1.057966947555542,
+      "learning_rate": 5.468656739363004e-06,
+      "loss": 1.8501,
+      "step": 283200
+    },
+    {
+      "epoch": 0.03711786736920836,
+      "grad_norm": 1.0365926027297974,
+      "learning_rate": 5.454844010982504e-06,
+      "loss": 1.861,
+      "step": 283300
+    },
+    {
+      "epoch": 0.037396949078751275,
+      "grad_norm": 0.9884259700775146,
+      "learning_rate": 5.441046612286827e-06,
+      "loss": 1.8495,
+      "step": 283400
+    },
+    {
+      "epoch": 0.0376760307882942,
+      "grad_norm": 1.0611577033996582,
+      "learning_rate": 5.427264554097555e-06,
+      "loss": 1.8521,
+      "step": 283500
+    },
+    {
+      "epoch": 0.03795511249783712,
+      "grad_norm": 1.0047646760940552,
+      "learning_rate": 5.413497847224272e-06,
+      "loss": 1.8497,
+      "step": 283600
+    },
+    {
+      "epoch": 0.038234194207380036,
+      "grad_norm": 1.0214877128601074,
+      "learning_rate": 5.399746502464479e-06,
+      "loss": 1.847,
+      "step": 283700
+    },
+    {
+      "epoch": 0.038513275916922954,
+      "grad_norm": 1.0316286087036133,
+      "learning_rate": 5.386010530603663e-06,
+      "loss": 1.8566,
+      "step": 283800
+    },
+    {
+      "epoch": 0.03879235762646588,
+      "grad_norm": 1.046181559562683,
+      "learning_rate": 5.3722899424152456e-06,
+      "loss": 1.856,
+      "step": 283900
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "grad_norm": 1.1351335048675537,
+      "learning_rate": 5.358584748660567e-06,
+      "loss": 1.857,
+      "step": 284000
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "eval_loss": 2.196443796157837,
+      "eval_runtime": 52.012,
+      "eval_samples_per_second": 195.993,
+      "eval_steps_per_second": 1.538,
+      "step": 284000
+    },
+    {
+      "epoch": 0.039350521045551715,
+      "grad_norm": 1.0209579467773438,
+      "learning_rate": 5.344894960088906e-06,
+      "loss": 1.8367,
+      "step": 284100
+    },
+    {
+      "epoch": 0.03962960275509464,
+      "grad_norm": 0.9978814125061035,
+      "learning_rate": 5.331220587437463e-06,
+      "loss": 1.8627,
+      "step": 284200
+    },
+    {
+      "epoch": 0.03990868446463756,
+      "grad_norm": 1.0793671607971191,
+      "learning_rate": 5.317561641431349e-06,
+      "loss": 1.8504,
+      "step": 284300
+    },
+    {
+      "epoch": 0.040187766174180475,
+      "grad_norm": 0.9899407029151917,
+      "learning_rate": 5.303918132783547e-06,
+      "loss": 1.8294,
+      "step": 284400
+    },
+    {
+      "epoch": 0.04046684788372339,
+      "grad_norm": 1.0489463806152344,
+      "learning_rate": 5.290290072194967e-06,
+      "loss": 1.8507,
+      "step": 284500
+    },
+    {
+      "epoch": 0.04074592959326632,
+      "grad_norm": 1.0309258699417114,
+      "learning_rate": 5.2766774703543855e-06,
+      "loss": 1.8558,
+      "step": 284600
+    },
+    {
+      "epoch": 0.041025011302809236,
+      "grad_norm": 1.0452024936676025,
+      "learning_rate": 5.2630803379384665e-06,
+      "loss": 1.8562,
+      "step": 284700
+    },
+    {
+      "epoch": 0.041304093012352154,
+      "grad_norm": 1.0156536102294922,
+      "learning_rate": 5.24949868561172e-06,
+      "loss": 1.8487,
+      "step": 284800
+    },
+    {
+      "epoch": 0.04158317472189508,
+      "grad_norm": 1.0449084043502808,
+      "learning_rate": 5.2359325240265375e-06,
+      "loss": 1.8601,
+      "step": 284900
+    },
+    {
+      "epoch": 0.041862256431438,
+      "grad_norm": 1.0443174839019775,
+      "learning_rate": 5.222381863823139e-06,
+      "loss": 1.8411,
+      "step": 285000
+    },
+    {
+      "epoch": 0.041862256431438,
+      "eval_loss": 2.207988739013672,
+      "eval_runtime": 51.9411,
+      "eval_samples_per_second": 196.261,
+      "eval_steps_per_second": 1.54,
+      "step": 285000
+    },
+    {
+      "epoch": 0.042141338140980915,
+      "grad_norm": 1.0384715795516968,
+      "learning_rate": 5.208846715629609e-06,
+      "loss": 1.8524,
+      "step": 285100
+    },
+    {
+      "epoch": 0.04242041985052384,
+      "grad_norm": 1.0093611478805542,
+      "learning_rate": 5.195327090061844e-06,
+      "loss": 1.848,
+      "step": 285200
+    },
+    {
+      "epoch": 0.04269950156006676,
+      "grad_norm": 1.0251661539077759,
+      "learning_rate": 5.181822997723582e-06,
+      "loss": 1.8428,
+      "step": 285300
+    },
+    {
+      "epoch": 0.042978583269609676,
+      "grad_norm": 1.056368112564087,
+      "learning_rate": 5.168334449206372e-06,
+      "loss": 1.858,
+      "step": 285400
+    },
+    {
+      "epoch": 0.043257664979152594,
+      "grad_norm": 1.0536158084869385,
+      "learning_rate": 5.154861455089577e-06,
+      "loss": 1.8375,
+      "step": 285500
+    },
+    {
+      "epoch": 0.04353674668869552,
+      "grad_norm": 1.0133367776870728,
+      "learning_rate": 5.141404025940341e-06,
+      "loss": 1.8382,
+      "step": 285600
+    },
+    {
+      "epoch": 0.043815828398238436,
+      "grad_norm": 1.0329740047454834,
+      "learning_rate": 5.127962172313624e-06,
+      "loss": 1.8546,
+      "step": 285700
+    },
+    {
+      "epoch": 0.044094910107781354,
+      "grad_norm": 1.0410008430480957,
+      "learning_rate": 5.114535904752157e-06,
+      "loss": 1.8303,
+      "step": 285800
+    },
+    {
+      "epoch": 0.04437399181732428,
+      "grad_norm": 1.0532476902008057,
+      "learning_rate": 5.1011252337864605e-06,
+      "loss": 1.8418,
+      "step": 285900
+    },
+    {
+      "epoch": 0.0446530735268672,
+      "grad_norm": 1.031091332435608,
+      "learning_rate": 5.087730169934793e-06,
+      "loss": 1.8524,
+      "step": 286000
+    },
+    {
+      "epoch": 0.0446530735268672,
+      "eval_loss": 2.2169249057769775,
+      "eval_runtime": 52.0382,
+      "eval_samples_per_second": 195.895,
+      "eval_steps_per_second": 1.537,
+      "step": 286000
+    },
+    {
+      "epoch": 0.044932155236410115,
+      "grad_norm": 1.0077662467956543,
+      "learning_rate": 5.0743507237032e-06,
+      "loss": 1.8372,
+      "step": 286100
+    },
+    {
+      "epoch": 0.04521123694595303,
+      "grad_norm": 0.9833052754402161,
+      "learning_rate": 5.0609869055854714e-06,
+      "loss": 1.8493,
+      "step": 286200
+    },
+    {
+      "epoch": 0.04549031865549596,
+      "grad_norm": 1.0318917036056519,
+      "learning_rate": 5.047638726063128e-06,
+      "loss": 1.8331,
+      "step": 286300
+    },
+    {
+      "epoch": 0.045769400365038876,
+      "grad_norm": 1.0132189989089966,
+      "learning_rate": 5.03430619560544e-06,
+      "loss": 1.836,
+      "step": 286400
+    },
+    {
+      "epoch": 0.046048482074581794,
+      "grad_norm": 1.016453742980957,
+      "learning_rate": 5.0209893246693895e-06,
+      "loss": 1.8561,
+      "step": 286500
+    },
+    {
+      "epoch": 0.04632756378412472,
+      "grad_norm": 1.0423955917358398,
+      "learning_rate": 5.007688123699686e-06,
+      "loss": 1.8488,
+      "step": 286600
+    },
+    {
+      "epoch": 0.04660664549366764,
+      "grad_norm": 1.0047492980957031,
+      "learning_rate": 4.994402603128751e-06,
+      "loss": 1.8384,
+      "step": 286700
+    },
+    {
+      "epoch": 0.046885727203210555,
+      "grad_norm": 1.0472545623779297,
+      "learning_rate": 4.981132773376704e-06,
+      "loss": 1.8359,
+      "step": 286800
+    },
+    {
+      "epoch": 0.04716480891275347,
+      "grad_norm": 1.0129274129867554,
+      "learning_rate": 4.967878644851351e-06,
+      "loss": 1.8428,
+      "step": 286900
+    },
+    {
+      "epoch": 0.0474438906222964,
+      "grad_norm": 1.0653069019317627,
+      "learning_rate": 4.954640227948188e-06,
+      "loss": 1.8516,
+      "step": 287000
+    },
+    {
+      "epoch": 0.0474438906222964,
+      "eval_loss": 2.207484483718872,
+      "eval_runtime": 51.9821,
+      "eval_samples_per_second": 196.106,
+      "eval_steps_per_second": 1.539,
+      "step": 287000
+    },
+    {
+      "epoch": 0.047722972331839315,
+      "grad_norm": 1.0424509048461914,
+      "learning_rate": 4.941417533050394e-06,
+      "loss": 1.8518,
+      "step": 287100
+    },
+    {
+      "epoch": 0.04800205404138223,
+      "grad_norm": 1.0581769943237305,
+      "learning_rate": 4.9282105705288185e-06,
+      "loss": 1.8239,
+      "step": 287200
+    },
+    {
+      "epoch": 0.04828113575092516,
+      "grad_norm": 1.0191422700881958,
+      "learning_rate": 4.9150193507419505e-06,
+      "loss": 1.8555,
+      "step": 287300
+    },
+    {
+      "epoch": 0.048560217460468076,
+      "grad_norm": 1.0733542442321777,
+      "learning_rate": 4.901843884035953e-06,
+      "loss": 1.8397,
+      "step": 287400
+    },
+    {
+      "epoch": 0.048839299170010994,
+      "grad_norm": 1.0520180463790894,
+      "learning_rate": 4.888684180744635e-06,
+      "loss": 1.841,
+      "step": 287500
+    },
+    {
+      "epoch": 0.04911838087955392,
+      "grad_norm": 1.047424077987671,
+      "learning_rate": 4.8755402511894175e-06,
+      "loss": 1.8359,
+      "step": 287600
+    },
+    {
+      "epoch": 0.04939746258909684,
+      "grad_norm": 1.0499184131622314,
+      "learning_rate": 4.862412105679384e-06,
+      "loss": 1.8258,
+      "step": 287700
+    },
+    {
+      "epoch": 0.049676544298639755,
+      "grad_norm": 1.0351871252059937,
+      "learning_rate": 4.849299754511205e-06,
+      "loss": 1.8348,
+      "step": 287800
+    },
+    {
+      "epoch": 0.04995562600818267,
+      "grad_norm": 1.0380860567092896,
+      "learning_rate": 4.836203207969183e-06,
+      "loss": 1.8306,
+      "step": 287900
+    },
+    {
+      "epoch": 0.0502347077177256,
+      "grad_norm": 1.0105301141738892,
+      "learning_rate": 4.823122476325231e-06,
+      "loss": 1.8432,
+      "step": 288000
+    },
+    {
+      "epoch": 0.0502347077177256,
+      "eval_loss": 2.199079751968384,
+      "eval_runtime": 52.0806,
+      "eval_samples_per_second": 195.735,
+      "eval_steps_per_second": 1.536,
+      "step": 288000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 1.004279375076294,
+      "learning_rate": 4.8100575698388324e-06,
+      "loss": 1.8457,
+      "step": 288100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 1.001739740371704,
+      "learning_rate": 4.79700849875708e-06,
+      "loss": 1.8321,
+      "step": 288200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 1.0556291341781616,
+      "learning_rate": 4.7839752733146395e-06,
+      "loss": 1.8446,
+      "step": 288300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 1.014046311378479,
+      "learning_rate": 4.7709579037337525e-06,
+      "loss": 1.8194,
+      "step": 288400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 1.0474220514297485,
+      "learning_rate": 4.757956400224214e-06,
+      "loss": 1.8424,
+      "step": 288500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 1.0006691217422485,
+      "learning_rate": 4.744970772983387e-06,
+      "loss": 1.83,
+      "step": 288600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 1.0182647705078125,
+      "learning_rate": 4.732001032196173e-06,
+      "loss": 1.8357,
+      "step": 288700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 1.0402028560638428,
+      "learning_rate": 4.719047188035028e-06,
+      "loss": 1.8395,
+      "step": 288800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 0.9996068477630615,
+      "learning_rate": 4.706109250659915e-06,
+      "loss": 1.8601,
+      "step": 288900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 1.0102015733718872,
+      "learning_rate": 4.693187230218351e-06,
+      "loss": 1.8282,
+      "step": 289000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.2036867141723633,
+      "eval_runtime": 51.9029,
+      "eval_samples_per_second": 196.405,
+      "eval_steps_per_second": 1.541,
+      "step": 289000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 1.0433801412582397,
+      "learning_rate": 4.680281136845338e-06,
+      "loss": 1.843,
+      "step": 289100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 1.04608952999115,
+      "learning_rate": 4.667390980663416e-06,
+      "loss": 1.8499,
+      "step": 289200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 1.0211896896362305,
+      "learning_rate": 4.654516771782597e-06,
+      "loss": 1.8431,
+      "step": 289300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 1.0457395315170288,
+      "learning_rate": 4.641658520300407e-06,
+      "loss": 1.8281,
+      "step": 289400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 1.0063202381134033,
+      "learning_rate": 4.6288162363018475e-06,
+      "loss": 1.8336,
+      "step": 289500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 1.003279209136963,
+      "learning_rate": 4.615989929859402e-06,
+      "loss": 1.8408,
+      "step": 289600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 1.0246212482452393,
+      "learning_rate": 4.603179611033006e-06,
+      "loss": 1.8343,
+      "step": 289700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 1.0443739891052246,
+      "learning_rate": 4.590385289870075e-06,
+      "loss": 1.8401,
+      "step": 289800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 1.0705519914627075,
+      "learning_rate": 4.577606976405466e-06,
+      "loss": 1.8389,
+      "step": 289900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 1.0605077743530273,
+      "learning_rate": 4.564844680661487e-06,
+      "loss": 1.8219,
+      "step": 290000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.2071588039398193,
+      "eval_runtime": 51.4776,
+      "eval_samples_per_second": 198.028,
+      "eval_steps_per_second": 1.554,
+      "step": 290000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 1.0212249755859375,
+      "learning_rate": 4.552098412647887e-06,
+      "loss": 1.8286,
+      "step": 290100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 1.0316197872161865,
+      "learning_rate": 4.539368182361822e-06,
+      "loss": 1.831,
+      "step": 290200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 1.059012532234192,
+      "learning_rate": 4.526653999787897e-06,
+      "loss": 1.8454,
+      "step": 290300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 1.0601192712783813,
+      "learning_rate": 4.51395587489811e-06,
+      "loss": 1.8344,
+      "step": 290400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 1.028264045715332,
+      "learning_rate": 4.50127381765188e-06,
+      "loss": 1.8418,
+      "step": 290500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 1.0473825931549072,
+      "learning_rate": 4.488607837996006e-06,
+      "loss": 1.8273,
+      "step": 290600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 1.0223557949066162,
+      "learning_rate": 4.475957945864692e-06,
+      "loss": 1.8318,
+      "step": 290700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 1.026455283164978,
+      "learning_rate": 4.463324151179521e-06,
+      "loss": 1.8252,
+      "step": 290800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 1.0249643325805664,
+      "learning_rate": 4.450706463849458e-06,
+      "loss": 1.8384,
+      "step": 290900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 1.0897847414016724,
+      "learning_rate": 4.438104893770806e-06,
+      "loss": 1.8316,
+      "step": 291000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.2129878997802734,
+      "eval_runtime": 51.4364,
+      "eval_samples_per_second": 198.186,
+      "eval_steps_per_second": 1.555,
+      "step": 291000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 1.0365105867385864,
+      "learning_rate": 4.425519450827259e-06,
+      "loss": 1.8085,
+      "step": 291100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 1.0275731086730957,
+      "learning_rate": 4.412950144889849e-06,
+      "loss": 1.8278,
+      "step": 291200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 1.022194504737854,
+      "learning_rate": 4.400396985816957e-06,
+      "loss": 1.8147,
+      "step": 291300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 1.0360770225524902,
+      "learning_rate": 4.387859983454279e-06,
+      "loss": 1.835,
+      "step": 291400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 1.0490261316299438,
+      "learning_rate": 4.375339147634866e-06,
+      "loss": 1.8309,
+      "step": 291500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 1.0081874132156372,
+      "learning_rate": 4.362834488179085e-06,
+      "loss": 1.8247,
+      "step": 291600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 1.0340025424957275,
+      "learning_rate": 4.350346014894596e-06,
+      "loss": 1.8288,
+      "step": 291700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 1.080769419670105,
+      "learning_rate": 4.337873737576376e-06,
+      "loss": 1.8186,
+      "step": 291800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 1.027443528175354,
+      "learning_rate": 4.3254176660067005e-06,
+      "loss": 1.8374,
+      "step": 291900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 0.9847263097763062,
+      "learning_rate": 4.3129778099551376e-06,
+      "loss": 1.8312,
+      "step": 292000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.2189362049102783,
+      "eval_runtime": 51.4867,
+      "eval_samples_per_second": 197.993,
+      "eval_steps_per_second": 1.554,
+      "step": 292000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 1.0336400270462036,
+      "learning_rate": 4.30055417917854e-06,
+      "loss": 1.8155,
+      "step": 292100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 1.011435627937317,
+      "learning_rate": 4.288146783421012e-06,
+      "loss": 1.8494,
+      "step": 292200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 1.0581125020980835,
+      "learning_rate": 4.275755632413947e-06,
+      "loss": 1.8194,
+      "step": 292300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 1.0441781282424927,
+      "learning_rate": 4.263380735875991e-06,
+      "loss": 1.8043,
+      "step": 292400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 0.9977090358734131,
+      "learning_rate": 4.251022103513047e-06,
+      "loss": 1.8424,
+      "step": 292500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 1.0513739585876465,
+      "learning_rate": 4.238679745018243e-06,
+      "loss": 1.8396,
+      "step": 292600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 1.013461947441101,
+      "learning_rate": 4.226353670071961e-06,
+      "loss": 1.8254,
+      "step": 292700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 1.0267400741577148,
+      "learning_rate": 4.214043888341812e-06,
+      "loss": 1.8194,
+      "step": 292800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 1.0605510473251343,
+      "learning_rate": 4.201750409482607e-06,
+      "loss": 1.83,
+      "step": 292900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 1.0429390668869019,
+      "learning_rate": 4.189473243136402e-06,
+      "loss": 1.8305,
+      "step": 293000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.212700843811035,
+      "eval_runtime": 51.5608,
+      "eval_samples_per_second": 197.708,
+      "eval_steps_per_second": 1.552,
+      "step": 293000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 0.9996118545532227,
+      "learning_rate": 4.177212398932428e-06,
+      "loss": 1.8341,
+      "step": 293100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 1.0341185331344604,
+      "learning_rate": 4.164967886487131e-06,
+      "loss": 1.8232,
+      "step": 293200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 1.0189030170440674,
+      "learning_rate": 4.15273971540415e-06,
+      "loss": 1.8226,
+      "step": 293300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 1.0681477785110474,
+      "learning_rate": 4.140527895274301e-06,
+      "loss": 1.8146,
+      "step": 293400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 1.066925048828125,
+      "learning_rate": 4.128332435675569e-06,
+      "loss": 1.8229,
+      "step": 293500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 1.0204412937164307,
+      "learning_rate": 4.116153346173121e-06,
+      "loss": 1.8244,
+      "step": 293600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 1.0246905088424683,
+      "learning_rate": 4.103990636319274e-06,
+      "loss": 1.8073,
+      "step": 293700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 1.0570878982543945,
+      "learning_rate": 4.091844315653512e-06,
+      "loss": 1.8125,
+      "step": 293800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 1.0208039283752441,
+      "learning_rate": 4.079714393702441e-06,
+      "loss": 1.8197,
+      "step": 293900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 1.0461581945419312,
+      "learning_rate": 4.067600879979824e-06,
+      "loss": 1.8177,
+      "step": 294000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.225311756134033,
+      "eval_runtime": 51.5825,
+      "eval_samples_per_second": 197.625,
+      "eval_steps_per_second": 1.551,
+      "step": 294000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 1.0400618314743042,
+      "learning_rate": 4.055503783986556e-06,
+      "loss": 1.8126,
+      "step": 294100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 1.0598971843719482,
+      "learning_rate": 4.043423115210637e-06,
+      "loss": 1.826,
+      "step": 294200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 0.9947335124015808,
+      "learning_rate": 4.031358883127207e-06,
+      "loss": 1.8312,
+      "step": 294300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 1.0881414413452148,
+      "learning_rate": 4.019311097198489e-06,
+      "loss": 1.8321,
+      "step": 294400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 1.0416432619094849,
+      "learning_rate": 4.007279766873828e-06,
+      "loss": 1.8171,
+      "step": 294500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 1.0456783771514893,
+      "learning_rate": 3.9952649015896545e-06,
+      "loss": 1.8077,
+      "step": 294600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 1.0717263221740723,
+      "learning_rate": 3.983266510769479e-06,
+      "loss": 1.8269,
+      "step": 294700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 1.0348212718963623,
+      "learning_rate": 3.971284603823899e-06,
+      "loss": 1.839,
+      "step": 294800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 1.017639398574829,
+      "learning_rate": 3.9593191901505846e-06,
+      "loss": 1.8076,
+      "step": 294900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 1.0568976402282715,
+      "learning_rate": 3.947370279134269e-06,
+      "loss": 1.8317,
+      "step": 295000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.2106025218963623,
+      "eval_runtime": 51.5707,
+      "eval_samples_per_second": 197.67,
+      "eval_steps_per_second": 1.551,
+      "step": 295000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 0.9975104928016663,
+      "learning_rate": 3.935437880146728e-06,
+      "loss": 1.8075,
+      "step": 295100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 1.0724748373031616,
+      "learning_rate": 3.923522002546804e-06,
+      "loss": 1.8101,
+      "step": 295200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 1.0251374244689941,
+      "learning_rate": 3.911622655680375e-06,
+      "loss": 1.8165,
+      "step": 295300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 0.9875963926315308,
+      "learning_rate": 3.89973984888036e-06,
+      "loss": 1.832,
+      "step": 295400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 1.022261619567871,
+      "learning_rate": 3.887873591466687e-06,
+      "loss": 1.822,
+      "step": 295500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 1.035934329032898,
+      "learning_rate": 3.8760238927463306e-06,
+      "loss": 1.8143,
+      "step": 295600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 1.0614137649536133,
+      "learning_rate": 3.864190762013248e-06,
+      "loss": 1.8123,
+      "step": 295700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 1.0247828960418701,
+      "learning_rate": 3.8523742085484235e-06,
+      "loss": 1.8284,
+      "step": 295800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 1.0341575145721436,
+      "learning_rate": 3.84057424161984e-06,
+      "loss": 1.8288,
+      "step": 295900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 1.0558165311813354,
+      "learning_rate": 3.8287908704824545e-06,
+      "loss": 1.8145,
+      "step": 296000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.2238752841949463,
+      "eval_runtime": 52.008,
+      "eval_samples_per_second": 196.008,
+      "eval_steps_per_second": 1.538,
+      "step": 296000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 1.0519689321517944,
+      "learning_rate": 3.8170241043782225e-06,
+      "loss": 1.8309,
+      "step": 296100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 1.0083707571029663,
+      "learning_rate": 3.8052739525360674e-06,
+      "loss": 1.8125,
+      "step": 296200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 1.028019905090332,
+      "learning_rate": 3.793540424171896e-06,
+      "loss": 1.819,
+      "step": 296300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 1.0253424644470215,
+      "learning_rate": 3.781823528488554e-06,
+      "loss": 1.7998,
+      "step": 296400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 1.0056533813476562,
+      "learning_rate": 3.770123274675855e-06,
+      "loss": 1.8169,
+      "step": 296500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 0.9970018863677979,
+      "learning_rate": 3.758439671910563e-06,
+      "loss": 1.8182,
+      "step": 296600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 1.1070934534072876,
+      "learning_rate": 3.746772729356382e-06,
+      "loss": 1.8255,
+      "step": 296700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 1.0187138319015503,
+      "learning_rate": 3.735122456163936e-06,
+      "loss": 1.8185,
+      "step": 296800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 1.029852271080017,
+      "learning_rate": 3.723488861470792e-06,
+      "loss": 1.8215,
+      "step": 296900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 1.0434601306915283,
+      "learning_rate": 3.711871954401419e-06,
+      "loss": 1.8068,
+      "step": 297000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.2075464725494385,
+      "eval_runtime": 51.6992,
+      "eval_samples_per_second": 197.179,
+      "eval_steps_per_second": 1.547,
+      "step": 297000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 1.087276816368103,
+      "learning_rate": 3.7002717440672184e-06,
+      "loss": 1.8137,
+      "step": 297100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 1.039167046546936,
+      "learning_rate": 3.688688239566471e-06,
+      "loss": 1.802,
+      "step": 297200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 1.0578351020812988,
+      "learning_rate": 3.6771214499843693e-06,
+      "loss": 1.8276,
+      "step": 297300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 1.0496500730514526,
+      "learning_rate": 3.6655713843930018e-06,
+      "loss": 1.806,
+      "step": 297400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 1.0503089427947998,
+      "learning_rate": 3.654038051851333e-06,
+      "loss": 1.8041,
+      "step": 297500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 1.0246284008026123,
+      "learning_rate": 3.6425214614051936e-06,
+      "loss": 1.7952,
+      "step": 297600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 1.0332282781600952,
+      "learning_rate": 3.631021622087297e-06,
+      "loss": 1.8265,
+      "step": 297700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 1.0222516059875488,
+      "learning_rate": 3.619538542917217e-06,
+      "loss": 1.8215,
+      "step": 297800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 1.0545893907546997,
+      "learning_rate": 3.608072232901377e-06,
+      "loss": 1.8263,
+      "step": 297900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 1.091201901435852,
+      "learning_rate": 3.596622701033048e-06,
+      "loss": 1.8228,
+      "step": 298000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.2129366397857666,
+      "eval_runtime": 51.7579,
+      "eval_samples_per_second": 196.955,
+      "eval_steps_per_second": 1.546,
+      "step": 298000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 1.0261002779006958,
+      "learning_rate": 3.58518995629234e-06,
+      "loss": 1.8203,
+      "step": 298100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 1.0479872226715088,
+      "learning_rate": 3.5737740076462106e-06,
+      "loss": 1.7966,
+      "step": 298200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 1.036954641342163,
+      "learning_rate": 3.562374864048429e-06,
+      "loss": 1.8111,
+      "step": 298300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 1.0831959247589111,
+      "learning_rate": 3.550992534439576e-06,
+      "loss": 1.7991,
+      "step": 298400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 1.0515846014022827,
+      "learning_rate": 3.539627027747067e-06,
+      "loss": 1.815,
+      "step": 298500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 1.0659505128860474,
+      "learning_rate": 3.5282783528851117e-06,
+      "loss": 1.8105,
+      "step": 298600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 1.0669214725494385,
+      "learning_rate": 3.516946518754724e-06,
+      "loss": 1.7961,
+      "step": 298700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 1.0422730445861816,
+      "learning_rate": 3.5056315342436945e-06,
+      "loss": 1.8219,
+      "step": 298800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 1.0351274013519287,
+      "learning_rate": 3.4943334082266103e-06,
+      "loss": 1.8183,
+      "step": 298900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 1.0675437450408936,
+      "learning_rate": 3.483052149564839e-06,
+      "loss": 1.8024,
+      "step": 299000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.2119719982147217,
+      "eval_runtime": 51.8915,
+      "eval_samples_per_second": 196.448,
+      "eval_steps_per_second": 1.542,
+      "step": 299000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 1.033592939376831,
+      "learning_rate": 3.4717877671065103e-06,
+      "loss": 1.8264,
+      "step": 299100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 1.0776604413986206,
+      "learning_rate": 3.460540269686524e-06,
+      "loss": 1.7936,
+      "step": 299200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 1.0117859840393066,
+      "learning_rate": 3.4493096661265267e-06,
+      "loss": 1.7972,
+      "step": 299300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 1.0392084121704102,
+      "learning_rate": 3.438095965234928e-06,
+      "loss": 1.8174,
+      "step": 299400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 1.015053153038025,
+      "learning_rate": 3.4268991758068745e-06,
+      "loss": 1.8134,
+      "step": 299500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 1.012290596961975,
+      "learning_rate": 3.415719306624246e-06,
+      "loss": 1.8254,
+      "step": 299600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 1.0493707656860352,
+      "learning_rate": 3.404556366455647e-06,
+      "loss": 1.8037,
+      "step": 299700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 1.0958573818206787,
+      "learning_rate": 3.3934103640564152e-06,
+      "loss": 1.8072,
+      "step": 299800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 1.0864017009735107,
+      "learning_rate": 3.382281308168603e-06,
+      "loss": 1.8337,
+      "step": 299900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 1.0446292161941528,
+      "learning_rate": 3.3711692075209687e-06,
+      "loss": 1.8123,
+      "step": 300000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.2072536945343018,
+      "eval_runtime": 51.8799,
+      "eval_samples_per_second": 196.492,
+      "eval_steps_per_second": 1.542,
+      "step": 300000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 1.0435408353805542,
+      "learning_rate": 3.3600740708289615e-06,
+      "loss": 1.7973,
+      "step": 300100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 1.0345299243927002,
+      "learning_rate": 3.348995906794741e-06,
+      "loss": 1.8213,
+      "step": 300200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 1.037927269935608,
+      "learning_rate": 3.33793472410715e-06,
+      "loss": 1.8048,
+      "step": 300300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 1.0403209924697876,
+      "learning_rate": 3.326890531441712e-06,
+      "loss": 1.8136,
+      "step": 300400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 1.0413801670074463,
+      "learning_rate": 3.31586333746062e-06,
+      "loss": 1.7982,
+      "step": 300500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 1.0370949506759644,
+      "learning_rate": 3.3048531508127366e-06,
+      "loss": 1.7944,
+      "step": 300600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 1.0159741640090942,
+      "learning_rate": 3.2938599801335928e-06,
+      "loss": 1.8001,
+      "step": 300700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 1.0671415328979492,
+      "learning_rate": 3.282883834045372e-06,
+      "loss": 1.7925,
+      "step": 300800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 1.0309702157974243,
+      "learning_rate": 3.2719247211568965e-06,
+      "loss": 1.8119,
+      "step": 300900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 1.02182137966156,
+      "learning_rate": 3.2609826500636238e-06,
+      "loss": 1.8186,
+      "step": 301000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.212273597717285,
+      "eval_runtime": 51.7058,
+      "eval_samples_per_second": 197.154,
+      "eval_steps_per_second": 1.547,
+      "step": 301000
+    },
+    {
+      "epoch": 0.036559703950122514,
+      "grad_norm": 1.0467265844345093,
+      "learning_rate": 3.2500576293476638e-06,
+      "loss": 1.8002,
+      "step": 301100
+    },
+    {
+      "epoch": 0.03683878565966544,
+      "grad_norm": 1.0487096309661865,
+      "learning_rate": 3.2391496675777484e-06,
+      "loss": 1.7995,
+      "step": 301200
+    },
+    {
+      "epoch": 0.03711786736920836,
+      "grad_norm": 1.0497572422027588,
+      "learning_rate": 3.2282587733092173e-06,
+      "loss": 1.8021,
+      "step": 301300
+    },
+    {
+      "epoch": 0.037396949078751275,
+      "grad_norm": 1.0201036930084229,
+      "learning_rate": 3.217384955084035e-06,
+      "loss": 1.809,
+      "step": 301400
+    },
+    {
+      "epoch": 0.0376760307882942,
+      "grad_norm": 1.0497982501983643,
+      "learning_rate": 3.2065282214307712e-06,
+      "loss": 1.8115,
+      "step": 301500
+    },
+    {
+      "epoch": 0.03795511249783712,
+      "grad_norm": 1.0479981899261475,
+      "learning_rate": 3.1956885808646002e-06,
+      "loss": 1.805,
+      "step": 301600
+    },
+    {
+      "epoch": 0.038234194207380036,
+      "grad_norm": 1.0764997005462646,
+      "learning_rate": 3.1848660418872744e-06,
+      "loss": 1.8092,
+      "step": 301700
+    },
+    {
+      "epoch": 0.038513275916922954,
+      "grad_norm": 1.046151876449585,
+      "learning_rate": 3.174060612987148e-06,
+      "loss": 1.8185,
+      "step": 301800
+    },
+    {
+      "epoch": 0.03879235762646588,
+      "grad_norm": 1.0357836484909058,
+      "learning_rate": 3.1632723026391503e-06,
+      "loss": 1.8195,
+      "step": 301900
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "grad_norm": 1.089996099472046,
+      "learning_rate": 3.1525011193047847e-06,
+      "loss": 1.7961,
+      "step": 302000
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "eval_loss": 2.2081243991851807,
+      "eval_runtime": 51.8009,
+      "eval_samples_per_second": 196.792,
+      "eval_steps_per_second": 1.544,
+      "step": 302000
+    },
+    {
+      "epoch": 0.039350521045551715,
+      "grad_norm": 1.045300006866455,
+      "learning_rate": 3.1417470714321275e-06,
+      "loss": 1.8065,
+      "step": 302100
+    },
+    {
+      "epoch": 0.03962960275509464,
+      "grad_norm": 1.0354883670806885,
+      "learning_rate": 3.1310101674558e-06,
+      "loss": 1.795,
+      "step": 302200
+    },
+    {
+      "epoch": 0.03990868446463756,
+      "grad_norm": 1.07806396484375,
+      "learning_rate": 3.1202904157969865e-06,
+      "loss": 1.7949,
+      "step": 302300
+    },
+    {
+      "epoch": 0.040187766174180475,
+      "grad_norm": 1.0537368059158325,
+      "learning_rate": 3.1095878248634164e-06,
+      "loss": 1.8252,
+      "step": 302400
+    },
+    {
+      "epoch": 0.04046684788372339,
+      "grad_norm": 1.066607117652893,
+      "learning_rate": 3.0989024030493723e-06,
+      "loss": 1.7998,
+      "step": 302500
+    },
+    {
+      "epoch": 0.04074592959326632,
+      "grad_norm": 1.0885719060897827,
+      "learning_rate": 3.0882341587356476e-06,
+      "loss": 1.8006,
+      "step": 302600
+    },
+    {
+      "epoch": 0.041025011302809236,
+      "grad_norm": 1.0701121091842651,
+      "learning_rate": 3.0775831002895774e-06,
+      "loss": 1.8307,
+      "step": 302700
+    },
+    {
+      "epoch": 0.041304093012352154,
+      "grad_norm": 1.045860767364502,
+      "learning_rate": 3.0669492360650196e-06,
+      "loss": 1.8094,
+      "step": 302800
+    },
+    {
+      "epoch": 0.04158317472189508,
+      "grad_norm": 1.0620360374450684,
+      "learning_rate": 3.056332574402346e-06,
+      "loss": 1.8162,
+      "step": 302900
+    },
+    {
+      "epoch": 0.041862256431438,
+      "grad_norm": 1.0758084058761597,
+      "learning_rate": 3.0457331236284166e-06,
+      "loss": 1.7981,
+      "step": 303000
+    },
+    {
+      "epoch": 0.041862256431438,
+      "eval_loss": 2.2137293815612793,
+      "eval_runtime": 51.9224,
+      "eval_samples_per_second": 196.331,
+      "eval_steps_per_second": 1.541,
+      "step": 303000
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 1.0506842136383057,
+      "learning_rate": 1.7259637505723265e-05,
+      "loss": 1.7958,
+      "step": 303100
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 1.034621000289917,
+      "learning_rate": 1.7244552087867325e-05,
+      "loss": 1.8261,
+      "step": 303200
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 1.125115156173706,
+      "learning_rate": 1.7229469793904873e-05,
+      "loss": 1.8172,
+      "step": 303300
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.0532312393188477,
+      "learning_rate": 1.7214390629911066e-05,
+      "loss": 1.8165,
+      "step": 303400
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 1.0483386516571045,
+      "learning_rate": 1.7199314601959778e-05,
+      "loss": 1.8275,
+      "step": 303500
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 1.0204639434814453,
+      "learning_rate": 1.7184241716123635e-05,
+      "loss": 1.816,
+      "step": 303600
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 1.069264531135559,
+      "learning_rate": 1.7169171978473994e-05,
+      "loss": 1.8174,
+      "step": 303700
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0591576099395752,
+      "learning_rate": 1.715410539508095e-05,
+      "loss": 1.8284,
+      "step": 303800
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 1.086665391921997,
+      "learning_rate": 1.7139041972013304e-05,
+      "loss": 1.8279,
+      "step": 303900
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 1.094480037689209,
+      "learning_rate": 1.712398171533862e-05,
+      "loss": 1.8144,
+      "step": 304000
+    },
+    {
+      "epoch": 0.002,
+      "eval_loss": 2.229548931121826,
+      "eval_runtime": 52.0986,
+      "eval_samples_per_second": 195.667,
+      "eval_steps_per_second": 1.536,
+      "step": 304000
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 1.0918675661087036,
+      "learning_rate": 1.710892463112316e-05,
+      "loss": 1.8137,
+      "step": 304100
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.0117205381393433,
+      "learning_rate": 1.709387072543191e-05,
+      "loss": 1.8065,
+      "step": 304200
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 1.1326615810394287,
+      "learning_rate": 1.7078820004328587e-05,
+      "loss": 1.814,
+      "step": 304300
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 1.048861026763916,
+      "learning_rate": 1.7063772473875616e-05,
+      "loss": 1.8061,
+      "step": 304400
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 1.0585246086120605,
+      "learning_rate": 1.7048728140134152e-05,
+      "loss": 1.8293,
+      "step": 304500
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0181670188903809,
+      "learning_rate": 1.7033687009164033e-05,
+      "loss": 1.8163,
+      "step": 304600
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 1.0025696754455566,
+      "learning_rate": 1.701864908702384e-05,
+      "loss": 1.8058,
+      "step": 304700
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 1.0825532674789429,
+      "learning_rate": 1.700361437977084e-05,
+      "loss": 1.818,
+      "step": 304800
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 1.0389013290405273,
+      "learning_rate": 1.6988582893461008e-05,
+      "loss": 1.8142,
+      "step": 304900
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 1.0209424495697021,
+      "learning_rate": 1.697355463414903e-05,
+      "loss": 1.8103,
+      "step": 305000
+    },
+    {
+      "epoch": 0.004,
+      "eval_loss": 2.230199098587036,
+      "eval_runtime": 51.6157,
+      "eval_samples_per_second": 197.498,
+      "eval_steps_per_second": 1.55,
+      "step": 305000
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 1.085379958152771,
+      "learning_rate": 1.695852960788829e-05,
+      "loss": 1.8192,
+      "step": 305100
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 1.0386351346969604,
+      "learning_rate": 1.6943507820730854e-05,
+      "loss": 1.8061,
+      "step": 305200
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 1.0565484762191772,
+      "learning_rate": 1.692848927872751e-05,
+      "loss": 1.8081,
+      "step": 305300
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.0819813013076782,
+      "learning_rate": 1.6913473987927713e-05,
+      "loss": 1.8158,
+      "step": 305400
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 1.0319418907165527,
+      "learning_rate": 1.6898461954379636e-05,
+      "loss": 1.7954,
+      "step": 305500
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 1.0530176162719727,
+      "learning_rate": 1.6883453184130116e-05,
+      "loss": 1.8046,
+      "step": 305600
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 1.0865267515182495,
+      "learning_rate": 1.686844768322467e-05,
+      "loss": 1.7917,
+      "step": 305700
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 1.027178406715393,
+      "learning_rate": 1.6853445457707538e-05,
+      "loss": 1.7988,
+      "step": 305800
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 1.0627230405807495,
+      "learning_rate": 1.6838446513621593e-05,
+      "loss": 1.7954,
+      "step": 305900
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 1.059670329093933,
+      "learning_rate": 1.6823450857008423e-05,
+      "loss": 1.7974,
+      "step": 306000
+    },
+    {
+      "epoch": 0.006,
+      "eval_loss": 2.230013608932495,
+      "eval_runtime": 51.6205,
+      "eval_samples_per_second": 197.48,
+      "eval_steps_per_second": 1.55,
+      "step": 306000
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 1.0403988361358643,
+      "learning_rate": 1.6808458493908258e-05,
+      "loss": 1.7976,
+      "step": 306100
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.0143063068389893,
+      "learning_rate": 1.6793469430360042e-05,
+      "loss": 1.7949,
+      "step": 306200
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 1.1919389963150024,
+      "learning_rate": 1.6778483672401356e-05,
+      "loss": 1.8018,
+      "step": 306300
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 1.06490957736969,
+      "learning_rate": 1.6763501226068465e-05,
+      "loss": 1.8087,
+      "step": 306400
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 1.0884573459625244,
+      "learning_rate": 1.674852209739629e-05,
+      "loss": 1.8177,
+      "step": 306500
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 1.0523546934127808,
+      "learning_rate": 1.6733546292418434e-05,
+      "loss": 1.7789,
+      "step": 306600
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 1.0929498672485352,
+      "learning_rate": 1.6718573817167137e-05,
+      "loss": 1.8022,
+      "step": 306700
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 1.0335514545440674,
+      "learning_rate": 1.6703604677673322e-05,
+      "loss": 1.7912,
+      "step": 306800
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 1.0258134603500366,
+      "learning_rate": 1.6688638879966546e-05,
+      "loss": 1.7952,
+      "step": 306900
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.0420570373535156,
+      "learning_rate": 1.6673676430075036e-05,
+      "loss": 1.7981,
+      "step": 307000
+    },
+    {
+      "epoch": 0.008,
+      "eval_loss": 2.228384256362915,
+      "eval_runtime": 51.6428,
+      "eval_samples_per_second": 197.395,
+      "eval_steps_per_second": 1.549,
+      "step": 307000
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 1.065299391746521,
+      "learning_rate": 1.6658717334025664e-05,
+      "loss": 1.8051,
+      "step": 307100
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 1.015187382698059,
+      "learning_rate": 1.6643761597843953e-05,
+      "loss": 1.8016,
+      "step": 307200
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 1.047338843345642,
+      "learning_rate": 1.6628809227554077e-05,
+      "loss": 1.7974,
+      "step": 307300
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 1.0116043090820312,
+      "learning_rate": 1.6613860229178836e-05,
+      "loss": 1.793,
+      "step": 307400
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 1.0261743068695068,
+      "learning_rate": 1.6598914608739695e-05,
+      "loss": 1.789,
+      "step": 307500
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 1.0221142768859863,
+      "learning_rate": 1.658397237225674e-05,
+      "loss": 1.7865,
+      "step": 307600
+    },
+    {
+      "epoch": 0.0094,
+      "grad_norm": 1.050794005393982,
+      "learning_rate": 1.6569033525748712e-05,
+      "loss": 1.7725,
+      "step": 307700
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.1043586730957031,
+      "learning_rate": 1.6554098075232967e-05,
+      "loss": 1.7772,
+      "step": 307800
+    },
+    {
+      "epoch": 0.0098,
+      "grad_norm": 1.0293883085250854,
+      "learning_rate": 1.6539166026725515e-05,
+      "loss": 1.8076,
+      "step": 307900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.0498391389846802,
+      "learning_rate": 1.6524237386240964e-05,
+      "loss": 1.7978,
+      "step": 308000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.2343056201934814,
+      "eval_runtime": 51.6864,
+      "eval_samples_per_second": 197.228,
+      "eval_steps_per_second": 1.548,
+      "step": 308000
+    },
+    {
+      "epoch": 0.0102,
+      "grad_norm": 1.0795516967773438,
+      "learning_rate": 1.6509312159792594e-05,
+      "loss": 1.8164,
+      "step": 308100
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 1.0950493812561035,
+      "learning_rate": 1.6494390353392258e-05,
+      "loss": 1.7901,
+      "step": 308200
+    },
+    {
+      "epoch": 0.0106,
+      "grad_norm": 1.0679363012313843,
+      "learning_rate": 1.6479471973050482e-05,
+      "loss": 1.8094,
+      "step": 308300
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 1.0638396739959717,
+      "learning_rate": 1.6464557024776365e-05,
+      "loss": 1.7981,
+      "step": 308400
+    },
+    {
+      "epoch": 0.011,
+      "grad_norm": 1.0541220903396606,
+      "learning_rate": 1.6449645514577668e-05,
+      "loss": 1.7955,
+      "step": 308500
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.0506057739257812,
+      "learning_rate": 1.6434737448460725e-05,
+      "loss": 1.7793,
+      "step": 308600
+    },
+    {
+      "epoch": 0.0114,
+      "grad_norm": 1.0873394012451172,
+      "learning_rate": 1.6419832832430522e-05,
+      "loss": 1.7941,
+      "step": 308700
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 1.0632107257843018,
+      "learning_rate": 1.6404931672490625e-05,
+      "loss": 1.7861,
+      "step": 308800
+    },
+    {
+      "epoch": 0.0118,
+      "grad_norm": 1.1098015308380127,
+      "learning_rate": 1.6390033974643222e-05,
+      "loss": 1.7709,
+      "step": 308900
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 1.046675682067871,
+      "learning_rate": 1.6375139744889107e-05,
+      "loss": 1.7811,
+      "step": 309000
+    },
+    {
+      "epoch": 0.012,
+      "eval_loss": 2.2420222759246826,
+      "eval_runtime": 51.584,
+      "eval_samples_per_second": 197.619,
+      "eval_steps_per_second": 1.551,
+      "step": 309000
+    },
+    {
+      "epoch": 0.0122,
+      "grad_norm": 1.047875165939331,
+      "learning_rate": 1.6360248989227666e-05,
+      "loss": 1.7818,
+      "step": 309100
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 1.0503313541412354,
+      "learning_rate": 1.6345361713656904e-05,
+      "loss": 1.7718,
+      "step": 309200
+    },
+    {
+      "epoch": 0.0126,
+      "grad_norm": 1.046026587486267,
+      "learning_rate": 1.6330477924173403e-05,
+      "loss": 1.7518,
+      "step": 309300
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.0461571216583252,
+      "learning_rate": 1.6315597626772365e-05,
+      "loss": 1.7751,
+      "step": 309400
+    },
+    {
+      "epoch": 0.013,
+      "grad_norm": 1.0191349983215332,
+      "learning_rate": 1.6300720827447556e-05,
+      "loss": 1.7724,
+      "step": 309500
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 1.0420078039169312,
+      "learning_rate": 1.6285847532191364e-05,
+      "loss": 1.7394,
+      "step": 309600
+    },
+    {
+      "epoch": 0.0134,
+      "grad_norm": 1.0415441989898682,
+      "learning_rate": 1.627097774699474e-05,
+      "loss": 1.7405,
+      "step": 309700
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 1.0861761569976807,
+      "learning_rate": 1.625611147784724e-05,
+      "loss": 1.7572,
+      "step": 309800
+    },
+    {
+      "epoch": 0.0138,
+      "grad_norm": 1.042179822921753,
+      "learning_rate": 1.6241248730736985e-05,
+      "loss": 1.7634,
+      "step": 309900
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 1.0887514352798462,
+      "learning_rate": 1.6226389511650697e-05,
+      "loss": 1.7487,
+      "step": 310000
+    },
+    {
+      "epoch": 0.014,
+      "eval_loss": 2.244732618331909,
+      "eval_runtime": 51.6991,
+      "eval_samples_per_second": 197.18,
+      "eval_steps_per_second": 1.547,
+      "step": 310000
+    },
+    {
+      "epoch": 0.0142,
+      "grad_norm": 1.0510177612304688,
+      "learning_rate": 1.6211533826573662e-05,
+      "loss": 1.7426,
+      "step": 310100
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9902233481407166,
+      "learning_rate": 1.6196681681489755e-05,
+      "loss": 1.7452,
+      "step": 310200
+    },
+    {
+      "epoch": 0.0146,
+      "grad_norm": 1.0358948707580566,
+      "learning_rate": 1.6181833082381413e-05,
+      "loss": 1.7292,
+      "step": 310300
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 1.0080764293670654,
+      "learning_rate": 1.6166988035229652e-05,
+      "loss": 1.7368,
+      "step": 310400
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 1.0920326709747314,
+      "learning_rate": 1.6152146546014053e-05,
+      "loss": 1.7186,
+      "step": 310500
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 1.0890278816223145,
+      "learning_rate": 1.6137308620712765e-05,
+      "loss": 1.7179,
+      "step": 310600
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 1.0208715200424194,
+      "learning_rate": 1.612247426530251e-05,
+      "loss": 1.744,
+      "step": 310700
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.9500866532325745,
+      "learning_rate": 1.610764348575856e-05,
+      "loss": 1.6606,
+      "step": 310800
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 0.9557023048400879,
+      "learning_rate": 1.6092816288054746e-05,
+      "loss": 1.4109,
+      "step": 310900
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9185681343078613,
+      "learning_rate": 1.6077992678163467e-05,
+      "loss": 1.3687,
+      "step": 311000
+    },
+    {
+      "epoch": 0.016,
+      "eval_loss": 2.290562629699707,
+      "eval_runtime": 51.8115,
+      "eval_samples_per_second": 196.752,
+      "eval_steps_per_second": 1.544,
+      "step": 311000
+    },
+    {
+      "epoch": 0.0162,
+      "grad_norm": 0.9145857691764832,
+      "learning_rate": 1.6063172662055665e-05,
+      "loss": 1.3382,
+      "step": 311100
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 0.9351733922958374,
+      "learning_rate": 1.6048356245700856e-05,
+      "loss": 1.3208,
+      "step": 311200
+    },
+    {
+      "epoch": 0.0166,
+      "grad_norm": 0.9079789519309998,
+      "learning_rate": 1.603354343506707e-05,
+      "loss": 1.2985,
+      "step": 311300
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 1.0671257972717285,
+      "learning_rate": 1.6018734236120926e-05,
+      "loss": 1.3041,
+      "step": 311400
+    },
+    {
+      "epoch": 0.017,
+      "grad_norm": 0.8702555894851685,
+      "learning_rate": 1.600392865482755e-05,
+      "loss": 1.278,
+      "step": 311500
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 0.9119271039962769,
+      "learning_rate": 1.598912669715064e-05,
+      "loss": 1.2662,
+      "step": 311600
+    },
+    {
+      "epoch": 0.0174,
+      "grad_norm": 0.8721778988838196,
+      "learning_rate": 1.5974328369052415e-05,
+      "loss": 1.2713,
+      "step": 311700
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.9360621571540833,
+      "learning_rate": 1.5959533676493647e-05,
+      "loss": 1.2523,
+      "step": 311800
+    },
+    {
+      "epoch": 0.0178,
+      "grad_norm": 0.9057286381721497,
+      "learning_rate": 1.5944742625433633e-05,
+      "loss": 1.2308,
+      "step": 311900
+    },
+    {
+      "epoch": 0.018,
+      "grad_norm": 0.874999463558197,
+      "learning_rate": 1.5929955221830202e-05,
+      "loss": 1.2274,
+      "step": 312000
+    },
+    {
+      "epoch": 0.018,
+      "eval_loss": 2.374765634536743,
+      "eval_runtime": 51.8773,
+      "eval_samples_per_second": 196.502,
+      "eval_steps_per_second": 1.542,
+      "step": 312000
+    },
+    {
+      "epoch": 0.0182,
+      "grad_norm": 0.9309009313583374,
+      "learning_rate": 1.591517147163973e-05,
+      "loss": 1.224,
+      "step": 312100
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.8504728674888611,
+      "learning_rate": 1.59003913808171e-05,
+      "loss": 1.2055,
+      "step": 312200
+    },
+    {
+      "epoch": 0.0186,
+      "grad_norm": 0.9231265783309937,
+      "learning_rate": 1.588561495531573e-05,
+      "loss": 1.2074,
+      "step": 312300
+    },
+    {
+      "epoch": 0.0188,
+      "grad_norm": 0.9524025321006775,
+      "learning_rate": 1.587084220108757e-05,
+      "loss": 1.1945,
+      "step": 312400
+    },
+    {
+      "epoch": 0.019,
+      "grad_norm": 0.8538132309913635,
+      "learning_rate": 1.585607312408308e-05,
+      "loss": 1.202,
+      "step": 312500
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.1738858222961426,
+      "learning_rate": 1.5841307730251237e-05,
+      "loss": 1.1787,
+      "step": 312600
+    },
+    {
+      "epoch": 0.0194,
+      "grad_norm": 0.9254825711250305,
+      "learning_rate": 1.5826546025539552e-05,
+      "loss": 1.1737,
+      "step": 312700
+    },
+    {
+      "epoch": 0.0196,
+      "grad_norm": 0.8884557485580444,
+      "learning_rate": 1.5811788015894025e-05,
+      "loss": 1.1715,
+      "step": 312800
+    },
+    {
+      "epoch": 0.0198,
+      "grad_norm": 0.8768421411514282,
+      "learning_rate": 1.579703370725919e-05,
+      "loss": 1.1701,
+      "step": 312900
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.9079160690307617,
+      "learning_rate": 1.5782283105578076e-05,
+      "loss": 1.1533,
+      "step": 313000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 2.40177059173584,
+      "eval_runtime": 52.039,
+      "eval_samples_per_second": 195.892,
+      "eval_steps_per_second": 1.537,
+      "step": 313000
     }
   ],
   "logging_steps": 100,
+  "max_steps": 500000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 9223372036854775807,
   "save_steps": 1000,
       "attributes": {}
     }
   },
+  "total_flos": 2.731626434710733e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4a02f378eb43a11dd8f33d3260f082abc6de1be9c9ee104cd03f04d37fc9b629
 size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:f0dc725694ec7d7e0bc3b408e66c887f704ad47bb8c1c9fcffc5533d57950135
 size 5777