Upload 10 files

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +2343 -3
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20566ce2cded3cc02fb93eee498468296e195bde2f327717d82d2153bf039a5c
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:724be5ec56c8cea0a6bccb0fb0bcec03b849814458eb8b51ff9f3d953d0ed14c
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4bb4ca88b8d2d3ea2dfd7303f13e3a8cd59c49a916572576e2dc64da5c07512
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:aae36e7eb1c7e8d3c5cc3aa77fc98b6aae23dbfbb8ba5dbcfe46c7087de864d3
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50c7b18601d8312eab9dd312837f003a894f9f32c0a047b958e34fe83b5149bb
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:d808ac48aeb2285a7d15fe96957631f4317dc7cd8cbbaa8b381b1638da837ef8
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f734a52ddcea7feef7729d4ad9d1d723abcc8fb15cbcedadde156471860e8d2
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:2ef58d5b955824dfbbc6cf55d8b7019f163372cbafcda9d38b4c7e503714eff0
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.02,
   "eval_steps": 1000,
-  "global_step": 313000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -24429,6 +24429,2346 @@
       "eval_samples_per_second": 195.892,
       "eval_steps_per_second": 1.537,
       "step": 313000
     }
   ],
   "logging_steps": 100,
@@ -24448,7 +26788,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.731626434710733e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.034,
   "eval_steps": 1000,
+  "global_step": 343000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 195.892,
       "eval_steps_per_second": 1.537,
       "step": 313000
+    },
+    {
+      "epoch": 0.0202,
+      "grad_norm": 0.8182185888290405,
+      "learning_rate": 1.5767536216792224e-05,
+      "loss": 1.1693,
+      "step": 313100
+    },
+    {
+      "epoch": 0.0204,
+      "grad_norm": 0.8927570581436157,
+      "learning_rate": 1.575279304684168e-05,
+      "loss": 1.1373,
+      "step": 313200
+    },
+    {
+      "epoch": 0.0206,
+      "grad_norm": 0.881147027015686,
+      "learning_rate": 1.573805360166499e-05,
+      "loss": 1.1504,
+      "step": 313300
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.8833571672439575,
+      "learning_rate": 1.572331788719921e-05,
+      "loss": 1.1405,
+      "step": 313400
+    },
+    {
+      "epoch": 0.021,
+      "grad_norm": 0.8598502278327942,
+      "learning_rate": 1.5708585909379864e-05,
+      "loss": 1.1365,
+      "step": 313500
+    },
+    {
+      "epoch": 0.0212,
+      "grad_norm": 0.8392401337623596,
+      "learning_rate": 1.5693857674141012e-05,
+      "loss": 1.1331,
+      "step": 313600
+    },
+    {
+      "epoch": 0.0214,
+      "grad_norm": 0.8870404958724976,
+      "learning_rate": 1.5679133187415168e-05,
+      "loss": 1.115,
+      "step": 313700
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.9391428232192993,
+      "learning_rate": 1.566441245513337e-05,
+      "loss": 1.1178,
+      "step": 313800
+    },
+    {
+      "epoch": 0.0218,
+      "grad_norm": 0.8332740664482117,
+      "learning_rate": 1.5649695483225107e-05,
+      "loss": 1.1335,
+      "step": 313900
+    },
+    {
+      "epoch": 0.022,
+      "grad_norm": 0.8561016917228699,
+      "learning_rate": 1.5634982277618392e-05,
+      "loss": 1.126,
+      "step": 314000
+    },
+    {
+      "epoch": 0.022,
+      "eval_loss": 2.42391037940979,
+      "eval_runtime": 51.9237,
+      "eval_samples_per_second": 196.327,
+      "eval_steps_per_second": 1.541,
+      "step": 314000
+    },
+    {
+      "epoch": 0.0222,
+      "grad_norm": 0.8221678137779236,
+      "learning_rate": 1.5620272844239697e-05,
+      "loss": 1.1344,
+      "step": 314100
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.865084707736969,
+      "learning_rate": 1.5605567189013977e-05,
+      "loss": 1.1195,
+      "step": 314200
+    },
+    {
+      "epoch": 0.0226,
+      "grad_norm": 0.8354145288467407,
+      "learning_rate": 1.5590865317864666e-05,
+      "loss": 1.1236,
+      "step": 314300
+    },
+    {
+      "epoch": 0.0228,
+      "grad_norm": 0.8688293099403381,
+      "learning_rate": 1.557616723671369e-05,
+      "loss": 1.1169,
+      "step": 314400
+    },
+    {
+      "epoch": 0.023,
+      "grad_norm": 0.8651818037033081,
+      "learning_rate": 1.5561472951481414e-05,
+      "loss": 1.1099,
+      "step": 314500
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.8726403713226318,
+      "learning_rate": 1.5546782468086706e-05,
+      "loss": 1.1284,
+      "step": 314600
+    },
+    {
+      "epoch": 0.0234,
+      "grad_norm": 0.8787026405334473,
+      "learning_rate": 1.5532095792446894e-05,
+      "loss": 1.1046,
+      "step": 314700
+    },
+    {
+      "epoch": 0.0236,
+      "grad_norm": 0.8764083981513977,
+      "learning_rate": 1.5517412930477762e-05,
+      "loss": 1.0929,
+      "step": 314800
+    },
+    {
+      "epoch": 0.0238,
+      "grad_norm": 0.8777551651000977,
+      "learning_rate": 1.5502733888093564e-05,
+      "loss": 1.1143,
+      "step": 314900
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.8219897150993347,
+      "learning_rate": 1.5488058671207027e-05,
+      "loss": 1.0936,
+      "step": 315000
+    },
+    {
+      "epoch": 0.024,
+      "eval_loss": 2.4566423892974854,
+      "eval_runtime": 52.0959,
+      "eval_samples_per_second": 195.678,
+      "eval_steps_per_second": 1.536,
+      "step": 315000
+    },
+    {
+      "epoch": 0.0242,
+      "grad_norm": 0.8803728818893433,
+      "learning_rate": 1.5473387285729317e-05,
+      "loss": 1.1068,
+      "step": 315100
+    },
+    {
+      "epoch": 0.0244,
+      "grad_norm": 0.9315307140350342,
+      "learning_rate": 1.5458719737570067e-05,
+      "loss": 1.0864,
+      "step": 315200
+    },
+    {
+      "epoch": 0.0246,
+      "grad_norm": 0.9067742824554443,
+      "learning_rate": 1.544405603263737e-05,
+      "loss": 1.0905,
+      "step": 315300
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.836283802986145,
+      "learning_rate": 1.5429396176837756e-05,
+      "loss": 1.0925,
+      "step": 315400
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.8385760188102722,
+      "learning_rate": 1.541474017607622e-05,
+      "loss": 1.0998,
+      "step": 315500
+    },
+    {
+      "epoch": 0.0252,
+      "grad_norm": 0.820689857006073,
+      "learning_rate": 1.5400088036256187e-05,
+      "loss": 1.0826,
+      "step": 315600
+    },
+    {
+      "epoch": 0.0254,
+      "grad_norm": 0.8749442100524902,
+      "learning_rate": 1.5385439763279556e-05,
+      "loss": 1.0923,
+      "step": 315700
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.8703187704086304,
+      "learning_rate": 1.537079536304663e-05,
+      "loss": 1.0874,
+      "step": 315800
+    },
+    {
+      "epoch": 0.0258,
+      "grad_norm": 0.8370440006256104,
+      "learning_rate": 1.535615484145619e-05,
+      "loss": 1.0905,
+      "step": 315900
+    },
+    {
+      "epoch": 0.026,
+      "grad_norm": 0.8787978887557983,
+      "learning_rate": 1.5341518204405416e-05,
+      "loss": 1.0855,
+      "step": 316000
+    },
+    {
+      "epoch": 0.026,
+      "eval_loss": 2.462463617324829,
+      "eval_runtime": 51.7904,
+      "eval_samples_per_second": 196.832,
+      "eval_steps_per_second": 1.545,
+      "step": 316000
+    },
+    {
+      "epoch": 0.0262,
+      "grad_norm": 0.8166690468788147,
+      "learning_rate": 1.5326885457789964e-05,
+      "loss": 1.0895,
+      "step": 316100
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.8346712589263916,
+      "learning_rate": 1.5312256607503884e-05,
+      "loss": 1.0795,
+      "step": 316200
+    },
+    {
+      "epoch": 0.0266,
+      "grad_norm": 0.8622503876686096,
+      "learning_rate": 1.529763165943969e-05,
+      "loss": 1.0682,
+      "step": 316300
+    },
+    {
+      "epoch": 0.0268,
+      "grad_norm": 0.8298165798187256,
+      "learning_rate": 1.5283010619488296e-05,
+      "loss": 1.077,
+      "step": 316400
+    },
+    {
+      "epoch": 0.027,
+      "grad_norm": 0.8516880869865417,
+      "learning_rate": 1.5268393493539073e-05,
+      "loss": 1.0686,
+      "step": 316500
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.8550381660461426,
+      "learning_rate": 1.5253780287479785e-05,
+      "loss": 1.0696,
+      "step": 316600
+    },
+    {
+      "epoch": 0.0274,
+      "grad_norm": 0.821546733379364,
+      "learning_rate": 1.5239171007196623e-05,
+      "loss": 1.0689,
+      "step": 316700
+    },
+    {
+      "epoch": 0.0276,
+      "grad_norm": 0.8041675686836243,
+      "learning_rate": 1.522456565857422e-05,
+      "loss": 1.0649,
+      "step": 316800
+    },
+    {
+      "epoch": 0.0278,
+      "grad_norm": 0.9088461995124817,
+      "learning_rate": 1.5209964247495595e-05,
+      "loss": 1.0751,
+      "step": 316900
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.8547507524490356,
+      "learning_rate": 1.5195366779842207e-05,
+      "loss": 1.0798,
+      "step": 317000
+    },
+    {
+      "epoch": 0.028,
+      "eval_loss": 2.4812233448028564,
+      "eval_runtime": 52.0302,
+      "eval_samples_per_second": 195.925,
+      "eval_steps_per_second": 1.538,
+      "step": 317000
+    },
+    {
+      "epoch": 0.0282,
+      "grad_norm": 0.8872113823890686,
+      "learning_rate": 1.5180773261493902e-05,
+      "loss": 1.0652,
+      "step": 317100
+    },
+    {
+      "epoch": 0.0284,
+      "grad_norm": 0.984126091003418,
+      "learning_rate": 1.5166183698328957e-05,
+      "loss": 1.0654,
+      "step": 317200
+    },
+    {
+      "epoch": 0.0286,
+      "grad_norm": 0.8874821066856384,
+      "learning_rate": 1.5151598096224037e-05,
+      "loss": 1.0571,
+      "step": 317300
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.8837223649024963,
+      "learning_rate": 1.5137016461054233e-05,
+      "loss": 1.066,
+      "step": 317400
+    },
+    {
+      "epoch": 0.029,
+      "grad_norm": 0.879486083984375,
+      "learning_rate": 1.512243879869301e-05,
+      "loss": 1.0572,
+      "step": 317500
+    },
+    {
+      "epoch": 0.0292,
+      "grad_norm": 0.8751283884048462,
+      "learning_rate": 1.5107865115012265e-05,
+      "loss": 1.0552,
+      "step": 317600
+    },
+    {
+      "epoch": 0.0294,
+      "grad_norm": 0.8803706765174866,
+      "learning_rate": 1.5093295415882267e-05,
+      "loss": 1.0499,
+      "step": 317700
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.8694496750831604,
+      "learning_rate": 1.507872970717169e-05,
+      "loss": 1.0608,
+      "step": 317800
+    },
+    {
+      "epoch": 0.0298,
+      "grad_norm": 0.8200892806053162,
+      "learning_rate": 1.5064167994747603e-05,
+      "loss": 1.0415,
+      "step": 317900
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.8422415256500244,
+      "learning_rate": 1.5049610284475458e-05,
+      "loss": 1.0487,
+      "step": 318000
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 2.492359161376953,
+      "eval_runtime": 51.9706,
+      "eval_samples_per_second": 196.149,
+      "eval_steps_per_second": 1.539,
+      "step": 318000
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 0.8418950438499451,
+      "learning_rate": 1.5035056582219098e-05,
+      "loss": 1.0456,
+      "step": 318100
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 0.8390074968338013,
+      "learning_rate": 1.5020506893840758e-05,
+      "loss": 1.0318,
+      "step": 318200
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.8178459405899048,
+      "learning_rate": 1.5005961225201048e-05,
+      "loss": 1.0373,
+      "step": 318300
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.8252522349357605,
+      "learning_rate": 1.4991419582158959e-05,
+      "loss": 1.0267,
+      "step": 318400
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 0.8596453070640564,
+      "learning_rate": 1.4976881970571868e-05,
+      "loss": 1.045,
+      "step": 318500
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.9191332459449768,
+      "learning_rate": 1.4962348396295517e-05,
+      "loss": 1.0201,
+      "step": 318600
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 0.8910384774208069,
+      "learning_rate": 1.4947818865184035e-05,
+      "loss": 1.0176,
+      "step": 318700
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8146995902061462,
+      "learning_rate": 1.4933293383089908e-05,
+      "loss": 1.0263,
+      "step": 318800
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.8134050965309143,
+      "learning_rate": 1.4918771955864009e-05,
+      "loss": 1.0085,
+      "step": 318900
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.8413226008415222,
+      "learning_rate": 1.4904254589355555e-05,
+      "loss": 1.0336,
+      "step": 319000
+    },
+    {
+      "epoch": 0.002,
+      "eval_loss": 2.506340742111206,
+      "eval_runtime": 52.1719,
+      "eval_samples_per_second": 195.393,
+      "eval_steps_per_second": 1.533,
+      "step": 319000
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.8138185739517212,
+      "learning_rate": 1.4889741289412145e-05,
+      "loss": 1.023,
+      "step": 319100
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.8572819232940674,
+      "learning_rate": 1.4875232061879735e-05,
+      "loss": 1.0055,
+      "step": 319200
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.8657738566398621,
+      "learning_rate": 1.4860726912602643e-05,
+      "loss": 1.009,
+      "step": 319300
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.8982349634170532,
+      "learning_rate": 1.4846225847423545e-05,
+      "loss": 1.021,
+      "step": 319400
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.8425928354263306,
+      "learning_rate": 1.4831728872183448e-05,
+      "loss": 1.0206,
+      "step": 319500
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8392213582992554,
+      "learning_rate": 1.481723599272175e-05,
+      "loss": 1.0088,
+      "step": 319600
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.8505594730377197,
+      "learning_rate": 1.480274721487618e-05,
+      "loss": 0.9964,
+      "step": 319700
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.7965133190155029,
+      "learning_rate": 1.4788262544482805e-05,
+      "loss": 1.0288,
+      "step": 319800
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.8193480372428894,
+      "learning_rate": 1.4773781987376061e-05,
+      "loss": 0.9985,
+      "step": 319900
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8430262207984924,
+      "learning_rate": 1.4759305549388708e-05,
+      "loss": 1.0053,
+      "step": 320000
+    },
+    {
+      "epoch": 0.004,
+      "eval_loss": 2.515505790710449,
+      "eval_runtime": 51.658,
+      "eval_samples_per_second": 197.337,
+      "eval_steps_per_second": 1.549,
+      "step": 320000
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.8491013050079346,
+      "learning_rate": 1.4744833236351857e-05,
+      "loss": 1.0021,
+      "step": 320100
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.8557093739509583,
+      "learning_rate": 1.4730365054094947e-05,
+      "loss": 0.9974,
+      "step": 320200
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.8552497625350952,
+      "learning_rate": 1.471590100844577e-05,
+      "loss": 0.9937,
+      "step": 320300
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7959555983543396,
+      "learning_rate": 1.4701441105230435e-05,
+      "loss": 1.0001,
+      "step": 320400
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.8395636081695557,
+      "learning_rate": 1.4686985350273391e-05,
+      "loss": 0.9984,
+      "step": 320500
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.8316648602485657,
+      "learning_rate": 1.4672533749397414e-05,
+      "loss": 0.988,
+      "step": 320600
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 0.8290709853172302,
+      "learning_rate": 1.4658086308423608e-05,
+      "loss": 0.9984,
+      "step": 320700
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8538153767585754,
+      "learning_rate": 1.46436430331714e-05,
+      "loss": 1.0038,
+      "step": 320800
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.828048586845398,
+      "learning_rate": 1.462920392945854e-05,
+      "loss": 0.9952,
+      "step": 320900
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.8509120941162109,
+      "learning_rate": 1.4614769003101097e-05,
+      "loss": 1.0151,
+      "step": 321000
+    },
+    {
+      "epoch": 0.006,
+      "eval_loss": 2.529923677444458,
+      "eval_runtime": 51.641,
+      "eval_samples_per_second": 197.401,
+      "eval_steps_per_second": 1.549,
+      "step": 321000
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.8277125358581543,
+      "learning_rate": 1.460033825991346e-05,
+      "loss": 1.0018,
+      "step": 321100
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8201048374176025,
+      "learning_rate": 1.4585911705708325e-05,
+      "loss": 1.0042,
+      "step": 321200
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.8629177212715149,
+      "learning_rate": 1.4571489346296718e-05,
+      "loss": 1.0076,
+      "step": 321300
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.8436629176139832,
+      "learning_rate": 1.4557071187487945e-05,
+      "loss": 1.0137,
+      "step": 321400
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.9035348892211914,
+      "learning_rate": 1.4542657235089649e-05,
+      "loss": 0.9959,
+      "step": 321500
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.8393178582191467,
+      "learning_rate": 1.4528247494907768e-05,
+      "loss": 1.0055,
+      "step": 321600
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.8507473468780518,
+      "learning_rate": 1.4513841972746555e-05,
+      "loss": 1.0039,
+      "step": 321700
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.8492685556411743,
+      "learning_rate": 1.4499440674408529e-05,
+      "loss": 1.0109,
+      "step": 321800
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 0.8794492483139038,
+      "learning_rate": 1.4485043605694545e-05,
+      "loss": 0.9981,
+      "step": 321900
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9299744963645935,
+      "learning_rate": 1.447065077240374e-05,
+      "loss": 0.999,
+      "step": 322000
+    },
+    {
+      "epoch": 0.008,
+      "eval_loss": 2.534123659133911,
+      "eval_runtime": 51.7664,
+      "eval_samples_per_second": 196.923,
+      "eval_steps_per_second": 1.545,
+      "step": 322000
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 0.8244746923446655,
+      "learning_rate": 1.4456262180333552e-05,
+      "loss": 0.9991,
+      "step": 322100
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.8086799383163452,
+      "learning_rate": 1.4441877835279691e-05,
+      "loss": 0.9995,
+      "step": 322200
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 0.8285476565361023,
+      "learning_rate": 1.4427497743036172e-05,
+      "loss": 1.0018,
+      "step": 322300
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8461373448371887,
+      "learning_rate": 1.4413121909395299e-05,
+      "loss": 0.9767,
+      "step": 322400
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 0.864859938621521,
+      "learning_rate": 1.4398750340147666e-05,
+      "loss": 1.001,
+      "step": 322500
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 0.8466659784317017,
+      "learning_rate": 1.4384383041082117e-05,
+      "loss": 0.9958,
+      "step": 322600
+    },
+    {
+      "epoch": 0.0094,
+      "grad_norm": 0.8037152290344238,
+      "learning_rate": 1.4370020017985807e-05,
+      "loss": 0.9959,
+      "step": 322700
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8187578320503235,
+      "learning_rate": 1.4355661276644178e-05,
+      "loss": 0.9955,
+      "step": 322800
+    },
+    {
+      "epoch": 0.0098,
+      "grad_norm": 0.8383049368858337,
+      "learning_rate": 1.43413068228409e-05,
+      "loss": 0.9861,
+      "step": 322900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8338568210601807,
+      "learning_rate": 1.432695666235796e-05,
+      "loss": 0.9907,
+      "step": 323000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.5478382110595703,
+      "eval_runtime": 51.7181,
+      "eval_samples_per_second": 197.107,
+      "eval_steps_per_second": 1.547,
+      "step": 323000
+    },
+    {
+      "epoch": 0.0102,
+      "grad_norm": 0.9476732611656189,
+      "learning_rate": 1.4312610800975602e-05,
+      "loss": 0.9817,
+      "step": 323100
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8296193480491638,
+      "learning_rate": 1.429826924447234e-05,
+      "loss": 0.9883,
+      "step": 323200
+    },
+    {
+      "epoch": 0.0106,
+      "grad_norm": 0.8237991333007812,
+      "learning_rate": 1.4283931998624938e-05,
+      "loss": 0.9966,
+      "step": 323300
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 0.8200727701187134,
+      "learning_rate": 1.426959906920845e-05,
+      "loss": 0.9925,
+      "step": 323400
+    },
+    {
+      "epoch": 0.011,
+      "grad_norm": 0.7869872450828552,
+      "learning_rate": 1.4255270461996171e-05,
+      "loss": 0.9913,
+      "step": 323500
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8540888428688049,
+      "learning_rate": 1.4240946182759673e-05,
+      "loss": 0.9851,
+      "step": 323600
+    },
+    {
+      "epoch": 0.0114,
+      "grad_norm": 0.9450783729553223,
+      "learning_rate": 1.4226626237268758e-05,
+      "loss": 0.9841,
+      "step": 323700
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 0.8994350433349609,
+      "learning_rate": 1.421231063129151e-05,
+      "loss": 0.9751,
+      "step": 323800
+    },
+    {
+      "epoch": 0.0118,
+      "grad_norm": 0.9152923822402954,
+      "learning_rate": 1.4197999370594246e-05,
+      "loss": 0.9788,
+      "step": 323900
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.8692894577980042,
+      "learning_rate": 1.418369246094155e-05,
+      "loss": 0.9692,
+      "step": 324000
+    },
+    {
+      "epoch": 0.012,
+      "eval_loss": 2.554483652114868,
+      "eval_runtime": 51.714,
+      "eval_samples_per_second": 197.123,
+      "eval_steps_per_second": 1.547,
+      "step": 324000
+    },
+    {
+      "epoch": 0.0122,
+      "grad_norm": 0.8307340145111084,
+      "learning_rate": 1.4169389908096232e-05,
+      "loss": 0.9791,
+      "step": 324100
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 0.8067870736122131,
+      "learning_rate": 1.4155091717819363e-05,
+      "loss": 0.977,
+      "step": 324200
+    },
+    {
+      "epoch": 0.0126,
+      "grad_norm": 0.904922604560852,
+      "learning_rate": 1.414079789587025e-05,
+      "loss": 0.9615,
+      "step": 324300
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8454153537750244,
+      "learning_rate": 1.4126508448006459e-05,
+      "loss": 0.9681,
+      "step": 324400
+    },
+    {
+      "epoch": 0.013,
+      "grad_norm": 0.8959038257598877,
+      "learning_rate": 1.4112223379983755e-05,
+      "loss": 0.9746,
+      "step": 324500
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 0.9153333306312561,
+      "learning_rate": 1.4097942697556172e-05,
+      "loss": 0.9728,
+      "step": 324600
+    },
+    {
+      "epoch": 0.0134,
+      "grad_norm": 0.809781551361084,
+      "learning_rate": 1.4083666406475976e-05,
+      "loss": 0.964,
+      "step": 324700
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.8854051232337952,
+      "learning_rate": 1.4069394512493634e-05,
+      "loss": 0.9826,
+      "step": 324800
+    },
+    {
+      "epoch": 0.0138,
+      "grad_norm": 0.8811824917793274,
+      "learning_rate": 1.4055127021357877e-05,
+      "loss": 0.9809,
+      "step": 324900
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 0.8924720883369446,
+      "learning_rate": 1.4040863938815645e-05,
+      "loss": 0.9611,
+      "step": 325000
+    },
+    {
+      "epoch": 0.014,
+      "eval_loss": 2.559173583984375,
+      "eval_runtime": 51.7882,
+      "eval_samples_per_second": 196.84,
+      "eval_steps_per_second": 1.545,
+      "step": 325000
+    },
+    {
+      "epoch": 0.0142,
+      "grad_norm": 0.8205790519714355,
+      "learning_rate": 1.402660527061212e-05,
+      "loss": 0.9903,
+      "step": 325100
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8341870903968811,
+      "learning_rate": 1.4012351022490672e-05,
+      "loss": 0.9615,
+      "step": 325200
+    },
+    {
+      "epoch": 0.0146,
+      "grad_norm": 0.8305156230926514,
+      "learning_rate": 1.3998101200192915e-05,
+      "loss": 0.9627,
+      "step": 325300
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 0.9122214317321777,
+      "learning_rate": 1.398385580945868e-05,
+      "loss": 0.9129,
+      "step": 325400
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.868425190448761,
+      "learning_rate": 1.3969614856026014e-05,
+      "loss": 0.968,
+      "step": 325500
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.8120792508125305,
+      "learning_rate": 1.3955378345631159e-05,
+      "loss": 0.9689,
+      "step": 325600
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 0.8308644890785217,
+      "learning_rate": 1.3941146284008582e-05,
+      "loss": 0.9404,
+      "step": 325700
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.7607423663139343,
+      "learning_rate": 1.3926918676890965e-05,
+      "loss": 0.9587,
+      "step": 325800
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 0.8530341386795044,
+      "learning_rate": 1.3912695530009184e-05,
+      "loss": 0.9584,
+      "step": 325900
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8315464854240417,
+      "learning_rate": 1.3898476849092312e-05,
+      "loss": 0.9507,
+      "step": 326000
+    },
+    {
+      "epoch": 0.016,
+      "eval_loss": 2.574967861175537,
+      "eval_runtime": 52.1092,
+      "eval_samples_per_second": 195.628,
+      "eval_steps_per_second": 1.535,
+      "step": 326000
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 0.87019944190979,
+      "learning_rate": 1.3884262639867638e-05,
+      "loss": 0.7316,
+      "step": 326100
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 0.8352780342102051,
+      "learning_rate": 1.3870052908060651e-05,
+      "loss": 0.7268,
+      "step": 326200
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.9428650736808777,
+      "learning_rate": 1.3855847659395013e-05,
+      "loss": 0.717,
+      "step": 326300
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.0137333869934082,
+      "learning_rate": 1.3841646899592603e-05,
+      "loss": 0.7362,
+      "step": 326400
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 0.9063905477523804,
+      "learning_rate": 1.382745063437349e-05,
+      "loss": 0.7192,
+      "step": 326500
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.8576821088790894,
+      "learning_rate": 1.3813258869455936e-05,
+      "loss": 0.72,
+      "step": 326600
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 0.8997663259506226,
+      "learning_rate": 1.3799071610556358e-05,
+      "loss": 0.7216,
+      "step": 326700
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8130722641944885,
+      "learning_rate": 1.37848888633894e-05,
+      "loss": 0.7251,
+      "step": 326800
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.9513541460037231,
+      "learning_rate": 1.3770710633667863e-05,
+      "loss": 0.7245,
+      "step": 326900
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.8725600838661194,
+      "learning_rate": 1.3756536927102753e-05,
+      "loss": 0.7186,
+      "step": 327000
+    },
+    {
+      "epoch": 0.002,
+      "eval_loss": 2.0350961685180664,
+      "eval_runtime": 51.8928,
+      "eval_samples_per_second": 196.444,
+      "eval_steps_per_second": 1.542,
+      "step": 327000
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.9190706610679626,
+      "learning_rate": 1.3742367749403212e-05,
+      "loss": 0.7326,
+      "step": 327100
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.8598017692565918,
+      "learning_rate": 1.3728203106276594e-05,
+      "loss": 0.7282,
+      "step": 327200
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.833091139793396,
+      "learning_rate": 1.371404300342842e-05,
+      "loss": 0.7183,
+      "step": 327300
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.8222286105155945,
+      "learning_rate": 1.3699887446562382e-05,
+      "loss": 0.7139,
+      "step": 327400
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.8653368353843689,
+      "learning_rate": 1.368573644138032e-05,
+      "loss": 0.7237,
+      "step": 327500
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9050326943397522,
+      "learning_rate": 1.3671589993582268e-05,
+      "loss": 0.7282,
+      "step": 327600
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.9215336441993713,
+      "learning_rate": 1.3657448108866423e-05,
+      "loss": 0.7107,
+      "step": 327700
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.8540416359901428,
+      "learning_rate": 1.364331079292911e-05,
+      "loss": 0.7176,
+      "step": 327800
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.8809969425201416,
+      "learning_rate": 1.3629178051464858e-05,
+      "loss": 0.7223,
+      "step": 327900
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8728992342948914,
+      "learning_rate": 1.3615049890166323e-05,
+      "loss": 0.7169,
+      "step": 328000
+    },
+    {
+      "epoch": 0.004,
+      "eval_loss": 2.0252230167388916,
+      "eval_runtime": 51.5937,
+      "eval_samples_per_second": 197.582,
+      "eval_steps_per_second": 1.551,
+      "step": 328000
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 1.0202641487121582,
+      "learning_rate": 1.360092631472433e-05,
+      "loss": 0.7341,
+      "step": 328100
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.8477998375892639,
+      "learning_rate": 1.3586807330827861e-05,
+      "loss": 0.7145,
+      "step": 328200
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.8075670599937439,
+      "learning_rate": 1.3572692944164029e-05,
+      "loss": 0.7198,
+      "step": 328300
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8715834021568298,
+      "learning_rate": 1.3558583160418109e-05,
+      "loss": 0.7202,
+      "step": 328400
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.8973333239555359,
+      "learning_rate": 1.3544477985273524e-05,
+      "loss": 0.7165,
+      "step": 328500
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.923931360244751,
+      "learning_rate": 1.3530377424411849e-05,
+      "loss": 0.7214,
+      "step": 328600
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 0.9258859753608704,
+      "learning_rate": 1.3516281483512765e-05,
+      "loss": 0.7255,
+      "step": 328700
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8883686661720276,
+      "learning_rate": 1.3502190168254125e-05,
+      "loss": 0.713,
+      "step": 328800
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.8454500436782837,
+      "learning_rate": 1.348810348431191e-05,
+      "loss": 0.7117,
+      "step": 328900
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.9518053531646729,
+      "learning_rate": 1.3474021437360245e-05,
+      "loss": 0.7189,
+      "step": 329000
+    },
+    {
+      "epoch": 0.006,
+      "eval_loss": 2.032439708709717,
+      "eval_runtime": 51.733,
+      "eval_samples_per_second": 197.05,
+      "eval_steps_per_second": 1.546,
+      "step": 329000
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.878307044506073,
+      "learning_rate": 1.345994403307136e-05,
+      "loss": 0.7136,
+      "step": 329100
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8827186226844788,
+      "learning_rate": 1.3445871277115635e-05,
+      "loss": 0.7237,
+      "step": 329200
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.8805004954338074,
+      "learning_rate": 1.3431803175161586e-05,
+      "loss": 0.7024,
+      "step": 329300
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.8745920062065125,
+      "learning_rate": 1.3417739732875829e-05,
+      "loss": 0.7175,
+      "step": 329400
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.8587835431098938,
+      "learning_rate": 1.340368095592312e-05,
+      "loss": 0.7054,
+      "step": 329500
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.8374196290969849,
+      "learning_rate": 1.3389626849966335e-05,
+      "loss": 0.7107,
+      "step": 329600
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.929682731628418,
+      "learning_rate": 1.3375577420666477e-05,
+      "loss": 0.7183,
+      "step": 329700
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.8738675713539124,
+      "learning_rate": 1.3361532673682633e-05,
+      "loss": 0.7236,
+      "step": 329800
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 0.8550043106079102,
+      "learning_rate": 1.3347492614672039e-05,
+      "loss": 0.7107,
+      "step": 329900
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9196627736091614,
+      "learning_rate": 1.3333457249290024e-05,
+      "loss": 0.716,
+      "step": 330000
+    },
+    {
+      "epoch": 0.008,
+      "eval_loss": 2.035661220550537,
+      "eval_runtime": 51.7487,
+      "eval_samples_per_second": 196.99,
+      "eval_steps_per_second": 1.546,
+      "step": 330000
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 0.8340585231781006,
+      "learning_rate": 1.3319426583190042e-05,
+      "loss": 0.7279,
+      "step": 330100
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.858969509601593,
+      "learning_rate": 1.3305400622023628e-05,
+      "loss": 0.716,
+      "step": 330200
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 0.9872186183929443,
+      "learning_rate": 1.3291379371440446e-05,
+      "loss": 0.7278,
+      "step": 330300
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8357021808624268,
+      "learning_rate": 1.3277362837088252e-05,
+      "loss": 0.7057,
+      "step": 330400
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 0.8592823147773743,
+      "learning_rate": 1.3263351024612914e-05,
+      "loss": 0.7107,
+      "step": 330500
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 0.8655655384063721,
+      "learning_rate": 1.3249343939658371e-05,
+      "loss": 0.7093,
+      "step": 330600
+    },
+    {
+      "epoch": 0.0094,
+      "grad_norm": 0.8590738773345947,
+      "learning_rate": 1.3235341587866684e-05,
+      "loss": 0.7073,
+      "step": 330700
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8633531332015991,
+      "learning_rate": 1.322134397487801e-05,
+      "loss": 0.7129,
+      "step": 330800
+    },
+    {
+      "epoch": 0.0098,
+      "grad_norm": 0.8816627264022827,
+      "learning_rate": 1.3207351106330559e-05,
+      "loss": 0.7114,
+      "step": 330900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9330505132675171,
+      "learning_rate": 1.3193362987860675e-05,
+      "loss": 0.7059,
+      "step": 331000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.0230836868286133,
+      "eval_runtime": 51.7504,
+      "eval_samples_per_second": 196.984,
+      "eval_steps_per_second": 1.546,
+      "step": 331000
+    },
+    {
+      "epoch": 0.0102,
+      "grad_norm": 0.8758464455604553,
+      "learning_rate": 1.317937962510277e-05,
+      "loss": 0.7078,
+      "step": 331100
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.9444248676300049,
+      "learning_rate": 1.3165401023689344e-05,
+      "loss": 0.7174,
+      "step": 331200
+    },
+    {
+      "epoch": 0.0106,
+      "grad_norm": 0.8706777095794678,
+      "learning_rate": 1.3151427189250965e-05,
+      "loss": 0.7058,
+      "step": 331300
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 0.8867092132568359,
+      "learning_rate": 1.3137458127416297e-05,
+      "loss": 0.7058,
+      "step": 331400
+    },
+    {
+      "epoch": 0.011,
+      "grad_norm": 0.968101978302002,
+      "learning_rate": 1.3123493843812074e-05,
+      "loss": 0.7212,
+      "step": 331500
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8708505630493164,
+      "learning_rate": 1.3109534344063118e-05,
+      "loss": 0.7175,
+      "step": 331600
+    },
+    {
+      "epoch": 0.0114,
+      "grad_norm": 0.910325288772583,
+      "learning_rate": 1.30955796337923e-05,
+      "loss": 0.7078,
+      "step": 331700
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 0.8591578006744385,
+      "learning_rate": 1.308162971862058e-05,
+      "loss": 0.7101,
+      "step": 331800
+    },
+    {
+      "epoch": 0.0118,
+      "grad_norm": 0.9007583260536194,
+      "learning_rate": 1.3067684604166988e-05,
+      "loss": 0.7157,
+      "step": 331900
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.9580846428871155,
+      "learning_rate": 1.3053744296048617e-05,
+      "loss": 0.7102,
+      "step": 332000
+    },
+    {
+      "epoch": 0.012,
+      "eval_loss": 2.037156581878662,
+      "eval_runtime": 51.5881,
+      "eval_samples_per_second": 197.604,
+      "eval_steps_per_second": 1.551,
+      "step": 332000
+    },
+    {
+      "epoch": 0.0122,
+      "grad_norm": 0.8679760098457336,
+      "learning_rate": 1.3039808799880604e-05,
+      "loss": 0.7144,
+      "step": 332100
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 0.8794786334037781,
+      "learning_rate": 1.302587812127618e-05,
+      "loss": 0.7089,
+      "step": 332200
+    },
+    {
+      "epoch": 0.0126,
+      "grad_norm": 0.855987548828125,
+      "learning_rate": 1.3011952265846626e-05,
+      "loss": 0.7164,
+      "step": 332300
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8838660717010498,
+      "learning_rate": 1.2998031239201252e-05,
+      "loss": 0.7166,
+      "step": 332400
+    },
+    {
+      "epoch": 0.013,
+      "grad_norm": 0.8379763960838318,
+      "learning_rate": 1.2984115046947463e-05,
+      "loss": 0.7168,
+      "step": 332500
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 0.8760377764701843,
+      "learning_rate": 1.2970203694690694e-05,
+      "loss": 0.7106,
+      "step": 332600
+    },
+    {
+      "epoch": 0.0134,
+      "grad_norm": 0.8472399711608887,
+      "learning_rate": 1.295629718803445e-05,
+      "loss": 0.7118,
+      "step": 332700
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.8849984407424927,
+      "learning_rate": 1.2942395532580247e-05,
+      "loss": 0.7207,
+      "step": 332800
+    },
+    {
+      "epoch": 0.0138,
+      "grad_norm": 0.8308677077293396,
+      "learning_rate": 1.2928498733927682e-05,
+      "loss": 0.7004,
+      "step": 332900
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 0.9149287343025208,
+      "learning_rate": 1.2914606797674384e-05,
+      "loss": 0.7088,
+      "step": 333000
+    },
+    {
+      "epoch": 0.014,
+      "eval_loss": 2.029548168182373,
+      "eval_runtime": 51.6647,
+      "eval_samples_per_second": 197.311,
+      "eval_steps_per_second": 1.548,
+      "step": 333000
+    },
+    {
+      "epoch": 0.0142,
+      "grad_norm": 0.8902376890182495,
+      "learning_rate": 1.2900719729416033e-05,
+      "loss": 0.7095,
+      "step": 333100
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9412351250648499,
+      "learning_rate": 1.2886837534746316e-05,
+      "loss": 0.7186,
+      "step": 333200
+    },
+    {
+      "epoch": 0.0146,
+      "grad_norm": 0.8445390462875366,
+      "learning_rate": 1.2872960219256992e-05,
+      "loss": 0.7093,
+      "step": 333300
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 0.8830252289772034,
+      "learning_rate": 1.2859087788537844e-05,
+      "loss": 0.7074,
+      "step": 333400
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.8642695546150208,
+      "learning_rate": 1.284522024817669e-05,
+      "loss": 0.7146,
+      "step": 333500
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.9142852425575256,
+      "learning_rate": 1.2831357603759358e-05,
+      "loss": 0.7126,
+      "step": 333600
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 0.9412261247634888,
+      "learning_rate": 1.2817499860869725e-05,
+      "loss": 0.7105,
+      "step": 333700
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.8529816269874573,
+      "learning_rate": 1.2803647025089705e-05,
+      "loss": 0.7086,
+      "step": 333800
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 0.8930657505989075,
+      "learning_rate": 1.2789799101999194e-05,
+      "loss": 0.7148,
+      "step": 333900
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9034160375595093,
+      "learning_rate": 1.2775956097176142e-05,
+      "loss": 0.7138,
+      "step": 334000
+    },
+    {
+      "epoch": 0.016,
+      "eval_loss": 2.034317970275879,
+      "eval_runtime": 52.1314,
+      "eval_samples_per_second": 195.544,
+      "eval_steps_per_second": 1.535,
+      "step": 334000
+    },
+    {
+      "epoch": 0.0162,
+      "grad_norm": 0.7935868501663208,
+      "learning_rate": 1.2762118016196514e-05,
+      "loss": 0.7061,
+      "step": 334100
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 0.8745686411857605,
+      "learning_rate": 1.2748284864634296e-05,
+      "loss": 0.7079,
+      "step": 334200
+    },
+    {
+      "epoch": 0.0166,
+      "grad_norm": 0.8833600878715515,
+      "learning_rate": 1.273445664806146e-05,
+      "loss": 0.7103,
+      "step": 334300
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.9068960547447205,
+      "learning_rate": 1.272063337204802e-05,
+      "loss": 0.7001,
+      "step": 334400
+    },
+    {
+      "epoch": 0.017,
+      "grad_norm": 0.8197974562644958,
+      "learning_rate": 1.2706815042161984e-05,
+      "loss": 0.7052,
+      "step": 334500
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 0.8796073794364929,
+      "learning_rate": 1.2693001663969395e-05,
+      "loss": 0.7123,
+      "step": 334600
+    },
+    {
+      "epoch": 0.0174,
+      "grad_norm": 0.883787989616394,
+      "learning_rate": 1.2679193243034249e-05,
+      "loss": 0.7028,
+      "step": 334700
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.885678768157959,
+      "learning_rate": 1.2665389784918597e-05,
+      "loss": 0.696,
+      "step": 334800
+    },
+    {
+      "epoch": 0.0178,
+      "grad_norm": 0.895122766494751,
+      "learning_rate": 1.2651591295182457e-05,
+      "loss": 0.7095,
+      "step": 334900
+    },
+    {
+      "epoch": 0.018,
+      "grad_norm": 0.8656454086303711,
+      "learning_rate": 1.2637797779383881e-05,
+      "loss": 0.7098,
+      "step": 335000
+    },
+    {
+      "epoch": 0.018,
+      "eval_loss": 2.041609764099121,
+      "eval_runtime": 51.8364,
+      "eval_samples_per_second": 196.657,
+      "eval_steps_per_second": 1.543,
+      "step": 335000
+    },
+    {
+      "epoch": 0.0182,
+      "grad_norm": 0.8860552906990051,
+      "learning_rate": 1.2624009243078872e-05,
+      "loss": 0.7323,
+      "step": 335100
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.9041178226470947,
+      "learning_rate": 1.261022569182146e-05,
+      "loss": 0.7102,
+      "step": 335200
+    },
+    {
+      "epoch": 0.0186,
+      "grad_norm": 0.8467496037483215,
+      "learning_rate": 1.2596447131163657e-05,
+      "loss": 0.7061,
+      "step": 335300
+    },
+    {
+      "epoch": 0.0188,
+      "grad_norm": 0.8838053941726685,
+      "learning_rate": 1.2582673566655474e-05,
+      "loss": 0.7032,
+      "step": 335400
+    },
+    {
+      "epoch": 0.019,
+      "grad_norm": 0.8892683982849121,
+      "learning_rate": 1.2568905003844885e-05,
+      "loss": 0.7032,
+      "step": 335500
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8915057182312012,
+      "learning_rate": 1.2555141448277874e-05,
+      "loss": 0.7162,
+      "step": 335600
+    },
+    {
+      "epoch": 0.0194,
+      "grad_norm": 0.8544843196868896,
+      "learning_rate": 1.2541382905498411e-05,
+      "loss": 0.6972,
+      "step": 335700
+    },
+    {
+      "epoch": 0.0196,
+      "grad_norm": 0.9270769953727722,
+      "learning_rate": 1.2527629381048411e-05,
+      "loss": 0.6981,
+      "step": 335800
+    },
+    {
+      "epoch": 0.0198,
+      "grad_norm": 1.0345507860183716,
+      "learning_rate": 1.2513880880467807e-05,
+      "loss": 0.6987,
+      "step": 335900
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8447591662406921,
+      "learning_rate": 1.2500137409294488e-05,
+      "loss": 0.7021,
+      "step": 336000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 2.0394201278686523,
+      "eval_runtime": 52.0024,
+      "eval_samples_per_second": 196.029,
+      "eval_steps_per_second": 1.538,
+      "step": 336000
+    },
+    {
+      "epoch": 0.0202,
+      "grad_norm": 0.871530294418335,
+      "learning_rate": 1.2486398973064339e-05,
+      "loss": 0.7097,
+      "step": 336100
+    },
+    {
+      "epoch": 0.0204,
+      "grad_norm": 0.8515340089797974,
+      "learning_rate": 1.2472665577311176e-05,
+      "loss": 0.705,
+      "step": 336200
+    },
+    {
+      "epoch": 0.0206,
+      "grad_norm": 0.8740963339805603,
+      "learning_rate": 1.2458937227566819e-05,
+      "loss": 0.7004,
+      "step": 336300
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.8944967985153198,
+      "learning_rate": 1.244521392936106e-05,
+      "loss": 0.6948,
+      "step": 336400
+    },
+    {
+      "epoch": 0.021,
+      "grad_norm": 0.8867557644844055,
+      "learning_rate": 1.2431495688221618e-05,
+      "loss": 0.7037,
+      "step": 336500
+    },
+    {
+      "epoch": 0.0212,
+      "grad_norm": 0.925564706325531,
+      "learning_rate": 1.2417782509674216e-05,
+      "loss": 0.6971,
+      "step": 336600
+    },
+    {
+      "epoch": 0.0214,
+      "grad_norm": 0.8457061052322388,
+      "learning_rate": 1.240407439924251e-05,
+      "loss": 0.7007,
+      "step": 336700
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.8768745064735413,
+      "learning_rate": 1.2390371362448125e-05,
+      "loss": 0.7015,
+      "step": 336800
+    },
+    {
+      "epoch": 0.0218,
+      "grad_norm": 0.8154018521308899,
+      "learning_rate": 1.237667340481066e-05,
+      "loss": 0.6984,
+      "step": 336900
+    },
+    {
+      "epoch": 0.022,
+      "grad_norm": 0.8525890707969666,
+      "learning_rate": 1.2362980531847626e-05,
+      "loss": 0.6991,
+      "step": 337000
+    },
+    {
+      "epoch": 0.022,
+      "eval_loss": 2.052788496017456,
+      "eval_runtime": 52.066,
+      "eval_samples_per_second": 195.79,
+      "eval_steps_per_second": 1.537,
+      "step": 337000
+    },
+    {
+      "epoch": 0.0222,
+      "grad_norm": 0.8477676510810852,
+      "learning_rate": 1.2349292749074526e-05,
+      "loss": 0.6756,
+      "step": 337100
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8637740612030029,
+      "learning_rate": 1.233561006200479e-05,
+      "loss": 0.7043,
+      "step": 337200
+    },
+    {
+      "epoch": 0.0226,
+      "grad_norm": 0.9340733885765076,
+      "learning_rate": 1.232193247614982e-05,
+      "loss": 0.697,
+      "step": 337300
+    },
+    {
+      "epoch": 0.0228,
+      "grad_norm": 0.8994996547698975,
+      "learning_rate": 1.230825999701892e-05,
+      "loss": 0.6975,
+      "step": 337400
+    },
+    {
+      "epoch": 0.023,
+      "grad_norm": 0.9119468331336975,
+      "learning_rate": 1.2294592630119375e-05,
+      "loss": 0.695,
+      "step": 337500
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.8722793459892273,
+      "learning_rate": 1.2280930380956402e-05,
+      "loss": 0.694,
+      "step": 337600
+    },
+    {
+      "epoch": 0.0234,
+      "grad_norm": 0.9214362502098083,
+      "learning_rate": 1.2267273255033157e-05,
+      "loss": 0.7004,
+      "step": 337700
+    },
+    {
+      "epoch": 0.0236,
+      "grad_norm": 0.928554892539978,
+      "learning_rate": 1.2253621257850714e-05,
+      "loss": 0.6978,
+      "step": 337800
+    },
+    {
+      "epoch": 0.0238,
+      "grad_norm": 0.952670693397522,
+      "learning_rate": 1.2239974394908102e-05,
+      "loss": 0.7041,
+      "step": 337900
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.8799007534980774,
+      "learning_rate": 1.2226332671702282e-05,
+      "loss": 0.689,
+      "step": 338000
+    },
+    {
+      "epoch": 0.024,
+      "eval_loss": 2.0413691997528076,
+      "eval_runtime": 52.1051,
+      "eval_samples_per_second": 195.643,
+      "eval_steps_per_second": 1.535,
+      "step": 338000
+    },
+    {
+      "epoch": 0.0242,
+      "grad_norm": 1.0080713033676147,
+      "learning_rate": 1.2212696093728141e-05,
+      "loss": 0.7069,
+      "step": 338100
+    },
+    {
+      "epoch": 0.0244,
+      "grad_norm": 0.9208382964134216,
+      "learning_rate": 1.2199064666478474e-05,
+      "loss": 0.7086,
+      "step": 338200
+    },
+    {
+      "epoch": 0.0246,
+      "grad_norm": 0.9424040913581848,
+      "learning_rate": 1.2185438395444029e-05,
+      "loss": 0.699,
+      "step": 338300
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.9521956443786621,
+      "learning_rate": 1.2171817286113476e-05,
+      "loss": 0.6972,
+      "step": 338400
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.9072422385215759,
+      "learning_rate": 1.2158201343973377e-05,
+      "loss": 0.686,
+      "step": 338500
+    },
+    {
+      "epoch": 0.0252,
+      "grad_norm": 0.8915525078773499,
+      "learning_rate": 1.2144590574508241e-05,
+      "loss": 0.6992,
+      "step": 338600
+    },
+    {
+      "epoch": 0.0254,
+      "grad_norm": 0.8471651673316956,
+      "learning_rate": 1.2130984983200486e-05,
+      "loss": 0.6933,
+      "step": 338700
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.8865765929222107,
+      "learning_rate": 1.2117384575530446e-05,
+      "loss": 0.6899,
+      "step": 338800
+    },
+    {
+      "epoch": 0.0258,
+      "grad_norm": 0.8501929640769958,
+      "learning_rate": 1.2103789356976353e-05,
+      "loss": 0.6942,
+      "step": 338900
+    },
+    {
+      "epoch": 0.026,
+      "grad_norm": 0.8459005951881409,
+      "learning_rate": 1.2090199333014363e-05,
+      "loss": 0.6883,
+      "step": 339000
+    },
+    {
+      "epoch": 0.026,
+      "eval_loss": 2.0509862899780273,
+      "eval_runtime": 52.0225,
+      "eval_samples_per_second": 195.954,
+      "eval_steps_per_second": 1.538,
+      "step": 339000
+    },
+    {
+      "epoch": 0.0262,
+      "grad_norm": 0.8724320530891418,
+      "learning_rate": 1.2076614509118537e-05,
+      "loss": 0.6903,
+      "step": 339100
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.8801888227462769,
+      "learning_rate": 1.206303489076085e-05,
+      "loss": 0.6934,
+      "step": 339200
+    },
+    {
+      "epoch": 0.0266,
+      "grad_norm": 0.9318116903305054,
+      "learning_rate": 1.2049460483411154e-05,
+      "loss": 0.6909,
+      "step": 339300
+    },
+    {
+      "epoch": 0.0268,
+      "grad_norm": 0.892590343952179,
+      "learning_rate": 1.2035891292537228e-05,
+      "loss": 0.6931,
+      "step": 339400
+    },
+    {
+      "epoch": 0.027,
+      "grad_norm": 0.8987374901771545,
+      "learning_rate": 1.2022327323604735e-05,
+      "loss": 0.682,
+      "step": 339500
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.9365059733390808,
+      "learning_rate": 1.2008768582077257e-05,
+      "loss": 0.6849,
+      "step": 339600
+    },
+    {
+      "epoch": 0.0274,
+      "grad_norm": 0.908043384552002,
+      "learning_rate": 1.199521507341623e-05,
+      "loss": 0.6959,
+      "step": 339700
+    },
+    {
+      "epoch": 0.0276,
+      "grad_norm": 0.9550427794456482,
+      "learning_rate": 1.1981666803081015e-05,
+      "loss": 0.6928,
+      "step": 339800
+    },
+    {
+      "epoch": 0.0278,
+      "grad_norm": 0.9074381589889526,
+      "learning_rate": 1.1968123776528855e-05,
+      "loss": 0.6907,
+      "step": 339900
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.8685894012451172,
+      "learning_rate": 1.195458599921489e-05,
+      "loss": 0.6792,
+      "step": 340000
+    },
+    {
+      "epoch": 0.028,
+      "eval_loss": 2.0385003089904785,
+      "eval_runtime": 51.8057,
+      "eval_samples_per_second": 196.774,
+      "eval_steps_per_second": 1.544,
+      "step": 340000
+    },
+    {
+      "epoch": 0.0282,
+      "grad_norm": 0.8610774874687195,
+      "learning_rate": 1.1941053476592115e-05,
+      "loss": 0.6883,
+      "step": 340100
+    },
+    {
+      "epoch": 0.0284,
+      "grad_norm": 0.8860583901405334,
+      "learning_rate": 1.192752621411144e-05,
+      "loss": 0.6962,
+      "step": 340200
+    },
+    {
+      "epoch": 0.0286,
+      "grad_norm": 0.872675895690918,
+      "learning_rate": 1.191400421722165e-05,
+      "loss": 0.6941,
+      "step": 340300
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.9522199630737305,
+      "learning_rate": 1.1900487491369386e-05,
+      "loss": 0.6885,
+      "step": 340400
+    },
+    {
+      "epoch": 0.029,
+      "grad_norm": 0.8697762489318848,
+      "learning_rate": 1.1886976041999196e-05,
+      "loss": 0.688,
+      "step": 340500
+    },
+    {
+      "epoch": 0.0292,
+      "grad_norm": 0.939105749130249,
+      "learning_rate": 1.1873469874553486e-05,
+      "loss": 0.677,
+      "step": 340600
+    },
+    {
+      "epoch": 0.0294,
+      "grad_norm": 0.8797380328178406,
+      "learning_rate": 1.1859968994472551e-05,
+      "loss": 0.681,
+      "step": 340700
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.9089605212211609,
+      "learning_rate": 1.1846473407194522e-05,
+      "loss": 0.6916,
+      "step": 340800
+    },
+    {
+      "epoch": 0.0298,
+      "grad_norm": 0.8749380111694336,
+      "learning_rate": 1.1832983118155436e-05,
+      "loss": 0.6855,
+      "step": 340900
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.8690641522407532,
+      "learning_rate": 1.1819498132789173e-05,
+      "loss": 0.6923,
+      "step": 341000
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 2.083442449569702,
+      "eval_runtime": 52.125,
+      "eval_samples_per_second": 195.568,
+      "eval_steps_per_second": 1.535,
+      "step": 341000
+    },
+    {
+      "epoch": 0.0302,
+      "grad_norm": 0.8813545107841492,
+      "learning_rate": 1.1806018456527495e-05,
+      "loss": 0.679,
+      "step": 341100
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.8487153649330139,
+      "learning_rate": 1.1792544094799995e-05,
+      "loss": 0.6851,
+      "step": 341200
+    },
+    {
+      "epoch": 0.0306,
+      "grad_norm": 0.8273248672485352,
+      "learning_rate": 1.1779075053034155e-05,
+      "loss": 0.6807,
+      "step": 341300
+    },
+    {
+      "epoch": 0.0308,
+      "grad_norm": 0.879425585269928,
+      "learning_rate": 1.1765611336655305e-05,
+      "loss": 0.6816,
+      "step": 341400
+    },
+    {
+      "epoch": 0.031,
+      "grad_norm": 0.8405166268348694,
+      "learning_rate": 1.1752152951086631e-05,
+      "loss": 0.6762,
+      "step": 341500
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.8572484254837036,
+      "learning_rate": 1.1738699901749157e-05,
+      "loss": 0.692,
+      "step": 341600
+    },
+    {
+      "epoch": 0.0314,
+      "grad_norm": 0.9151760935783386,
+      "learning_rate": 1.1725252194061775e-05,
+      "loss": 0.683,
+      "step": 341700
+    },
+    {
+      "epoch": 0.0316,
+      "grad_norm": 0.9075136780738831,
+      "learning_rate": 1.1711809833441235e-05,
+      "loss": 0.6859,
+      "step": 341800
+    },
+    {
+      "epoch": 0.0318,
+      "grad_norm": 0.9236798882484436,
+      "learning_rate": 1.1698372825302093e-05,
+      "loss": 0.6901,
+      "step": 341900
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.8434112071990967,
+      "learning_rate": 1.1684941175056785e-05,
+      "loss": 0.6844,
+      "step": 342000
+    },
+    {
+      "epoch": 0.032,
+      "eval_loss": 2.0505099296569824,
+      "eval_runtime": 52.0084,
+      "eval_samples_per_second": 196.007,
+      "eval_steps_per_second": 1.538,
+      "step": 342000
+    },
+    {
+      "epoch": 0.0322,
+      "grad_norm": 0.9039320349693298,
+      "learning_rate": 1.1671514888115582e-05,
+      "loss": 0.6859,
+      "step": 342100
+    },
+    {
+      "epoch": 0.0324,
+      "grad_norm": 0.8539577126502991,
+      "learning_rate": 1.1658093969886596e-05,
+      "loss": 0.6734,
+      "step": 342200
+    },
+    {
+      "epoch": 0.0326,
+      "grad_norm": 0.8575844168663025,
+      "learning_rate": 1.1644678425775755e-05,
+      "loss": 0.6762,
+      "step": 342300
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.9679238200187683,
+      "learning_rate": 1.1631268261186845e-05,
+      "loss": 0.676,
+      "step": 342400
+    },
+    {
+      "epoch": 0.033,
+      "grad_norm": 0.8782627582550049,
+      "learning_rate": 1.1617863481521483e-05,
+      "loss": 0.6758,
+      "step": 342500
+    },
+    {
+      "epoch": 0.0332,
+      "grad_norm": 0.9136931300163269,
+      "learning_rate": 1.1604464092179118e-05,
+      "loss": 0.6818,
+      "step": 342600
+    },
+    {
+      "epoch": 0.0334,
+      "grad_norm": 0.8847256302833557,
+      "learning_rate": 1.1591070098557006e-05,
+      "loss": 0.6728,
+      "step": 342700
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.8676889538764954,
+      "learning_rate": 1.1577681506050253e-05,
+      "loss": 0.682,
+      "step": 342800
+    },
+    {
+      "epoch": 0.0338,
+      "grad_norm": 0.8871778845787048,
+      "learning_rate": 1.1564298320051787e-05,
+      "loss": 0.6748,
+      "step": 342900
+    },
+    {
+      "epoch": 0.034,
+      "grad_norm": 0.9254991412162781,
+      "learning_rate": 1.155092054595236e-05,
+      "loss": 0.6791,
+      "step": 343000
+    },
+    {
+      "epoch": 0.034,
+      "eval_loss": 2.0616466999053955,
+      "eval_runtime": 52.253,
+      "eval_samples_per_second": 195.089,
+      "eval_steps_per_second": 1.531,
+      "step": 343000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 2.993443664874701e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f0dc725694ec7d7e0bc3b408e66c887f704ad47bb8c1c9fcffc5533d57950135
 size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:58ce66db74e88b1f68194d485c23157f7d0c8a9d6b255f56a99102bd66b1a145
 size 5777