Upload 10 files
Browse files
- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +2187 -3
- training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8fe5d6868ff2c81227fd2c969c46af4fb1f58973ee4e0966c10979962f78982d
 size 598635032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:47b3c36d9ca1786cd8d0139a8f5f16980ca80bc74296c03c792d092074a01113
 size 1197359627
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:de790fba095fbd174e2cbec99a78f74e882cac29a5ff6c7320d627732b666d8d
 size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:18105fb0a6af67c0e74a6e195673b14e0b259cf81cc485bb090b6d14e24299a1
 size 1465
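Each of the four diffs above is a Git LFS pointer file, so the commit only changes the artifact's sha256 digest (oid) and byte size. The sketch below is an illustrative check, not part of the commit: it assumes the checkpoint files have already been downloaded into the current directory and re-verifies the local copies against the digests and sizes recorded in the pointers.

import hashlib
import os

# Digests and sizes taken from the LFS pointers in this commit.
EXPECTED = {
    "model.safetensors": ("8fe5d6868ff2c81227fd2c969c46af4fb1f58973ee4e0966c10979962f78982d", 598635032),
    "optimizer.pt": ("47b3c36d9ca1786cd8d0139a8f5f16980ca80bc74296c03c792d092074a01113", 1197359627),
    "rng_state.pth": ("de790fba095fbd174e2cbec99a78f74e882cac29a5ff6c7320d627732b666d8d", 14645),
    "scheduler.pt": ("18105fb0a6af67c0e74a6e195673b14e0b259cf81cc485bb090b6d14e24299a1", 1465),
}

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so the ~600 MB weights never have to fit in memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

for name, (oid, size) in EXPECTED.items():
    size_ok = os.path.getsize(name) == size
    oid_ok = sha256_of(name) == oid
    print(name, "size ok" if size_ok else "SIZE MISMATCH", "sha256 ok" if oid_ok else "SHA256 MISMATCH")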
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
 "best_global_step": null,
 "best_metric": null,
 "best_model_checkpoint": null,
-"epoch": 0.
 "eval_steps": 1000,
-"global_step":
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -6957,6 +6957,2190 @@
 "eval_samples_per_second": 196.535,
 "eval_steps_per_second": 1.542,
 "step": 89000
 }
 ],
 "logging_steps": 100,
@@ -6976,7 +9160,7 @@
 "attributes": {}
 }
 },
-"total_flos":
 "train_batch_size": 128,
 "trial_name": null,
 "trial_params": null

 "best_global_step": null,
 "best_metric": null,
 "best_model_checkpoint": null,
+"epoch": 0.036280622240579596,
 "eval_steps": 1000,
+"global_step": 117000,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,

 "eval_samples_per_second": 196.535,
 "eval_steps_per_second": 1.542,
 "step": 89000
| 6960 |
+
},
|
| 6961 |
+
{
|
| 6962 |
+
"epoch": 0.00027908170954291995,
|
| 6963 |
+
"grad_norm": 1.6284213066101074,
|
| 6964 |
+
"learning_rate": 4.331808372709617e-05,
|
| 6965 |
+
"loss": 2.1737,
|
| 6966 |
+
"step": 89100
|
| 6967 |
+
},
|
| 6968 |
+
{
|
| 6969 |
+
"epoch": 0.0005581634190858399,
|
| 6970 |
+
"grad_norm": 1.5599644184112549,
|
| 6971 |
+
"learning_rate": 4.33030093580447e-05,
|
| 6972 |
+
"loss": 2.1666,
|
| 6973 |
+
"step": 89200
|
| 6974 |
+
},
|
| 6975 |
+
{
|
| 6976 |
+
"epoch": 0.0008372451286287599,
|
| 6977 |
+
"grad_norm": 1.5871340036392212,
|
| 6978 |
+
"learning_rate": 4.328792063355065e-05,
|
| 6979 |
+
"loss": 2.1656,
|
| 6980 |
+
"step": 89300
|
| 6981 |
+
},
|
| 6982 |
+
{
|
| 6983 |
+
"epoch": 0.0011163268381716798,
|
| 6984 |
+
"grad_norm": 1.655039668083191,
|
| 6985 |
+
"learning_rate": 4.327281756544842e-05,
|
| 6986 |
+
"loss": 2.169,
|
| 6987 |
+
"step": 89400
|
| 6988 |
+
},
|
| 6989 |
+
{
|
| 6990 |
+
"epoch": 0.0013954085477146,
|
| 6991 |
+
"grad_norm": 1.549782633781433,
|
| 6992 |
+
"learning_rate": 4.325770016558367e-05,
|
| 6993 |
+
"loss": 2.1748,
|
| 6994 |
+
"step": 89500
|
| 6995 |
+
},
|
| 6996 |
+
{
|
| 6997 |
+
"epoch": 0.0016744902572575198,
|
| 6998 |
+
"grad_norm": 1.6504789590835571,
|
| 6999 |
+
"learning_rate": 4.3242568445813306e-05,
|
| 7000 |
+
"loss": 2.1637,
|
| 7001 |
+
"step": 89600
|
| 7002 |
+
},
|
| 7003 |
+
{
|
| 7004 |
+
"epoch": 0.00195357196680044,
|
| 7005 |
+
"grad_norm": 1.6154310703277588,
|
| 7006 |
+
"learning_rate": 4.322742241800545e-05,
|
| 7007 |
+
"loss": 2.1582,
|
| 7008 |
+
"step": 89700
|
| 7009 |
+
},
|
| 7010 |
+
{
|
| 7011 |
+
"epoch": 0.0022326536763433596,
|
| 7012 |
+
"grad_norm": 1.611567497253418,
|
| 7013 |
+
"learning_rate": 4.321226209403947e-05,
|
| 7014 |
+
"loss": 2.1519,
|
| 7015 |
+
"step": 89800
|
| 7016 |
+
},
|
| 7017 |
+
{
|
| 7018 |
+
"epoch": 0.0025117353858862797,
|
| 7019 |
+
"grad_norm": 1.5689648389816284,
|
| 7020 |
+
"learning_rate": 4.319708748580592e-05,
|
| 7021 |
+
"loss": 2.1659,
|
| 7022 |
+
"step": 89900
|
| 7023 |
+
},
|
| 7024 |
+
{
|
| 7025 |
+
"epoch": 0.0027908170954292,
|
| 7026 |
+
"grad_norm": 1.630550503730774,
|
| 7027 |
+
"learning_rate": 4.318189860520658e-05,
|
| 7028 |
+
"loss": 2.1545,
|
| 7029 |
+
"step": 90000
|
| 7030 |
+
},
|
| 7031 |
+
{
|
| 7032 |
+
"epoch": 0.0027908170954292,
|
| 7033 |
+
"eval_loss": 2.2889482975006104,
|
| 7034 |
+
"eval_runtime": 51.8104,
|
| 7035 |
+
"eval_samples_per_second": 196.756,
|
| 7036 |
+
"eval_steps_per_second": 1.544,
|
| 7037 |
+
"step": 90000
|
| 7038 |
+
},
|
| 7039 |
+
{
|
| 7040 |
+
"epoch": 0.00306989880497212,
|
| 7041 |
+
"grad_norm": 1.5548030138015747,
|
| 7042 |
+
"learning_rate": 4.316669546415441e-05,
|
| 7043 |
+
"loss": 2.1585,
|
| 7044 |
+
"step": 90100
|
| 7045 |
+
},
|
| 7046 |
+
{
|
| 7047 |
+
"epoch": 0.0033489805145150396,
|
| 7048 |
+
"grad_norm": 1.675545573234558,
|
| 7049 |
+
"learning_rate": 4.315147807457356e-05,
|
| 7050 |
+
"loss": 2.1588,
|
| 7051 |
+
"step": 90200
|
| 7052 |
+
},
|
| 7053 |
+
{
|
| 7054 |
+
"epoch": 0.0036280622240579597,
|
| 7055 |
+
"grad_norm": 1.6402935981750488,
|
| 7056 |
+
"learning_rate": 4.313624644839936e-05,
|
| 7057 |
+
"loss": 2.1548,
|
| 7058 |
+
"step": 90300
|
| 7059 |
+
},
|
| 7060 |
+
{
|
| 7061 |
+
"epoch": 0.00390714393360088,
|
| 7062 |
+
"grad_norm": 1.6139987707138062,
|
| 7063 |
+
"learning_rate": 4.312100059757829e-05,
|
| 7064 |
+
"loss": 2.1571,
|
| 7065 |
+
"step": 90400
|
| 7066 |
+
},
|
| 7067 |
+
{
|
| 7068 |
+
"epoch": 0.0041862256431437995,
|
| 7069 |
+
"grad_norm": 1.625307321548462,
|
| 7070 |
+
"learning_rate": 4.310574053406801e-05,
|
| 7071 |
+
"loss": 2.1595,
|
| 7072 |
+
"step": 90500
|
| 7073 |
+
},
|
| 7074 |
+
{
|
| 7075 |
+
"epoch": 0.004465307352686719,
|
| 7076 |
+
"grad_norm": 1.6252094507217407,
|
| 7077 |
+
"learning_rate": 4.3090466269837304e-05,
|
| 7078 |
+
"loss": 2.1593,
|
| 7079 |
+
"step": 90600
|
| 7080 |
+
},
|
| 7081 |
+
{
|
| 7082 |
+
"epoch": 0.00474438906222964,
|
| 7083 |
+
"grad_norm": 1.6178643703460693,
|
| 7084 |
+
"learning_rate": 4.307517781686611e-05,
|
| 7085 |
+
"loss": 2.1393,
|
| 7086 |
+
"step": 90700
|
| 7087 |
+
},
|
| 7088 |
+
{
|
| 7089 |
+
"epoch": 0.005023470771772559,
|
| 7090 |
+
"grad_norm": 1.4477986097335815,
|
| 7091 |
+
"learning_rate": 4.3059875187145495e-05,
|
| 7092 |
+
"loss": 2.1526,
|
| 7093 |
+
"step": 90800
|
| 7094 |
+
},
|
| 7095 |
+
{
|
| 7096 |
+
"epoch": 0.00530255248131548,
|
| 7097 |
+
"grad_norm": 1.6491955518722534,
|
| 7098 |
+
"learning_rate": 4.3044558392677627e-05,
|
| 7099 |
+
"loss": 2.1456,
|
| 7100 |
+
"step": 90900
|
| 7101 |
+
},
|
| 7102 |
+
{
|
| 7103 |
+
"epoch": 0.0055816341908584,
|
| 7104 |
+
"grad_norm": 1.6281492710113525,
|
| 7105 |
+
"learning_rate": 4.30292274454758e-05,
|
| 7106 |
+
"loss": 2.1516,
|
| 7107 |
+
"step": 91000
|
| 7108 |
+
},
|
| 7109 |
+
{
|
| 7110 |
+
"epoch": 0.0055816341908584,
|
| 7111 |
+
"eval_loss": 2.270752191543579,
|
| 7112 |
+
"eval_runtime": 51.3993,
|
| 7113 |
+
"eval_samples_per_second": 198.329,
|
| 7114 |
+
"eval_steps_per_second": 1.556,
|
| 7115 |
+
"step": 91000
|
| 7116 |
+
},
|
| 7117 |
+
{
|
| 7118 |
+
"epoch": 0.005860715900401319,
|
| 7119 |
+
"grad_norm": 1.6257696151733398,
|
| 7120 |
+
"learning_rate": 4.301388235756442e-05,
|
| 7121 |
+
"loss": 2.1455,
|
| 7122 |
+
"step": 91100
|
| 7123 |
+
},
|
| 7124 |
+
{
|
| 7125 |
+
"epoch": 0.00613979760994424,
|
| 7126 |
+
"grad_norm": 1.6170133352279663,
|
| 7127 |
+
"learning_rate": 4.299852314097894e-05,
|
| 7128 |
+
"loss": 2.1379,
|
| 7129 |
+
"step": 91200
|
| 7130 |
+
},
|
| 7131 |
+
{
|
| 7132 |
+
"epoch": 0.0064188793194871595,
|
| 7133 |
+
"grad_norm": 1.5804738998413086,
|
| 7134 |
+
"learning_rate": 4.298314980776594e-05,
|
| 7135 |
+
"loss": 2.1415,
|
| 7136 |
+
"step": 91300
|
| 7137 |
+
},
|
| 7138 |
+
{
|
| 7139 |
+
"epoch": 0.006697961029030079,
|
| 7140 |
+
"grad_norm": 1.6626943349838257,
|
| 7141 |
+
"learning_rate": 4.2967762369983065e-05,
|
| 7142 |
+
"loss": 2.1513,
|
| 7143 |
+
"step": 91400
|
| 7144 |
+
},
|
| 7145 |
+
{
|
| 7146 |
+
"epoch": 0.006977042738573,
|
| 7147 |
+
"grad_norm": 1.5107390880584717,
|
| 7148 |
+
"learning_rate": 4.295236083969899e-05,
|
| 7149 |
+
"loss": 2.1477,
|
| 7150 |
+
"step": 91500
|
| 7151 |
+
},
|
| 7152 |
+
{
|
| 7153 |
+
"epoch": 0.0072561244481159195,
|
| 7154 |
+
"grad_norm": 1.6073352098464966,
|
| 7155 |
+
"learning_rate": 4.293694522899349e-05,
|
| 7156 |
+
"loss": 2.1468,
|
| 7157 |
+
"step": 91600
|
| 7158 |
+
},
|
| 7159 |
+
{
|
| 7160 |
+
"epoch": 0.007535206157658839,
|
| 7161 |
+
"grad_norm": 1.5642460584640503,
|
| 7162 |
+
"learning_rate": 4.292151554995734e-05,
|
| 7163 |
+
"loss": 2.1394,
|
| 7164 |
+
"step": 91700
|
| 7165 |
+
},
|
| 7166 |
+
{
|
| 7167 |
+
"epoch": 0.00781428786720176,
|
| 7168 |
+
"grad_norm": 1.4667211771011353,
|
| 7169 |
+
"learning_rate": 4.290607181469236e-05,
|
| 7170 |
+
"loss": 2.1446,
|
| 7171 |
+
"step": 91800
|
| 7172 |
+
},
|
| 7173 |
+
{
|
| 7174 |
+
"epoch": 0.00809336957674468,
|
| 7175 |
+
"grad_norm": 1.5254555940628052,
|
| 7176 |
+
"learning_rate": 4.2890614035311425e-05,
|
| 7177 |
+
"loss": 2.1516,
|
| 7178 |
+
"step": 91900
|
| 7179 |
+
},
|
| 7180 |
+
{
|
| 7181 |
+
"epoch": 0.008372451286287599,
|
| 7182 |
+
"grad_norm": 1.5663783550262451,
|
| 7183 |
+
"learning_rate": 4.2875142223938395e-05,
|
| 7184 |
+
"loss": 2.1407,
|
| 7185 |
+
"step": 92000
|
| 7186 |
+
},
|
| 7187 |
+
{
|
| 7188 |
+
"epoch": 0.008372451286287599,
|
| 7189 |
+
"eval_loss": 2.2744436264038086,
|
| 7190 |
+
"eval_runtime": 51.2413,
|
| 7191 |
+
"eval_samples_per_second": 198.941,
|
| 7192 |
+
"eval_steps_per_second": 1.561,
|
| 7193 |
+
"step": 92000
|
| 7194 |
+
},
|
| 7195 |
+
{
|
| 7196 |
+
"epoch": 0.008651532995830519,
|
| 7197 |
+
"grad_norm": 1.6381663084030151,
|
| 7198 |
+
"learning_rate": 4.285965639270814e-05,
|
| 7199 |
+
"loss": 2.1364,
|
| 7200 |
+
"step": 92100
|
| 7201 |
+
},
|
| 7202 |
+
{
|
| 7203 |
+
"epoch": 0.008930614705373438,
|
| 7204 |
+
"grad_norm": 1.5091722011566162,
|
| 7205 |
+
"learning_rate": 4.284415655376654e-05,
|
| 7206 |
+
"loss": 2.1387,
|
| 7207 |
+
"step": 92200
|
| 7208 |
+
},
|
| 7209 |
+
{
|
| 7210 |
+
"epoch": 0.00920969641491636,
|
| 7211 |
+
"grad_norm": 1.53826105594635,
|
| 7212 |
+
"learning_rate": 4.282864271927042e-05,
|
| 7213 |
+
"loss": 2.1422,
|
| 7214 |
+
"step": 92300
|
| 7215 |
+
},
|
| 7216 |
+
{
|
| 7217 |
+
"epoch": 0.00948877812445928,
|
| 7218 |
+
"grad_norm": 1.5313266515731812,
|
| 7219 |
+
"learning_rate": 4.281311490138765e-05,
|
| 7220 |
+
"loss": 2.1358,
|
| 7221 |
+
"step": 92400
|
| 7222 |
+
},
|
| 7223 |
+
{
|
| 7224 |
+
"epoch": 0.0097678598340022,
|
| 7225 |
+
"grad_norm": 1.5104668140411377,
|
| 7226 |
+
"learning_rate": 4.279757311229702e-05,
|
| 7227 |
+
"loss": 2.1324,
|
| 7228 |
+
"step": 92500
|
| 7229 |
+
},
|
| 7230 |
+
{
|
| 7231 |
+
"epoch": 0.010046941543545119,
|
| 7232 |
+
"grad_norm": 1.5808652639389038,
|
| 7233 |
+
"learning_rate": 4.278201736418828e-05,
|
| 7234 |
+
"loss": 2.1479,
|
| 7235 |
+
"step": 92600
|
| 7236 |
+
},
|
| 7237 |
+
{
|
| 7238 |
+
"epoch": 0.010326023253088039,
|
| 7239 |
+
"grad_norm": 1.5481529235839844,
|
| 7240 |
+
"learning_rate": 4.276644766926213e-05,
|
| 7241 |
+
"loss": 2.1359,
|
| 7242 |
+
"step": 92700
|
| 7243 |
+
},
|
| 7244 |
+
{
|
| 7245 |
+
"epoch": 0.01060510496263096,
|
| 7246 |
+
"grad_norm": 1.6417638063430786,
|
| 7247 |
+
"learning_rate": 4.275086403973023e-05,
|
| 7248 |
+
"loss": 2.1421,
|
| 7249 |
+
"step": 92800
|
| 7250 |
+
},
|
| 7251 |
+
{
|
| 7252 |
+
"epoch": 0.01088418667217388,
|
| 7253 |
+
"grad_norm": 1.6985423564910889,
|
| 7254 |
+
"learning_rate": 4.2735266487815156e-05,
|
| 7255 |
+
"loss": 2.1376,
|
| 7256 |
+
"step": 92900
|
| 7257 |
+
},
|
| 7258 |
+
{
|
| 7259 |
+
"epoch": 0.0111632683817168,
|
| 7260 |
+
"grad_norm": 1.7289358377456665,
|
| 7261 |
+
"learning_rate": 4.271965502575039e-05,
|
| 7262 |
+
"loss": 2.1446,
|
| 7263 |
+
"step": 93000
|
| 7264 |
+
},
|
| 7265 |
+
{
|
| 7266 |
+
"epoch": 0.0111632683817168,
|
| 7267 |
+
"eval_loss": 2.2753000259399414,
|
| 7268 |
+
"eval_runtime": 51.3478,
|
| 7269 |
+
"eval_samples_per_second": 198.529,
|
| 7270 |
+
"eval_steps_per_second": 1.558,
|
| 7271 |
+
"step": 93000
|
| 7272 |
+
},
|
| 7273 |
+
{
|
| 7274 |
+
"epoch": 0.011442350091259719,
|
| 7275 |
+
"grad_norm": 1.5933712720870972,
|
| 7276 |
+
"learning_rate": 4.2704029665780354e-05,
|
| 7277 |
+
"loss": 2.1341,
|
| 7278 |
+
"step": 93100
|
| 7279 |
+
},
|
| 7280 |
+
{
|
| 7281 |
+
"epoch": 0.011721431800802639,
|
| 7282 |
+
"grad_norm": 1.4210540056228638,
|
| 7283 |
+
"learning_rate": 4.2688390420160335e-05,
|
| 7284 |
+
"loss": 2.125,
|
| 7285 |
+
"step": 93200
|
| 7286 |
+
},
|
| 7287 |
+
{
|
| 7288 |
+
"epoch": 0.012000513510345558,
|
| 7289 |
+
"grad_norm": 1.7447474002838135,
|
| 7290 |
+
"learning_rate": 4.267273730115654e-05,
|
| 7291 |
+
"loss": 2.1259,
|
| 7292 |
+
"step": 93300
|
| 7293 |
+
},
|
| 7294 |
+
{
|
| 7295 |
+
"epoch": 0.01227959521988848,
|
| 7296 |
+
"grad_norm": 1.6314444541931152,
|
| 7297 |
+
"learning_rate": 4.265707032104603e-05,
|
| 7298 |
+
"loss": 2.1334,
|
| 7299 |
+
"step": 93400
|
| 7300 |
+
},
|
| 7301 |
+
{
|
| 7302 |
+
"epoch": 0.0125586769294314,
|
| 7303 |
+
"grad_norm": 1.6820199489593506,
|
| 7304 |
+
"learning_rate": 4.264138949211678e-05,
|
| 7305 |
+
"loss": 2.1263,
|
| 7306 |
+
"step": 93500
|
| 7307 |
+
},
|
| 7308 |
+
{
|
| 7309 |
+
"epoch": 0.012837758638974319,
|
| 7310 |
+
"grad_norm": 1.6244908571243286,
|
| 7311 |
+
"learning_rate": 4.2625694826667576e-05,
|
| 7312 |
+
"loss": 2.1313,
|
| 7313 |
+
"step": 93600
|
| 7314 |
+
},
|
| 7315 |
+
{
|
| 7316 |
+
"epoch": 0.013116840348517239,
|
| 7317 |
+
"grad_norm": 1.6561076641082764,
|
| 7318 |
+
"learning_rate": 4.260998633700809e-05,
|
| 7319 |
+
"loss": 2.1274,
|
| 7320 |
+
"step": 93700
|
| 7321 |
+
},
|
| 7322 |
+
{
|
| 7323 |
+
"epoch": 0.013395922058060158,
|
| 7324 |
+
"grad_norm": 1.5971171855926514,
|
| 7325 |
+
"learning_rate": 4.259426403545883e-05,
|
| 7326 |
+
"loss": 2.1304,
|
| 7327 |
+
"step": 93800
|
| 7328 |
+
},
|
| 7329 |
+
{
|
| 7330 |
+
"epoch": 0.013675003767603078,
|
| 7331 |
+
"grad_norm": 1.5423985719680786,
|
| 7332 |
+
"learning_rate": 4.257852793435113e-05,
|
| 7333 |
+
"loss": 2.1356,
|
| 7334 |
+
"step": 93900
|
| 7335 |
+
},
|
| 7336 |
+
{
|
| 7337 |
+
"epoch": 0.013954085477146,
|
| 7338 |
+
"grad_norm": 1.6720298528671265,
|
| 7339 |
+
"learning_rate": 4.256277804602715e-05,
|
| 7340 |
+
"loss": 2.1297,
|
| 7341 |
+
"step": 94000
|
| 7342 |
+
},
|
| 7343 |
+
{
|
| 7344 |
+
"epoch": 0.013954085477146,
|
| 7345 |
+
"eval_loss": 2.2692244052886963,
|
| 7346 |
+
"eval_runtime": 51.3308,
|
| 7347 |
+
"eval_samples_per_second": 198.594,
|
| 7348 |
+
"eval_steps_per_second": 1.559,
|
| 7349 |
+
"step": 94000
|
| 7350 |
+
},
|
| 7351 |
+
{
|
| 7352 |
+
"epoch": 0.01423316718668892,
|
| 7353 |
+
"grad_norm": 1.670507550239563,
|
| 7354 |
+
"learning_rate": 4.254701438283987e-05,
|
| 7355 |
+
"loss": 2.1326,
|
| 7356 |
+
"step": 94100
|
| 7357 |
+
},
|
| 7358 |
+
{
|
| 7359 |
+
"epoch": 0.014512248896231839,
|
| 7360 |
+
"grad_norm": 1.6407012939453125,
|
| 7361 |
+
"learning_rate": 4.253123695715307e-05,
|
| 7362 |
+
"loss": 2.1201,
|
| 7363 |
+
"step": 94200
|
| 7364 |
+
},
|
| 7365 |
+
{
|
| 7366 |
+
"epoch": 0.014791330605774759,
|
| 7367 |
+
"grad_norm": 1.6673415899276733,
|
| 7368 |
+
"learning_rate": 4.2515445781341306e-05,
|
| 7369 |
+
"loss": 2.1236,
|
| 7370 |
+
"step": 94300
|
| 7371 |
+
},
|
| 7372 |
+
{
|
| 7373 |
+
"epoch": 0.015070412315317678,
|
| 7374 |
+
"grad_norm": 1.6600122451782227,
|
| 7375 |
+
"learning_rate": 4.2499640867789955e-05,
|
| 7376 |
+
"loss": 2.1387,
|
| 7377 |
+
"step": 94400
|
| 7378 |
+
},
|
| 7379 |
+
{
|
| 7380 |
+
"epoch": 0.015349494024860598,
|
| 7381 |
+
"grad_norm": 1.669821858406067,
|
| 7382 |
+
"learning_rate": 4.248382222889515e-05,
|
| 7383 |
+
"loss": 2.1376,
|
| 7384 |
+
"step": 94500
|
| 7385 |
+
},
|
| 7386 |
+
{
|
| 7387 |
+
"epoch": 0.01562857573440352,
|
| 7388 |
+
"grad_norm": 1.6163934469223022,
|
| 7389 |
+
"learning_rate": 4.246798987706378e-05,
|
| 7390 |
+
"loss": 2.1219,
|
| 7391 |
+
"step": 94600
|
| 7392 |
+
},
|
| 7393 |
+
{
|
| 7394 |
+
"epoch": 0.015907657443946437,
|
| 7395 |
+
"grad_norm": 1.6449049711227417,
|
| 7396 |
+
"learning_rate": 4.24521438247135e-05,
|
| 7397 |
+
"loss": 2.1197,
|
| 7398 |
+
"step": 94700
|
| 7399 |
+
},
|
| 7400 |
+
{
|
| 7401 |
+
"epoch": 0.01618673915348936,
|
| 7402 |
+
"grad_norm": 1.6894525289535522,
|
| 7403 |
+
"learning_rate": 4.2436284084272706e-05,
|
| 7404 |
+
"loss": 2.1185,
|
| 7405 |
+
"step": 94800
|
| 7406 |
+
},
|
| 7407 |
+
{
|
| 7408 |
+
"epoch": 0.01646582086303228,
|
| 7409 |
+
"grad_norm": 1.6635791063308716,
|
| 7410 |
+
"learning_rate": 4.242041066818053e-05,
|
| 7411 |
+
"loss": 2.1202,
|
| 7412 |
+
"step": 94900
|
| 7413 |
+
},
|
| 7414 |
+
{
|
| 7415 |
+
"epoch": 0.016744902572575198,
|
| 7416 |
+
"grad_norm": 1.5680242776870728,
|
| 7417 |
+
"learning_rate": 4.240452358888685e-05,
|
| 7418 |
+
"loss": 2.1157,
|
| 7419 |
+
"step": 95000
|
| 7420 |
+
},
|
| 7421 |
+
{
|
| 7422 |
+
"epoch": 0.016744902572575198,
|
| 7423 |
+
"eval_loss": 2.2662878036499023,
|
| 7424 |
+
"eval_runtime": 51.3865,
|
| 7425 |
+
"eval_samples_per_second": 198.379,
|
| 7426 |
+
"eval_steps_per_second": 1.557,
|
| 7427 |
+
"step": 95000
|
| 7428 |
+
},
|
| 7429 |
+
{
|
| 7430 |
+
"epoch": 0.01702398428211812,
|
| 7431 |
+
"grad_norm": 1.6089997291564941,
|
| 7432 |
+
"learning_rate": 4.2388622858852224e-05,
|
| 7433 |
+
"loss": 2.1249,
|
| 7434 |
+
"step": 95100
|
| 7435 |
+
},
|
| 7436 |
+
{
|
| 7437 |
+
"epoch": 0.017303065991661037,
|
| 7438 |
+
"grad_norm": 1.5722119808197021,
|
| 7439 |
+
"learning_rate": 4.237270849054794e-05,
|
| 7440 |
+
"loss": 2.1169,
|
| 7441 |
+
"step": 95200
|
| 7442 |
+
},
|
| 7443 |
+
{
|
| 7444 |
+
"epoch": 0.01758214770120396,
|
| 7445 |
+
"grad_norm": 1.6253992319107056,
|
| 7446 |
+
"learning_rate": 4.2356780496455984e-05,
|
| 7447 |
+
"loss": 2.121,
|
| 7448 |
+
"step": 95300
|
| 7449 |
+
},
|
| 7450 |
+
{
|
| 7451 |
+
"epoch": 0.017861229410746877,
|
| 7452 |
+
"grad_norm": 1.7267462015151978,
|
| 7453 |
+
"learning_rate": 4.2340838889069014e-05,
|
| 7454 |
+
"loss": 2.1174,
|
| 7455 |
+
"step": 95400
|
| 7456 |
+
},
|
| 7457 |
+
{
|
| 7458 |
+
"epoch": 0.018140311120289798,
|
| 7459 |
+
"grad_norm": 1.639393925666809,
|
| 7460 |
+
"learning_rate": 4.232488368089038e-05,
|
| 7461 |
+
"loss": 2.1191,
|
| 7462 |
+
"step": 95500
|
| 7463 |
+
},
|
| 7464 |
+
{
|
| 7465 |
+
"epoch": 0.01841939282983272,
|
| 7466 |
+
"grad_norm": 1.570025086402893,
|
| 7467 |
+
"learning_rate": 4.2308914884434096e-05,
|
| 7468 |
+
"loss": 2.1208,
|
| 7469 |
+
"step": 95600
|
| 7470 |
+
},
|
| 7471 |
+
{
|
| 7472 |
+
"epoch": 0.018698474539375638,
|
| 7473 |
+
"grad_norm": 1.6924635171890259,
|
| 7474 |
+
"learning_rate": 4.2292932512224835e-05,
|
| 7475 |
+
"loss": 2.1218,
|
| 7476 |
+
"step": 95700
|
| 7477 |
+
},
|
| 7478 |
+
{
|
| 7479 |
+
"epoch": 0.01897755624891856,
|
| 7480 |
+
"grad_norm": 1.5708248615264893,
|
| 7481 |
+
"learning_rate": 4.22769365767979e-05,
|
| 7482 |
+
"loss": 2.1191,
|
| 7483 |
+
"step": 95800
|
| 7484 |
+
},
|
| 7485 |
+
{
|
| 7486 |
+
"epoch": 0.019256637958461477,
|
| 7487 |
+
"grad_norm": 1.6411412954330444,
|
| 7488 |
+
"learning_rate": 4.226092709069926e-05,
|
| 7489 |
+
"loss": 2.1017,
|
| 7490 |
+
"step": 95900
|
| 7491 |
+
},
|
| 7492 |
+
{
|
| 7493 |
+
"epoch": 0.0195357196680044,
|
| 7494 |
+
"grad_norm": 1.6997929811477661,
|
| 7495 |
+
"learning_rate": 4.224490406648548e-05,
|
| 7496 |
+
"loss": 2.1125,
|
| 7497 |
+
"step": 96000
|
| 7498 |
+
},
|
| 7499 |
+
{
|
| 7500 |
+
"epoch": 0.0195357196680044,
|
| 7501 |
+
"eval_loss": 2.2651915550231934,
|
| 7502 |
+
"eval_runtime": 51.6612,
|
| 7503 |
+
"eval_samples_per_second": 197.324,
|
| 7504 |
+
"eval_steps_per_second": 1.549,
|
| 7505 |
+
"step": 96000
|
| 7506 |
+
},
|
| 7507 |
+
{
|
| 7508 |
+
"epoch": 0.01981480137754732,
|
| 7509 |
+
"grad_norm": 1.678976058959961,
|
| 7510 |
+
"learning_rate": 4.222886751672379e-05,
|
| 7511 |
+
"loss": 2.1085,
|
| 7512 |
+
"step": 96100
|
| 7513 |
+
},
|
| 7514 |
+
{
|
| 7515 |
+
"epoch": 0.020093883087090238,
|
| 7516 |
+
"grad_norm": 1.6502765417099,
|
| 7517 |
+
"learning_rate": 4.221281745399197e-05,
|
| 7518 |
+
"loss": 2.104,
|
| 7519 |
+
"step": 96200
|
| 7520 |
+
},
|
| 7521 |
+
{
|
| 7522 |
+
"epoch": 0.02037296479663316,
|
| 7523 |
+
"grad_norm": 1.7067300081253052,
|
| 7524 |
+
"learning_rate": 4.219675389087845e-05,
|
| 7525 |
+
"loss": 2.0918,
|
| 7526 |
+
"step": 96300
|
| 7527 |
+
},
|
| 7528 |
+
{
|
| 7529 |
+
"epoch": 0.020652046506176077,
|
| 7530 |
+
"grad_norm": 1.5428402423858643,
|
| 7531 |
+
"learning_rate": 4.218067683998221e-05,
|
| 7532 |
+
"loss": 2.0912,
|
| 7533 |
+
"step": 96400
|
| 7534 |
+
},
|
| 7535 |
+
{
|
| 7536 |
+
"epoch": 0.020931128215719,
|
| 7537 |
+
"grad_norm": 1.5912175178527832,
|
| 7538 |
+
"learning_rate": 4.2164586313912844e-05,
|
| 7539 |
+
"loss": 2.1018,
|
| 7540 |
+
"step": 96500
|
| 7541 |
+
},
|
| 7542 |
+
{
|
| 7543 |
+
"epoch": 0.02121020992526192,
|
| 7544 |
+
"grad_norm": 1.6541246175765991,
|
| 7545 |
+
"learning_rate": 4.214848232529048e-05,
|
| 7546 |
+
"loss": 2.1059,
|
| 7547 |
+
"step": 96600
|
| 7548 |
+
},
|
| 7549 |
+
{
|
| 7550 |
+
"epoch": 0.021489291634804838,
|
| 7551 |
+
"grad_norm": 1.6385318040847778,
|
| 7552 |
+
"learning_rate": 4.2132364886745834e-05,
|
| 7553 |
+
"loss": 2.0992,
|
| 7554 |
+
"step": 96700
|
| 7555 |
+
},
|
| 7556 |
+
{
|
| 7557 |
+
"epoch": 0.02176837334434776,
|
| 7558 |
+
"grad_norm": 1.6574422121047974,
|
| 7559 |
+
"learning_rate": 4.2116234010920153e-05,
|
| 7560 |
+
"loss": 2.1014,
|
| 7561 |
+
"step": 96800
|
| 7562 |
+
},
|
| 7563 |
+
{
|
| 7564 |
+
"epoch": 0.022047455053890677,
|
| 7565 |
+
"grad_norm": 1.798996090888977,
|
| 7566 |
+
"learning_rate": 4.210008971046522e-05,
|
| 7567 |
+
"loss": 2.0837,
|
| 7568 |
+
"step": 96900
|
| 7569 |
+
},
|
| 7570 |
+
{
|
| 7571 |
+
"epoch": 0.0223265367634336,
|
| 7572 |
+
"grad_norm": 1.7654832601547241,
|
| 7573 |
+
"learning_rate": 4.208393199804337e-05,
|
| 7574 |
+
"loss": 2.0967,
|
| 7575 |
+
"step": 97000
|
| 7576 |
+
},
|
| 7577 |
+
{
|
| 7578 |
+
"epoch": 0.0223265367634336,
|
| 7579 |
+
"eval_loss": 2.2732906341552734,
|
| 7580 |
+
"eval_runtime": 51.5925,
|
| 7581 |
+
"eval_samples_per_second": 197.587,
|
| 7582 |
+
"eval_steps_per_second": 1.551,
|
| 7583 |
+
"step": 97000
|
| 7584 |
+
},
|
| 7585 |
+
{
|
| 7586 |
+
"epoch": 0.022605618472976517,
|
| 7587 |
+
"grad_norm": 1.659149169921875,
|
| 7588 |
+
"learning_rate": 4.206776088632744e-05,
|
| 7589 |
+
"loss": 2.0867,
|
| 7590 |
+
"step": 97100
|
| 7591 |
+
},
|
| 7592 |
+
{
|
| 7593 |
+
"epoch": 0.022884700182519438,
|
| 7594 |
+
"grad_norm": 1.6352757215499878,
|
| 7595 |
+
"learning_rate": 4.205157638800077e-05,
|
| 7596 |
+
"loss": 2.0883,
|
| 7597 |
+
"step": 97200
|
| 7598 |
+
},
|
| 7599 |
+
{
|
| 7600 |
+
"epoch": 0.02316378189206236,
|
| 7601 |
+
"grad_norm": 1.6729434728622437,
|
| 7602 |
+
"learning_rate": 4.203537851575722e-05,
|
| 7603 |
+
"loss": 2.0826,
|
| 7604 |
+
"step": 97300
|
| 7605 |
+
},
|
| 7606 |
+
{
|
| 7607 |
+
"epoch": 0.023442863601605277,
|
| 7608 |
+
"grad_norm": 1.6211354732513428,
|
| 7609 |
+
"learning_rate": 4.201916728230112e-05,
|
| 7610 |
+
"loss": 2.0862,
|
| 7611 |
+
"step": 97400
|
| 7612 |
+
},
|
| 7613 |
+
{
|
| 7614 |
+
"epoch": 0.0237219453111482,
|
| 7615 |
+
"grad_norm": 1.6426453590393066,
|
| 7616 |
+
"learning_rate": 4.20029427003473e-05,
|
| 7617 |
+
"loss": 2.1016,
|
| 7618 |
+
"step": 97500
|
| 7619 |
+
},
|
| 7620 |
+
{
|
| 7621 |
+
"epoch": 0.024001027020691117,
|
| 7622 |
+
"grad_norm": 1.7282731533050537,
|
| 7623 |
+
"learning_rate": 4.198670478262103e-05,
|
| 7624 |
+
"loss": 2.0897,
|
| 7625 |
+
"step": 97600
|
| 7626 |
+
},
|
| 7627 |
+
{
|
| 7628 |
+
"epoch": 0.024280108730234038,
|
| 7629 |
+
"grad_norm": 1.6720244884490967,
|
| 7630 |
+
"learning_rate": 4.1970453541858075e-05,
|
| 7631 |
+
"loss": 2.0887,
|
| 7632 |
+
"step": 97700
|
| 7633 |
+
},
|
| 7634 |
+
{
|
| 7635 |
+
"epoch": 0.02455919043977696,
|
| 7636 |
+
"grad_norm": 1.6182509660720825,
|
| 7637 |
+
"learning_rate": 4.195418899080462e-05,
|
| 7638 |
+
"loss": 2.0941,
|
| 7639 |
+
"step": 97800
|
| 7640 |
+
},
|
| 7641 |
+
{
|
| 7642 |
+
"epoch": 0.024838272149319877,
|
| 7643 |
+
"grad_norm": 1.5953019857406616,
|
| 7644 |
+
"learning_rate": 4.19379111422173e-05,
|
| 7645 |
+
"loss": 2.0799,
|
| 7646 |
+
"step": 97900
|
| 7647 |
+
},
|
| 7648 |
+
{
|
| 7649 |
+
"epoch": 0.0251173538588628,
|
| 7650 |
+
"grad_norm": 1.6194970607757568,
|
| 7651 |
+
"learning_rate": 4.1921620008863193e-05,
|
| 7652 |
+
"loss": 2.0915,
|
| 7653 |
+
"step": 98000
|
| 7654 |
+
},
|
| 7655 |
+
{
|
| 7656 |
+
"epoch": 0.0251173538588628,
|
| 7657 |
+
"eval_loss": 2.269259452819824,
|
| 7658 |
+
"eval_runtime": 51.5672,
|
| 7659 |
+
"eval_samples_per_second": 197.684,
|
| 7660 |
+
"eval_steps_per_second": 1.551,
|
| 7661 |
+
"step": 98000
|
| 7662 |
+
},
|
| 7663 |
+
{
|
| 7664 |
+
"epoch": 0.025396435568405717,
|
| 7665 |
+
"grad_norm": 1.6391347646713257,
|
| 7666 |
+
"learning_rate": 4.1905315603519765e-05,
|
| 7667 |
+
"loss": 2.0761,
|
| 7668 |
+
"step": 98100
|
| 7669 |
+
},
|
| 7670 |
+
{
|
| 7671 |
+
"epoch": 0.025675517277948638,
|
| 7672 |
+
"grad_norm": 1.6369253396987915,
|
| 7673 |
+
"learning_rate": 4.1888997938974935e-05,
|
| 7674 |
+
"loss": 2.0862,
|
| 7675 |
+
"step": 98200
|
| 7676 |
+
},
|
| 7677 |
+
{
|
| 7678 |
+
"epoch": 0.025954598987491556,
|
| 7679 |
+
"grad_norm": 1.628437876701355,
|
| 7680 |
+
"learning_rate": 4.187266702802698e-05,
|
| 7681 |
+
"loss": 2.0835,
|
| 7682 |
+
"step": 98300
|
| 7683 |
+
},
|
| 7684 |
+
{
|
| 7685 |
+
"epoch": 0.026233680697034478,
|
| 7686 |
+
"grad_norm": 1.643228530883789,
|
| 7687 |
+
"learning_rate": 4.1856322883484584e-05,
|
| 7688 |
+
"loss": 2.078,
|
| 7689 |
+
"step": 98400
|
| 7690 |
+
},
|
| 7691 |
+
{
|
| 7692 |
+
"epoch": 0.0265127624065774,
|
| 7693 |
+
"grad_norm": 1.6307592391967773,
|
| 7694 |
+
"learning_rate": 4.183996551816681e-05,
|
| 7695 |
+
"loss": 2.0703,
|
| 7696 |
+
"step": 98500
|
| 7697 |
+
},
|
| 7698 |
+
{
|
| 7699 |
+
"epoch": 0.026791844116120317,
|
| 7700 |
+
"grad_norm": 1.6314188241958618,
|
| 7701 |
+
"learning_rate": 4.18235949449031e-05,
|
| 7702 |
+
"loss": 2.0926,
|
| 7703 |
+
"step": 98600
|
| 7704 |
+
},
|
| 7705 |
+
{
|
| 7706 |
+
"epoch": 0.02707092582566324,
|
| 7707 |
+
"grad_norm": 1.5619598627090454,
|
| 7708 |
+
"learning_rate": 4.180721117653323e-05,
|
| 7709 |
+
"loss": 2.0827,
|
| 7710 |
+
"step": 98700
|
| 7711 |
+
},
|
| 7712 |
+
{
|
| 7713 |
+
"epoch": 0.027350007535206156,
|
| 7714 |
+
"grad_norm": 1.5566825866699219,
|
| 7715 |
+
"learning_rate": 4.179081422590736e-05,
|
| 7716 |
+
"loss": 2.0754,
|
| 7717 |
+
"step": 98800
|
| 7718 |
+
},
|
| 7719 |
+
{
|
| 7720 |
+
"epoch": 0.027629089244749078,
|
| 7721 |
+
"grad_norm": 1.621233344078064,
|
| 7722 |
+
"learning_rate": 4.177440410588596e-05,
|
| 7723 |
+
"loss": 2.0702,
|
| 7724 |
+
"step": 98900
|
| 7725 |
+
},
|
| 7726 |
+
{
|
| 7727 |
+
"epoch": 0.027908170954292,
|
| 7728 |
+
"grad_norm": 1.6783591508865356,
|
| 7729 |
+
"learning_rate": 4.1757980829339826e-05,
|
| 7730 |
+
"loss": 2.0691,
|
| 7731 |
+
"step": 99000
|
| 7732 |
+
},
|
| 7733 |
+
{
|
| 7734 |
+
"epoch": 0.027908170954292,
|
| 7735 |
+
"eval_loss": 2.2590763568878174,
|
| 7736 |
+
"eval_runtime": 51.6452,
|
| 7737 |
+
"eval_samples_per_second": 197.385,
|
| 7738 |
+
"eval_steps_per_second": 1.549,
|
| 7739 |
+
"step": 99000
|
| 7740 |
+
},
|
| 7741 |
+
{
|
| 7742 |
+
"epoch": 0.028187252663834917,
|
| 7743 |
+
"grad_norm": 1.6113650798797607,
|
| 7744 |
+
"learning_rate": 4.1741544409150104e-05,
|
| 7745 |
+
"loss": 2.0677,
|
| 7746 |
+
"step": 99100
|
| 7747 |
+
},
|
| 7748 |
+
{
|
| 7749 |
+
"epoch": 0.02846633437337784,
|
| 7750 |
+
"grad_norm": 1.6120631694793701,
|
| 7751 |
+
"learning_rate": 4.172509485820823e-05,
|
| 7752 |
+
"loss": 2.0784,
|
| 7753 |
+
"step": 99200
|
| 7754 |
+
},
|
| 7755 |
+
{
|
| 7756 |
+
"epoch": 0.028745416082920756,
|
| 7757 |
+
"grad_norm": 1.603555679321289,
|
| 7758 |
+
"learning_rate": 4.170863218941593e-05,
|
| 7759 |
+
"loss": 2.0685,
|
| 7760 |
+
"step": 99300
|
| 7761 |
+
},
|
| 7762 |
+
{
|
| 7763 |
+
"epoch": 0.029024497792463678,
|
| 7764 |
+
"grad_norm": 1.5876059532165527,
|
| 7765 |
+
"learning_rate": 4.1692156415685234e-05,
|
| 7766 |
+
"loss": 2.0694,
|
| 7767 |
+
"step": 99400
|
| 7768 |
+
},
|
| 7769 |
+
{
|
| 7770 |
+
"epoch": 0.0293035795020066,
|
| 7771 |
+
"grad_norm": 1.5650913715362549,
|
| 7772 |
+
"learning_rate": 4.167566754993844e-05,
|
| 7773 |
+
"loss": 2.0714,
|
| 7774 |
+
"step": 99500
|
| 7775 |
+
},
|
| 7776 |
+
{
|
| 7777 |
+
"epoch": 0.029582661211549517,
|
| 7778 |
+
"grad_norm": 1.6612671613693237,
|
| 7779 |
+
"learning_rate": 4.1659165605108134e-05,
|
| 7780 |
+
"loss": 2.0754,
|
| 7781 |
+
"step": 99600
|
| 7782 |
+
},
|
| 7783 |
+
{
|
| 7784 |
+
"epoch": 0.02986174292109244,
|
| 7785 |
+
"grad_norm": 1.6820577383041382,
|
| 7786 |
+
"learning_rate": 4.1642650594137116e-05,
|
| 7787 |
+
"loss": 2.0686,
|
| 7788 |
+
"step": 99700
|
| 7789 |
+
},
|
| 7790 |
+
{
|
| 7791 |
+
"epoch": 0.030140824630635357,
|
| 7792 |
+
"grad_norm": 1.6811972856521606,
|
| 7793 |
+
"learning_rate": 4.162612252997849e-05,
|
| 7794 |
+
"loss": 2.0719,
|
| 7795 |
+
"step": 99800
|
| 7796 |
+
},
|
| 7797 |
+
{
|
| 7798 |
+
"epoch": 0.030419906340178278,
|
| 7799 |
+
"grad_norm": 1.6226907968521118,
|
| 7800 |
+
"learning_rate": 4.160958142559556e-05,
|
| 7801 |
+
"loss": 2.0654,
|
| 7802 |
+
"step": 99900
|
| 7803 |
+
},
|
| 7804 |
+
{
|
| 7805 |
+
"epoch": 0.030698988049721196,
|
| 7806 |
+
"grad_norm": 1.5367672443389893,
|
| 7807 |
+
"learning_rate": 4.159302729396186e-05,
|
| 7808 |
+
"loss": 2.077,
|
| 7809 |
+
"step": 100000
|
| 7810 |
+
},
|
| 7811 |
+
{
|
| 7812 |
+
"epoch": 0.030698988049721196,
|
| 7813 |
+
"eval_loss": 2.2724859714508057,
|
| 7814 |
+
"eval_runtime": 51.6394,
|
| 7815 |
+
"eval_samples_per_second": 197.407,
|
| 7816 |
+
"eval_steps_per_second": 1.549,
|
| 7817 |
+
"step": 100000
|
| 7818 |
+
},
|
| 7819 |
+
{
|
| 7820 |
+
"epoch": 0.030978069759264117,
|
| 7821 |
+
"grad_norm": 1.6159389019012451,
|
| 7822 |
+
"learning_rate": 4.157646014806117e-05,
|
| 7823 |
+
"loss": 2.0694,
|
| 7824 |
+
"step": 100100
|
| 7825 |
+
},
|
| 7826 |
+
{
|
| 7827 |
+
"epoch": 0.03125715146880704,
|
| 7828 |
+
"grad_norm": 1.6105570793151855,
|
| 7829 |
+
"learning_rate": 4.155988000088745e-05,
|
| 7830 |
+
"loss": 2.0527,
|
| 7831 |
+
"step": 100200
|
| 7832 |
+
},
|
| 7833 |
+
{
|
| 7834 |
+
"epoch": 0.03153623317834996,
|
| 7835 |
+
"grad_norm": 1.6409541368484497,
|
| 7836 |
+
"learning_rate": 4.1543286865444856e-05,
|
| 7837 |
+
"loss": 2.0671,
|
| 7838 |
+
"step": 100300
|
| 7839 |
+
},
|
| 7840 |
+
{
|
| 7841 |
+
"epoch": 0.031815314887892875,
|
| 7842 |
+
"grad_norm": 1.6089441776275635,
|
| 7843 |
+
"learning_rate": 4.152668075474775e-05,
|
| 7844 |
+
"loss": 2.0659,
|
| 7845 |
+
"step": 100400
|
| 7846 |
+
},
|
| 7847 |
+
{
|
| 7848 |
+
"epoch": 0.0320943965974358,
|
| 7849 |
+
"grad_norm": 1.6299337148666382,
|
| 7850 |
+
"learning_rate": 4.151006168182065e-05,
|
| 7851 |
+
"loss": 2.0634,
|
| 7852 |
+
"step": 100500
|
| 7853 |
+
},
|
| 7854 |
+
{
|
| 7855 |
+
"epoch": 0.03237347830697872,
|
| 7856 |
+
"grad_norm": 1.5447492599487305,
|
| 7857 |
+
"learning_rate": 4.1493429659698266e-05,
|
| 7858 |
+
"loss": 2.0709,
|
| 7859 |
+
"step": 100600
|
| 7860 |
+
},
|
| 7861 |
+
{
|
| 7862 |
+
"epoch": 0.032652560016521635,
|
| 7863 |
+
"grad_norm": 1.6599881649017334,
|
| 7864 |
+
"learning_rate": 4.147678470142544e-05,
|
| 7865 |
+
"loss": 2.0663,
|
| 7866 |
+
"step": 100700
|
| 7867 |
+
},
|
| 7868 |
+
{
|
| 7869 |
+
"epoch": 0.03293164172606456,
|
| 7870 |
+
"grad_norm": 1.6231052875518799,
|
| 7871 |
+
"learning_rate": 4.146012682005717e-05,
|
| 7872 |
+
"loss": 2.0691,
|
| 7873 |
+
"step": 100800
|
| 7874 |
+
},
|
| 7875 |
+
{
|
| 7876 |
+
"epoch": 0.03321072343560748,
|
| 7877 |
+
"grad_norm": 1.67854642868042,
|
| 7878 |
+
"learning_rate": 4.144345602865859e-05,
|
| 7879 |
+
"loss": 2.0569,
|
| 7880 |
+
"step": 100900
|
| 7881 |
+
},
|
| 7882 |
+
{
|
| 7883 |
+
"epoch": 0.033489805145150396,
|
| 7884 |
+
"grad_norm": 1.655781626701355,
|
| 7885 |
+
"learning_rate": 4.1426772340304964e-05,
|
| 7886 |
+
"loss": 2.0546,
|
| 7887 |
+
"step": 101000
|
| 7888 |
+
},
|
| 7889 |
+
{
|
| 7890 |
+
"epoch": 0.033489805145150396,
|
| 7891 |
+
"eval_loss": 2.275222063064575,
|
| 7892 |
+
"eval_runtime": 51.7111,
|
| 7893 |
+
"eval_samples_per_second": 197.134,
|
| 7894 |
+
"eval_steps_per_second": 1.547,
|
| 7895 |
+
"step": 101000
|
| 7896 |
+
},
|
| 7897 |
+
{
|
| 7898 |
+
"epoch": 0.033768886854693314,
|
| 7899 |
+
"grad_norm": 1.6492592096328735,
|
| 7900 |
+
"learning_rate": 4.141007576808166e-05,
|
| 7901 |
+
"loss": 2.0605,
|
| 7902 |
+
"step": 101100
|
| 7903 |
+
},
|
| 7904 |
+
{
|
| 7905 |
+
"epoch": 0.03404796856423624,
|
| 7906 |
+
"grad_norm": 1.5914729833602905,
|
| 7907 |
+
"learning_rate": 4.139336632508415e-05,
|
| 7908 |
+
"loss": 2.0417,
|
| 7909 |
+
"step": 101200
|
| 7910 |
+
},
|
| 7911 |
+
{
|
| 7912 |
+
"epoch": 0.03432705027377916,
|
| 7913 |
+
"grad_norm": 1.6000899076461792,
|
| 7914 |
+
"learning_rate": 4.1376644024418035e-05,
|
| 7915 |
+
"loss": 2.0621,
|
| 7916 |
+
"step": 101300
|
| 7917 |
+
},
|
| 7918 |
+
{
|
| 7919 |
+
"epoch": 0.034606131983322075,
|
| 7920 |
+
"grad_norm": 1.5423117876052856,
|
| 7921 |
+
"learning_rate": 4.135990887919894e-05,
|
| 7922 |
+
"loss": 2.0709,
|
| 7923 |
+
"step": 101400
|
| 7924 |
+
},
|
| 7925 |
+
{
|
| 7926 |
+
"epoch": 0.034885213692865,
|
| 7927 |
+
"grad_norm": 1.5954508781433105,
|
| 7928 |
+
"learning_rate": 4.134316090255263e-05,
|
| 7929 |
+
"loss": 2.0585,
|
| 7930 |
+
"step": 101500
|
| 7931 |
+
},
|
| 7932 |
+
{
|
| 7933 |
+
"epoch": 0.03516429540240792,
|
| 7934 |
+
"grad_norm": 1.603651523590088,
|
| 7935 |
+
"learning_rate": 4.1326400107614877e-05,
|
| 7936 |
+
"loss": 2.0688,
|
| 7937 |
+
"step": 101600
|
| 7938 |
+
},
|
| 7939 |
+
{
|
| 7940 |
+
"epoch": 0.035443377111950836,
|
| 7941 |
+
"grad_norm": 1.6470582485198975,
|
| 7942 |
+
"learning_rate": 4.130962650753154e-05,
|
| 7943 |
+
"loss": 2.0548,
|
| 7944 |
+
"step": 101700
|
| 7945 |
+
},
|
| 7946 |
+
{
|
| 7947 |
+
"epoch": 0.035722458821493754,
|
| 7948 |
+
"grad_norm": 1.6013911962509155,
|
| 7949 |
+
"learning_rate": 4.129284011545852e-05,
|
| 7950 |
+
"loss": 2.0502,
|
| 7951 |
+
"step": 101800
|
| 7952 |
+
},
|
| 7953 |
+
{
|
| 7954 |
+
"epoch": 0.03600154053103668,
|
| 7955 |
+
"grad_norm": 1.6595914363861084,
|
| 7956 |
+
"learning_rate": 4.127604094456174e-05,
|
| 7957 |
+
"loss": 2.0653,
|
| 7958 |
+
"step": 101900
|
| 7959 |
+
},
|
| 7960 |
+
{
|
| 7961 |
+
"epoch": 0.036280622240579596,
|
| 7962 |
+
"grad_norm": 1.6441375017166138,
|
| 7963 |
+
"learning_rate": 4.125922900801715e-05,
|
| 7964 |
+
"loss": 2.0181,
|
| 7965 |
+
"step": 102000
|
| 7966 |
+
},
|
| 7967 |
+
{
|
| 7968 |
+
"epoch": 0.036280622240579596,
|
| 7969 |
+
"eval_loss": 2.27272891998291,
|
| 7970 |
+
"eval_runtime": 51.6797,
|
| 7971 |
+
"eval_samples_per_second": 197.253,
|
| 7972 |
+
"eval_steps_per_second": 1.548,
|
| 7973 |
+
"step": 102000
|
| 7974 |
+
},
|
| 7975 |
+
{
|
| 7976 |
+
"epoch": 0.036559703950122514,
|
| 7977 |
+
"grad_norm": 1.5254052877426147,
|
| 7978 |
+
"learning_rate": 4.124240431901071e-05,
|
| 7979 |
+
"loss": 2.0585,
|
| 7980 |
+
"step": 102100
|
| 7981 |
+
},
|
| 7982 |
+
{
|
| 7983 |
+
"epoch": 0.03683878565966544,
|
| 7984 |
+
"grad_norm": 1.5843355655670166,
|
| 7985 |
+
"learning_rate": 4.1225566890738384e-05,
|
| 7986 |
+
"loss": 2.059,
|
| 7987 |
+
"step": 102200
|
| 7988 |
+
},
|
| 7989 |
+
{
|
| 7990 |
+
"epoch": 0.03711786736920836,
|
| 7991 |
+
"grad_norm": 1.64484441280365,
|
| 7992 |
+
"learning_rate": 4.120871673640613e-05,
|
| 7993 |
+
"loss": 2.0522,
|
| 7994 |
+
"step": 102300
|
| 7995 |
+
},
|
| 7996 |
+
{
|
| 7997 |
+
"epoch": 0.037396949078751275,
|
| 7998 |
+
"grad_norm": 1.7020013332366943,
|
| 7999 |
+
"learning_rate": 4.119185386922988e-05,
|
| 8000 |
+
"loss": 2.0519,
|
| 8001 |
+
"step": 102400
|
| 8002 |
+
},
|
| 8003 |
+
{
|
| 8004 |
+
"epoch": 0.0376760307882942,
|
| 8005 |
+
"grad_norm": 1.6111907958984375,
|
| 8006 |
+
"learning_rate": 4.117497830243555e-05,
|
| 8007 |
+
"loss": 2.0563,
|
| 8008 |
+
"step": 102500
|
| 8009 |
+
},
|
| 8010 |
+
{
|
| 8011 |
+
"epoch": 0.03795511249783712,
|
| 8012 |
+
"grad_norm": 1.5560317039489746,
|
| 8013 |
+
"learning_rate": 4.1158090049259005e-05,
|
| 8014 |
+
"loss": 2.0476,
|
| 8015 |
+
"step": 102600
|
| 8016 |
+
},
|
| 8017 |
+
{
|
| 8018 |
+
"epoch": 0.038234194207380036,
|
| 8019 |
+
"grad_norm": 1.6740190982818604,
|
| 8020 |
+
"learning_rate": 4.114118912294607e-05,
|
| 8021 |
+
"loss": 2.0654,
|
| 8022 |
+
"step": 102700
|
| 8023 |
+
},
|
| 8024 |
+
{
|
| 8025 |
+
"epoch": 0.038513275916922954,
|
| 8026 |
+
"grad_norm": 1.5790201425552368,
|
| 8027 |
+
"learning_rate": 4.1124275536752494e-05,
|
| 8028 |
+
"loss": 2.0557,
|
| 8029 |
+
"step": 102800
|
| 8030 |
+
},
|
| 8031 |
+
{
|
| 8032 |
+
"epoch": 0.03879235762646588,
|
| 8033 |
+
"grad_norm": 1.6047033071517944,
|
| 8034 |
+
"learning_rate": 4.110734930394397e-05,
|
| 8035 |
+
"loss": 2.0472,
|
| 8036 |
+
"step": 102900
|
| 8037 |
+
},
|
| 8038 |
+
{
|
| 8039 |
+
"epoch": 0.0390714393360088,
|
| 8040 |
+
"grad_norm": 1.5579067468643188,
|
| 8041 |
+
"learning_rate": 4.1090410437796104e-05,
|
| 8042 |
+
"loss": 2.054,
|
| 8043 |
+
"step": 103000
|
| 8044 |
+
},
|
| 8045 |
+
{
|
| 8046 |
+
"epoch": 0.0390714393360088,
|
| 8047 |
+
"eval_loss": 2.261636257171631,
|
| 8048 |
+
"eval_runtime": 51.6907,
|
| 8049 |
+
"eval_samples_per_second": 197.212,
|
| 8050 |
+
"eval_steps_per_second": 1.548,
|
| 8051 |
+
"step": 103000
|
| 8052 |
+
},
|
| 8053 |
+
{
|
| 8054 |
+
"epoch": 0.039350521045551715,
|
| 8055 |
+
"grad_norm": 1.5974478721618652,
|
| 8056 |
+
"learning_rate": 4.107345895159441e-05,
|
| 8057 |
+
"loss": 2.0528,
|
| 8058 |
+
"step": 103100
|
| 8059 |
+
},
|
| 8060 |
+
{
|
| 8061 |
+
"epoch": 0.03962960275509464,
|
| 8062 |
+
"grad_norm": 1.6119569540023804,
|
| 8063 |
+
"learning_rate": 4.105649485863431e-05,
|
| 8064 |
+
"loss": 2.0571,
|
| 8065 |
+
"step": 103200
|
| 8066 |
+
},
|
| 8067 |
+
{
|
| 8068 |
+
"epoch": 0.03990868446463756,
|
| 8069 |
+
"grad_norm": 1.606919527053833,
|
| 8070 |
+
"learning_rate": 4.1039518172221105e-05,
|
| 8071 |
+
"loss": 2.0516,
|
| 8072 |
+
"step": 103300
|
| 8073 |
+
},
|
| 8074 |
+
{
|
| 8075 |
+
"epoch": 0.040187766174180475,
|
| 8076 |
+
"grad_norm": 1.700379490852356,
|
| 8077 |
+
"learning_rate": 4.1022528905669954e-05,
|
| 8078 |
+
"loss": 2.0405,
|
| 8079 |
+
"step": 103400
|
| 8080 |
+
},
|
| 8081 |
+
{
|
| 8082 |
+
"epoch": 0.04046684788372339,
|
| 8083 |
+
"grad_norm": 1.5789330005645752,
|
| 8084 |
+
"learning_rate": 4.100552707230593e-05,
|
| 8085 |
+
"loss": 2.0551,
|
| 8086 |
+
"step": 103500
|
| 8087 |
+
},
|
| 8088 |
+
{
|
| 8089 |
+
"epoch": 0.04074592959326632,
|
| 8090 |
+
"grad_norm": 1.5743343830108643,
|
| 8091 |
+
"learning_rate": 4.098851268546392e-05,
|
| 8092 |
+
"loss": 2.0558,
|
| 8093 |
+
"step": 103600
|
| 8094 |
+
},
|
| 8095 |
+
{
|
| 8096 |
+
"epoch": 0.041025011302809236,
|
| 8097 |
+
"grad_norm": 1.6383607387542725,
|
| 8098 |
+
"learning_rate": 4.097148575848868e-05,
|
| 8099 |
+
"loss": 2.0473,
|
| 8100 |
+
"step": 103700
|
| 8101 |
+
},
|
| 8102 |
+
{
|
| 8103 |
+
"epoch": 0.041304093012352154,
|
| 8104 |
+
"grad_norm": 1.5813179016113281,
|
| 8105 |
+
"learning_rate": 4.095444630473478e-05,
|
| 8106 |
+
"loss": 2.0462,
|
| 8107 |
+
"step": 103800
|
| 8108 |
+
},
|
| 8109 |
+
{
|
| 8110 |
+
"epoch": 0.04158317472189508,
|
| 8111 |
+
"grad_norm": 1.5996896028518677,
|
| 8112 |
+
"learning_rate": 4.093739433756665e-05,
|
| 8113 |
+
"loss": 2.0445,
|
| 8114 |
+
"step": 103900
|
| 8115 |
+
},
|
| 8116 |
+
{
|
| 8117 |
+
"epoch": 0.041862256431438,
|
| 8118 |
+
"grad_norm": 1.627163290977478,
|
| 8119 |
+
"learning_rate": 4.09203298703585e-05,
|
| 8120 |
+
"loss": 2.0512,
|
| 8121 |
+
"step": 104000
|
| 8122 |
+
},
|
| 8123 |
+
{
|
| 8124 |
+
"epoch": 0.041862256431438,
|
| 8125 |
+
"eval_loss": 2.2715542316436768,
|
| 8126 |
+
"eval_runtime": 51.5969,
|
| 8127 |
+
"eval_samples_per_second": 197.57,
|
| 8128 |
+
"eval_steps_per_second": 1.55,
|
| 8129 |
+
"step": 104000
|
| 8130 |
+
},
|
| 8131 |
+
{
|
| 8132 |
+
"epoch": 0.00027908170954291995,
|
| 8133 |
+
"grad_norm": 1.621848225593567,
|
| 8134 |
+
"learning_rate": 4.090325291649436e-05,
|
| 8135 |
+
"loss": 2.056,
|
| 8136 |
+
"step": 104100
|
| 8137 |
+
},
|
| 8138 |
+
{
|
| 8139 |
+
"epoch": 0.0005581634190858399,
|
| 8140 |
+
"grad_norm": 1.5937111377716064,
|
| 8141 |
+
"learning_rate": 4.088616348936804e-05,
|
| 8142 |
+
"loss": 2.0527,
|
| 8143 |
+
"step": 104200
|
| 8144 |
+
},
|
| 8145 |
+
{
|
| 8146 |
+
"epoch": 0.0008372451286287599,
|
| 8147 |
+
"grad_norm": 3.054250478744507,
|
| 8148 |
+
"learning_rate": 4.0869061602383166e-05,
|
| 8149 |
+
"loss": 2.1108,
|
| 8150 |
+
"step": 104300
|
| 8151 |
+
},
|
| 8152 |
+
{
|
| 8153 |
+
"epoch": 0.0011163268381716798,
|
| 8154 |
+
"grad_norm": 1.599531888961792,
|
| 8155 |
+
"learning_rate": 4.0851947268953096e-05,
|
| 8156 |
+
"loss": 2.148,
|
| 8157 |
+
"step": 104400
|
| 8158 |
+
},
|
| 8159 |
+
{
|
| 8160 |
+
"epoch": 0.0013954085477146,
|
| 8161 |
+
"grad_norm": 1.4354475736618042,
|
| 8162 |
+
"learning_rate": 4.083482050250098e-05,
|
| 8163 |
+
"loss": 2.1317,
|
| 8164 |
+
"step": 104500
|
| 8165 |
+
},
|
| 8166 |
+
{
|
| 8167 |
+
"epoch": 0.0016744902572575198,
|
| 8168 |
+
"grad_norm": 1.6972846984863281,
|
| 8169 |
+
"learning_rate": 4.08176813164597e-05,
|
| 8170 |
+
"loss": 2.1313,
|
| 8171 |
+
"step": 104600
|
| 8172 |
+
},
|
| 8173 |
+
{
|
| 8174 |
+
"epoch": 0.00195357196680044,
|
| 8175 |
+
"grad_norm": 1.3760380744934082,
|
| 8176 |
+
"learning_rate": 4.0800529724271896e-05,
|
| 8177 |
+
"loss": 2.1105,
|
| 8178 |
+
"step": 104700
|
| 8179 |
+
},
|
| 8180 |
+
{
|
| 8181 |
+
"epoch": 0.0022326536763433596,
|
| 8182 |
+
"grad_norm": 1.5800480842590332,
|
| 8183 |
+
"learning_rate": 4.0783365739389924e-05,
|
| 8184 |
+
"loss": 2.1108,
|
| 8185 |
+
"step": 104800
|
| 8186 |
+
},
|
| 8187 |
+
{
|
| 8188 |
+
"epoch": 0.0025117353858862797,
|
| 8189 |
+
"grad_norm": 1.6185262203216553,
|
| 8190 |
+
"learning_rate": 4.076618937527585e-05,
|
| 8191 |
+
"loss": 2.0927,
|
| 8192 |
+
"step": 104900
|
| 8193 |
+
},
|
| 8194 |
+
{
|
| 8195 |
+
"epoch": 0.0027908170954292,
|
| 8196 |
+
"grad_norm": 1.666864037513733,
|
| 8197 |
+
"learning_rate": 4.07490006454015e-05,
|
| 8198 |
+
"loss": 2.096,
|
| 8199 |
+
"step": 105000
|
| 8200 |
+
},
|
| 8201 |
+
{
|
| 8202 |
+
"epoch": 0.0027908170954292,
|
| 8203 |
+
"eval_loss": 2.255873441696167,
|
| 8204 |
+
"eval_runtime": 52.3492,
|
| 8205 |
+
"eval_samples_per_second": 194.731,
|
| 8206 |
+
"eval_steps_per_second": 1.528,
|
| 8207 |
+
"step": 105000
|
| 8208 |
+
},
|
| 8209 |
+
{
|
| 8210 |
+
"epoch": 0.00306989880497212,
|
| 8211 |
+
"grad_norm": 1.558935284614563,
|
| 8212 |
+
"learning_rate": 4.0731799563248334e-05,
|
| 8213 |
+
"loss": 2.0891,
|
| 8214 |
+
"step": 105100
|
| 8215 |
+
},
|
| 8216 |
+
{
|
| 8217 |
+
"epoch": 0.0033489805145150396,
|
| 8218 |
+
"grad_norm": 1.62444269657135,
|
| 8219 |
+
"learning_rate": 4.0714586142307546e-05,
|
| 8220 |
+
"loss": 2.0738,
|
| 8221 |
+
"step": 105200
|
| 8222 |
+
},
|
| 8223 |
+
{
|
| 8224 |
+
"epoch": 0.0036280622240579597,
|
| 8225 |
+
"grad_norm": 1.6997108459472656,
|
| 8226 |
+
"learning_rate": 4.069736039607998e-05,
|
| 8227 |
+
"loss": 2.0742,
|
| 8228 |
+
"step": 105300
|
| 8229 |
+
},
|
| 8230 |
+
{
|
| 8231 |
+
"epoch": 0.00390714393360088,
|
| 8232 |
+
"grad_norm": 1.2430105209350586,
|
| 8233 |
+
"learning_rate": 4.0680122338076156e-05,
|
| 8234 |
+
"loss": 2.0892,
|
| 8235 |
+
"step": 105400
|
| 8236 |
+
},
|
| 8237 |
+
{
|
| 8238 |
+
"epoch": 0.0041862256431437995,
|
| 8239 |
+
"grad_norm": 1.6265345811843872,
|
| 8240 |
+
"learning_rate": 4.0662871981816266e-05,
|
| 8241 |
+
"loss": 2.0759,
|
| 8242 |
+
"step": 105500
|
| 8243 |
+
},
|
| 8244 |
+
{
|
| 8245 |
+
"epoch": 0.004465307352686719,
|
| 8246 |
+
"grad_norm": 1.429768681526184,
|
| 8247 |
+
"learning_rate": 4.064560934083012e-05,
|
| 8248 |
+
"loss": 2.0731,
|
| 8249 |
+
"step": 105600
|
| 8250 |
+
},
|
| 8251 |
+
{
|
| 8252 |
+
"epoch": 0.00474438906222964,
|
| 8253 |
+
"grad_norm": 1.5453941822052002,
|
| 8254 |
+
"learning_rate": 4.062833442865719e-05,
|
| 8255 |
+
"loss": 2.0334,
|
| 8256 |
+
"step": 105700
|
| 8257 |
+
},
|
| 8258 |
+
{
|
| 8259 |
+
"epoch": 0.005023470771772559,
|
| 8260 |
+
"grad_norm": 1.6710811853408813,
|
| 8261 |
+
"learning_rate": 4.061104725884654e-05,
|
| 8262 |
+
"loss": 2.0727,
|
| 8263 |
+
"step": 105800
|
| 8264 |
+
},
|
| 8265 |
+
{
|
| 8266 |
+
"epoch": 0.00530255248131548,
|
| 8267 |
+
"grad_norm": 1.668752908706665,
|
| 8268 |
+
"learning_rate": 4.0593747844956896e-05,
|
| 8269 |
+
"loss": 2.0566,
|
| 8270 |
+
"step": 105900
|
| 8271 |
+
},
|
| 8272 |
+
{
|
| 8273 |
+
"epoch": 0.0055816341908584,
|
| 8274 |
+
"grad_norm": 1.6783727407455444,
|
| 8275 |
+
"learning_rate": 4.057643620055654e-05,
|
| 8276 |
+
"loss": 2.0607,
|
| 8277 |
+
"step": 106000
|
| 8278 |
+
},
|
| 8279 |
+
{
|
| 8280 |
+
"epoch": 0.0055816341908584,
|
| 8281 |
+
"eval_loss": 2.251800537109375,
|
| 8282 |
+
"eval_runtime": 51.9592,
|
| 8283 |
+
"eval_samples_per_second": 196.192,
|
| 8284 |
+
"eval_steps_per_second": 1.54,
|
| 8285 |
+
"step": 106000
|
| 8286 |
+
},
|
| 8287 |
+
{
|
| 8288 |
+
"epoch": 0.005860715900401319,
|
| 8289 |
+
"grad_norm": 1.4173957109451294,
|
| 8290 |
+
"learning_rate": 4.055911233922338e-05,
|
| 8291 |
+
"loss": 2.0537,
|
| 8292 |
+
"step": 106100
|
| 8293 |
+
},
|
| 8294 |
+
{
|
| 8295 |
+
"epoch": 0.00613979760994424,
|
| 8296 |
+
"grad_norm": 1.6333705186843872,
|
| 8297 |
+
"learning_rate": 4.054177627454487e-05,
|
| 8298 |
+
"loss": 2.0679,
|
| 8299 |
+
"step": 106200
|
| 8300 |
+
},
|
| 8301 |
+
{
|
| 8302 |
+
"epoch": 0.0064188793194871595,
|
| 8303 |
+
"grad_norm": 1.078372836112976,
|
| 8304 |
+
"learning_rate": 4.0524428020118074e-05,
|
| 8305 |
+
"loss": 2.0599,
|
| 8306 |
+
"step": 106300
|
| 8307 |
+
},
|
| 8308 |
+
{
|
| 8309 |
+
"epoch": 0.006697961029030079,
|
| 8310 |
+
"grad_norm": 1.7360562086105347,
|
| 8311 |
+
"learning_rate": 4.0507067589549595e-05,
|
| 8312 |
+
"loss": 2.0451,
|
| 8313 |
+
"step": 106400
|
| 8314 |
+
},
|
| 8315 |
+
{
|
| 8316 |
+
"epoch": 0.006977042738573,
|
| 8317 |
+
"grad_norm": 1.0331660509109497,
|
| 8318 |
+
"learning_rate": 4.048969499645559e-05,
|
| 8319 |
+
"loss": 2.0569,
|
| 8320 |
+
"step": 106500
|
| 8321 |
+
},
|
| 8322 |
+
{
|
| 8323 |
+
"epoch": 0.0072561244481159195,
|
| 8324 |
+
"grad_norm": 1.7799084186553955,
|
| 8325 |
+
"learning_rate": 4.0472310254461765e-05,
|
| 8326 |
+
"loss": 2.0452,
|
| 8327 |
+
"step": 106600
|
| 8328 |
+
},
|
| 8329 |
+
{
|
| 8330 |
+
"epoch": 0.007535206157658839,
|
| 8331 |
+
"grad_norm": 1.6667675971984863,
|
| 8332 |
+
"learning_rate": 4.045491337720333e-05,
|
| 8333 |
+
"loss": 2.0386,
|
| 8334 |
+
"step": 106700
|
| 8335 |
+
},
|
| 8336 |
+
{
|
| 8337 |
+
"epoch": 0.00781428786720176,
|
| 8338 |
+
"grad_norm": 1.5602868795394897,
|
| 8339 |
+
"learning_rate": 4.043750437832504e-05,
|
| 8340 |
+
"loss": 2.0379,
|
| 8341 |
+
"step": 106800
|
| 8342 |
+
},
|
| 8343 |
+
{
|
| 8344 |
+
"epoch": 0.00809336957674468,
|
| 8345 |
+
"grad_norm": 1.670214295387268,
|
| 8346 |
+
"learning_rate": 4.0420083271481144e-05,
|
| 8347 |
+
"loss": 2.0362,
|
| 8348 |
+
"step": 106900
|
| 8349 |
+
},
|
| 8350 |
+
{
|
| 8351 |
+
"epoch": 0.008372451286287599,
|
| 8352 |
+
"grad_norm": 1.6006022691726685,
|
| 8353 |
+
"learning_rate": 4.040265007033538e-05,
|
| 8354 |
+
"loss": 2.042,
|
| 8355 |
+
"step": 107000
|
| 8356 |
+
},
|
| 8357 |
+
{
|
| 8358 |
+
"epoch": 0.008372451286287599,
|
| 8359 |
+
"eval_loss": 2.252915620803833,
|
| 8360 |
+
"eval_runtime": 51.9529,
|
| 8361 |
+
"eval_samples_per_second": 196.216,
|
| 8362 |
+
"eval_steps_per_second": 1.54,
|
| 8363 |
+
"step": 107000
|
| 8364 |
+
},
|
| 8365 |
+
{
|
| 8366 |
+
"epoch": 0.008651532995830519,
|
| 8367 |
+
"grad_norm": 1.5068638324737549,
|
| 8368 |
+
"learning_rate": 4.0385204788561e-05,
|
| 8369 |
+
"loss": 2.0379,
|
| 8370 |
+
"step": 107100
|
| 8371 |
+
},
|
| 8372 |
+
{
|
| 8373 |
+
"epoch": 0.008930614705373438,
|
| 8374 |
+
"grad_norm": 1.7357958555221558,
|
| 8375 |
+
"learning_rate": 4.0367747439840694e-05,
|
| 8376 |
+
"loss": 2.0289,
|
| 8377 |
+
"step": 107200
|
| 8378 |
+
},
|
| 8379 |
+
{
|
| 8380 |
+
"epoch": 0.00920969641491636,
|
| 8381 |
+
"grad_norm": 1.6686527729034424,
|
| 8382 |
+
"learning_rate": 4.0350278037866654e-05,
|
| 8383 |
+
"loss": 2.0413,
|
| 8384 |
+
"step": 107300
|
| 8385 |
+
},
|
| 8386 |
+
{
|
| 8387 |
+
"epoch": 0.00948877812445928,
|
| 8388 |
+
"grad_norm": 1.226962924003601,
|
| 8389 |
+
"learning_rate": 4.0332796596340485e-05,
|
| 8390 |
+
"loss": 2.0204,
|
| 8391 |
+
"step": 107400
|
| 8392 |
+
},
|
| 8393 |
+
{
|
| 8394 |
+
"epoch": 0.0097678598340022,
|
| 8395 |
+
"grad_norm": 1.696763515472412,
|
| 8396 |
+
"learning_rate": 4.031530312897327e-05,
|
| 8397 |
+
"loss": 2.0296,
|
| 8398 |
+
"step": 107500
|
| 8399 |
+
},
|
| 8400 |
+
{
|
| 8401 |
+
"epoch": 0.010046941543545119,
|
| 8402 |
+
"grad_norm": 1.6187881231307983,
|
| 8403 |
+
"learning_rate": 4.0297797649485515e-05,
|
| 8404 |
+
"loss": 2.0211,
|
| 8405 |
+
"step": 107600
|
| 8406 |
+
},
|
| 8407 |
+
{
|
| 8408 |
+
"epoch": 0.010326023253088039,
|
| 8409 |
+
"grad_norm": 1.082560658454895,
|
| 8410 |
+
"learning_rate": 4.028028017160712e-05,
|
| 8411 |
+
"loss": 2.0304,
|
| 8412 |
+
"step": 107700
|
| 8413 |
+
},
|
| 8414 |
+
{
|
| 8415 |
+
"epoch": 0.01060510496263096,
|
| 8416 |
+
"grad_norm": 1.551513910293579,
|
| 8417 |
+
"learning_rate": 4.026275070907744e-05,
|
| 8418 |
+
"loss": 2.0332,
|
| 8419 |
+
"step": 107800
|
| 8420 |
+
},
|
| 8421 |
+
{
|
| 8422 |
+
"epoch": 0.01088418667217388,
|
| 8423 |
+
"grad_norm": 1.658700704574585,
|
| 8424 |
+
"learning_rate": 4.024520927564521e-05,
|
| 8425 |
+
"loss": 2.0134,
|
| 8426 |
+
"step": 107900
|
| 8427 |
+
},
|
| 8428 |
+
{
|
| 8429 |
+
"epoch": 0.0111632683817168,
|
| 8430 |
+
"grad_norm": 1.734298586845398,
|
| 8431 |
+
"learning_rate": 4.022765588506854e-05,
|
| 8432 |
+
"loss": 2.0259,
|
| 8433 |
+
"step": 108000
|
| 8434 |
+
},
|
| 8435 |
+
{
|
| 8436 |
+
"epoch": 0.0111632683817168,
|
| 8437 |
+
"eval_loss": 2.248534917831421,
|
| 8438 |
+
"eval_runtime": 51.918,
|
| 8439 |
+
"eval_samples_per_second": 196.348,
|
| 8440 |
+
"eval_steps_per_second": 1.541,
|
| 8441 |
+
"step": 108000
|
| 8442 |
+
},
|
| 8443 |
+
{
|
| 8444 |
+
"epoch": 0.011442350091259719,
|
| 8445 |
+
"grad_norm": 1.3358790874481201,
|
| 8446 |
+
"learning_rate": 4.021009055111493e-05,
|
| 8447 |
+
"loss": 2.0226,
|
| 8448 |
+
"step": 108100
|
| 8449 |
+
},
|
| 8450 |
+
{
|
| 8451 |
+
"epoch": 0.011721431800802639,
|
| 8452 |
+
"grad_norm": 1.6433982849121094,
|
| 8453 |
+
"learning_rate": 4.019251328756125e-05,
|
| 8454 |
+
"loss": 2.0231,
|
| 8455 |
+
"step": 108200
|
| 8456 |
+
},
|
| 8457 |
+
{
|
| 8458 |
+
"epoch": 0.012000513510345558,
|
| 8459 |
+
"grad_norm": 1.6571450233459473,
|
| 8460 |
+
"learning_rate": 4.0174924108193734e-05,
|
| 8461 |
+
"loss": 2.0272,
|
| 8462 |
+
"step": 108300
|
| 8463 |
+
},
|
| 8464 |
+
{
|
| 8465 |
+
"epoch": 0.01227959521988848,
|
| 8466 |
+
"grad_norm": 1.5485866069793701,
|
| 8467 |
+
"learning_rate": 4.015732302680795e-05,
|
| 8468 |
+
"loss": 2.0154,
|
| 8469 |
+
"step": 108400
|
| 8470 |
+
},
|
| 8471 |
+
{
|
| 8472 |
+
"epoch": 0.0125586769294314,
|
| 8473 |
+
"grad_norm": 1.717331886291504,
|
| 8474 |
+
"learning_rate": 4.0139710057208794e-05,
|
| 8475 |
+
"loss": 2.0186,
|
| 8476 |
+
"step": 108500
|
| 8477 |
+
},
|
| 8478 |
+
{
|
| 8479 |
+
"epoch": 0.012837758638974319,
|
| 8480 |
+
"grad_norm": 1.3007159233093262,
|
| 8481 |
+
"learning_rate": 4.012208521321049e-05,
|
| 8482 |
+
"loss": 2.017,
|
| 8483 |
+
"step": 108600
|
| 8484 |
+
},
|
| 8485 |
+
{
|
| 8486 |
+
"epoch": 0.013116840348517239,
|
| 8487 |
+
"grad_norm": 1.6857043504714966,
|
| 8488 |
+
"learning_rate": 4.01044485086366e-05,
|
| 8489 |
+
"loss": 2.0183,
|
| 8490 |
+
"step": 108700
|
| 8491 |
+
},
|
| 8492 |
+
{
|
| 8493 |
+
"epoch": 0.013395922058060158,
|
| 8494 |
+
"grad_norm": 1.5786222219467163,
|
| 8495 |
+
"learning_rate": 4.0086799957319965e-05,
|
| 8496 |
+
"loss": 2.0081,
|
| 8497 |
+
"step": 108800
|
| 8498 |
+
},
|
| 8499 |
+
{
|
| 8500 |
+
"epoch": 0.013675003767603078,
|
| 8501 |
+
"grad_norm": 1.708694338798523,
|
| 8502 |
+
"learning_rate": 4.0069139573102715e-05,
|
| 8503 |
+
"loss": 2.0052,
|
| 8504 |
+
"step": 108900
|
| 8505 |
+
},
|
| 8506 |
+
{
|
| 8507 |
+
"epoch": 0.013954085477146,
|
| 8508 |
+
"grad_norm": 1.6401610374450684,
|
| 8509 |
+
"learning_rate": 4.005146736983627e-05,
|
| 8510 |
+
"loss": 1.998,
|
| 8511 |
+
"step": 109000
|
| 8512 |
+
},
|
| 8513 |
+
{
|
| 8514 |
+
"epoch": 0.013954085477146,
|
| 8515 |
+
"eval_loss": 2.2400035858154297,
|
| 8516 |
+
"eval_runtime": 52.1098,
|
| 8517 |
+
"eval_samples_per_second": 195.625,
|
| 8518 |
+
"eval_steps_per_second": 1.535,
|
| 8519 |
+
"step": 109000
|
| 8520 |
+
},
|
| 8521 |
+
{
|
| 8522 |
+
"epoch": 0.01423316718668892,
|
| 8523 |
+
"grad_norm": 1.3219704627990723,
|
| 8524 |
+
"learning_rate": 4.0033783361381324e-05,
|
| 8525 |
+
"loss": 2.0154,
|
| 8526 |
+
"step": 109100
|
| 8527 |
+
},
|
| 8528 |
+
{
|
| 8529 |
+
"epoch": 0.014512248896231839,
|
| 8530 |
+
"grad_norm": 1.7542061805725098,
|
| 8531 |
+
"learning_rate": 4.001608756160781e-05,
|
| 8532 |
+
"loss": 2.0129,
|
| 8533 |
+
"step": 109200
|
| 8534 |
+
},
|
| 8535 |
+
{
|
| 8536 |
+
"epoch": 0.014791330605774759,
|
| 8537 |
+
"grad_norm": 1.690807819366455,
|
| 8538 |
+
"learning_rate": 3.999837998439494e-05,
|
| 8539 |
+
"loss": 2.0031,
|
| 8540 |
+
"step": 109300
|
| 8541 |
+
},
|
| 8542 |
+
{
|
| 8543 |
+
"epoch": 0.015070412315317678,
|
| 8544 |
+
"grad_norm": 1.652106523513794,
|
| 8545 |
+
"learning_rate": 3.9980660643631137e-05,
|
| 8546 |
+
"loss": 2.0025,
|
| 8547 |
+
"step": 109400
|
| 8548 |
+
},
|
| 8549 |
+
{
|
| 8550 |
+
"epoch": 0.015349494024860598,
|
| 8551 |
+
"grad_norm": 1.5204631090164185,
|
| 8552 |
+
"learning_rate": 3.996292955321406e-05,
|
| 8553 |
+
"loss": 2.0024,
|
| 8554 |
+
"step": 109500
|
| 8555 |
+
},
|
| 8556 |
+
{
|
| 8557 |
+
"epoch": 0.01562857573440352,
|
| 8558 |
+
"grad_norm": 1.6795552968978882,
|
| 8559 |
+
"learning_rate": 3.9945186727050574e-05,
|
| 8560 |
+
"loss": 2.0078,
|
| 8561 |
+
"step": 109600
|
| 8562 |
+
},
|
| 8563 |
+
{
|
| 8564 |
+
"epoch": 0.015907657443946437,
|
| 8565 |
+
"grad_norm": 1.7057318687438965,
|
| 8566 |
+
"learning_rate": 3.992743217905678e-05,
|
| 8567 |
+
"loss": 2.0012,
|
| 8568 |
+
"step": 109700
|
| 8569 |
+
},
|
| 8570 |
+
{
|
| 8571 |
+
"epoch": 0.01618673915348936,
|
| 8572 |
+
"grad_norm": 1.5931191444396973,
|
| 8573 |
+
"learning_rate": 3.990966592315793e-05,
|
| 8574 |
+
"loss": 2.0042,
|
| 8575 |
+
"step": 109800
|
| 8576 |
+
},
|
| 8577 |
+
{
|
| 8578 |
+
"epoch": 0.01646582086303228,
|
| 8579 |
+
"grad_norm": 1.6618363857269287,
|
| 8580 |
+
"learning_rate": 3.989188797328851e-05,
|
| 8581 |
+
"loss": 2.0029,
|
| 8582 |
+
"step": 109900
|
| 8583 |
+
},
|
| 8584 |
+
{
|
| 8585 |
+
"epoch": 0.016744902572575198,
|
| 8586 |
+
"grad_norm": 1.6900277137756348,
|
| 8587 |
+
"learning_rate": 3.987409834339211e-05,
|
| 8588 |
+
"loss": 1.9952,
|
| 8589 |
+
"step": 110000
|
| 8590 |
+
},
|
| 8591 |
+
{
|
| 8592 |
+
"epoch": 0.016744902572575198,
|
| 8593 |
+
"eval_loss": 2.2469289302825928,
|
| 8594 |
+
"eval_runtime": 52.1385,
|
| 8595 |
+
"eval_samples_per_second": 195.518,
|
| 8596 |
+
"eval_steps_per_second": 1.534,
|
| 8597 |
+
"step": 110000
|
| 8598 |
+
},
|
| 8599 |
+
{
|
| 8600 |
+
"epoch": 0.01702398428211812,
|
| 8601 |
+
"grad_norm": 1.4866619110107422,
|
| 8602 |
+
"learning_rate": 3.985629704742153e-05,
|
| 8603 |
+
"loss": 2.0031,
|
| 8604 |
+
"step": 110100
|
| 8605 |
+
},
|
| 8606 |
+
{
|
| 8607 |
+
"epoch": 0.017303065991661037,
|
| 8608 |
+
"grad_norm": 1.759400725364685,
|
| 8609 |
+
"learning_rate": 3.9838484099338714e-05,
|
| 8610 |
+
"loss": 1.9986,
|
| 8611 |
+
"step": 110200
|
| 8612 |
+
},
|
| 8613 |
+
{
|
| 8614 |
+
"epoch": 0.01758214770120396,
|
| 8615 |
+
"grad_norm": 1.6036171913146973,
|
| 8616 |
+
"learning_rate": 3.9820659513114735e-05,
|
| 8617 |
+
"loss": 2.0032,
|
| 8618 |
+
"step": 110300
|
| 8619 |
+
},
|
| 8620 |
+
{
|
| 8621 |
+
"epoch": 0.017861229410746877,
|
| 8622 |
+
"grad_norm": 1.650638461112976,
|
| 8623 |
+
"learning_rate": 3.9802823302729806e-05,
|
| 8624 |
+
"loss": 1.9968,
|
| 8625 |
+
"step": 110400
|
| 8626 |
+
},
|
| 8627 |
+
{
|
| 8628 |
+
"epoch": 0.018140311120289798,
|
| 8629 |
+
"grad_norm": 0.8783386945724487,
|
| 8630 |
+
"learning_rate": 3.978497548217324e-05,
|
| 8631 |
+
"loss": 1.927,
|
| 8632 |
+
"step": 110500
|
| 8633 |
+
},
|
| 8634 |
+
{
|
| 8635 |
+
"epoch": 0.01841939282983272,
|
| 8636 |
+
"grad_norm": 1.0009950399398804,
|
| 8637 |
+
"learning_rate": 3.9767116065443464e-05,
|
| 8638 |
+
"loss": 1.8641,
|
| 8639 |
+
"step": 110600
|
| 8640 |
+
},
|
| 8641 |
+
{
|
| 8642 |
+
"epoch": 0.018698474539375638,
|
| 8643 |
+
"grad_norm": 1.0648219585418701,
|
| 8644 |
+
"learning_rate": 3.974924506654801e-05,
|
| 8645 |
+
"loss": 1.8177,
|
| 8646 |
+
"step": 110700
|
| 8647 |
+
},
|
| 8648 |
+
{
|
| 8649 |
+
"epoch": 0.01897755624891856,
|
| 8650 |
+
"grad_norm": 0.8869590759277344,
|
| 8651 |
+
"learning_rate": 3.9731362499503474e-05,
|
| 8652 |
+
"loss": 1.8099,
|
| 8653 |
+
"step": 110800
|
| 8654 |
+
},
|
| 8655 |
+
{
|
| 8656 |
+
"epoch": 0.019256637958461477,
|
| 8657 |
+
"grad_norm": 1.1313343048095703,
|
| 8658 |
+
"learning_rate": 3.971346837833556e-05,
|
| 8659 |
+
"loss": 1.7797,
|
| 8660 |
+
"step": 110900
|
| 8661 |
+
},
|
| 8662 |
+
{
|
| 8663 |
+
"epoch": 0.0195357196680044,
|
| 8664 |
+
"grad_norm": 0.87049400806427,
|
| 8665 |
+
"learning_rate": 3.969556271707898e-05,
|
| 8666 |
+
"loss": 1.7763,
|
| 8667 |
+
"step": 111000
|
| 8668 |
+
},
|
| 8669 |
+
{
|
| 8670 |
+
"epoch": 0.0195357196680044,
|
| 8671 |
+
"eval_loss": 2.235299587249756,
|
| 8672 |
+
"eval_runtime": 52.3725,
|
| 8673 |
+
"eval_samples_per_second": 194.644,
|
| 8674 |
+
"eval_steps_per_second": 1.528,
|
| 8675 |
+
"step": 111000
|
| 8676 |
+
},
|
| 8677 |
+
{
|
| 8678 |
+
"epoch": 0.01981480137754732,
|
| 8679 |
+
"grad_norm": 0.9816115498542786,
|
| 8680 |
+
"learning_rate": 3.967764552977754e-05,
|
| 8681 |
+
"loss": 1.7607,
|
| 8682 |
+
"step": 111100
|
| 8683 |
+
},
|
| 8684 |
+
{
|
| 8685 |
+
"epoch": 0.020093883087090238,
|
| 8686 |
+
"grad_norm": 1.1495822668075562,
|
| 8687 |
+
"learning_rate": 3.9659716830484085e-05,
|
| 8688 |
+
"loss": 1.7387,
|
| 8689 |
+
"step": 111200
|
| 8690 |
+
},
|
| 8691 |
+
{
|
| 8692 |
+
"epoch": 0.02037296479663316,
|
| 8693 |
+
"grad_norm": 1.117358684539795,
|
| 8694 |
+
"learning_rate": 3.9641776633260464e-05,
|
| 8695 |
+
"loss": 1.7432,
|
| 8696 |
+
"step": 111300
|
| 8697 |
+
},
|
| 8698 |
+
{
|
| 8699 |
+
"epoch": 0.020652046506176077,
|
| 8700 |
+
"grad_norm": 1.1056963205337524,
|
| 8701 |
+
"learning_rate": 3.962382495217757e-05,
|
| 8702 |
+
"loss": 1.7173,
|
| 8703 |
+
"step": 111400
|
| 8704 |
+
},
|
| 8705 |
+
{
|
| 8706 |
+
"epoch": 0.020931128215719,
|
| 8707 |
+
"grad_norm": 1.3292206525802612,
|
| 8708 |
+
"learning_rate": 3.960586180131528e-05,
|
| 8709 |
+
"loss": 1.7183,
|
| 8710 |
+
"step": 111500
|
| 8711 |
+
},
|
| 8712 |
+
{
|
| 8713 |
+
"epoch": 0.02121020992526192,
|
| 8714 |
+
"grad_norm": 1.182369589805603,
|
| 8715 |
+
"learning_rate": 3.9587887194762485e-05,
|
| 8716 |
+
"loss": 1.7172,
|
| 8717 |
+
"step": 111600
|
| 8718 |
+
},
|
| 8719 |
+
{
|
| 8720 |
+
"epoch": 0.021489291634804838,
|
| 8721 |
+
"grad_norm": 0.8990152478218079,
|
| 8722 |
+
"learning_rate": 3.956990114661705e-05,
|
| 8723 |
+
"loss": 1.6941,
|
| 8724 |
+
"step": 111700
|
| 8725 |
+
},
|
| 8726 |
+
{
|
| 8727 |
+
"epoch": 0.02176837334434776,
|
| 8728 |
+
"grad_norm": 1.1310220956802368,
|
| 8729 |
+
"learning_rate": 3.955190367098582e-05,
|
| 8730 |
+
"loss": 1.6928,
|
| 8731 |
+
"step": 111800
|
| 8732 |
+
},
|
| 8733 |
+
{
|
| 8734 |
+
"epoch": 0.022047455053890677,
|
| 8735 |
+
"grad_norm": 0.880366325378418,
|
| 8736 |
+
"learning_rate": 3.9533894781984606e-05,
|
| 8737 |
+
"loss": 1.6886,
|
| 8738 |
+
"step": 111900
|
| 8739 |
+
},
|
| 8740 |
+
{
|
| 8741 |
+
"epoch": 0.0223265367634336,
|
| 8742 |
+
"grad_norm": 0.8530257940292358,
|
| 8743 |
+
"learning_rate": 3.951587449373816e-05,
|
| 8744 |
+
"loss": 1.6671,
|
| 8745 |
+
"step": 112000
|
| 8746 |
+
},
|
| 8747 |
+
{
|
| 8748 |
+
"epoch": 0.0223265367634336,
|
| 8749 |
+
"eval_loss": 2.2474164962768555,
|
| 8750 |
+
"eval_runtime": 52.3853,
|
| 8751 |
+
"eval_samples_per_second": 194.597,
|
| 8752 |
+
"eval_steps_per_second": 1.527,
|
| 8753 |
+
"step": 112000
|
| 8754 |
+
},
|
| 8755 |
+
{
|
| 8756 |
+
"epoch": 0.022605618472976517,
|
| 8757 |
+
"grad_norm": 1.0223313570022583,
|
| 8758 |
+
"learning_rate": 3.949784282038018e-05,
|
| 8759 |
+
"loss": 1.6847,
|
| 8760 |
+
"step": 112100
|
| 8761 |
+
},
|
| 8762 |
+
{
|
| 8763 |
+
"epoch": 0.022884700182519438,
|
| 8764 |
+
"grad_norm": 0.878471314907074,
|
| 8765 |
+
"learning_rate": 3.9479799776053306e-05,
|
| 8766 |
+
"loss": 1.6647,
|
| 8767 |
+
"step": 112200
|
| 8768 |
+
},
|
| 8769 |
+
{
|
| 8770 |
+
"epoch": 0.02316378189206236,
|
| 8771 |
+
"grad_norm": 1.2385903596878052,
|
| 8772 |
+
"learning_rate": 3.9461745374909066e-05,
|
| 8773 |
+
"loss": 1.6462,
|
| 8774 |
+
"step": 112300
|
| 8775 |
+
},
|
| 8776 |
+
{
|
| 8777 |
+
"epoch": 0.023442863601605277,
|
| 8778 |
+
"grad_norm": 1.071331262588501,
|
| 8779 |
+
"learning_rate": 3.9443679631107924e-05,
|
| 8780 |
+
"loss": 1.641,
|
| 8781 |
+
"step": 112400
|
| 8782 |
+
},
|
| 8783 |
+
{
|
| 8784 |
+
"epoch": 0.0237219453111482,
|
| 8785 |
+
"grad_norm": 0.8975522518157959,
|
| 8786 |
+
"learning_rate": 3.942560255881922e-05,
|
| 8787 |
+
"loss": 1.6262,
|
| 8788 |
+
"step": 112500
|
| 8789 |
+
},
|
| 8790 |
+
{
|
| 8791 |
+
"epoch": 0.024001027020691117,
|
| 8792 |
+
"grad_norm": 0.8427426218986511,
|
| 8793 |
+
"learning_rate": 3.94075141722212e-05,
|
| 8794 |
+
"loss": 1.6467,
|
| 8795 |
+
"step": 112600
|
| 8796 |
+
},
|
| 8797 |
+
{
|
| 8798 |
+
"epoch": 0.024280108730234038,
|
| 8799 |
+
"grad_norm": 0.8209208250045776,
|
| 8800 |
+
"learning_rate": 3.938941448550098e-05,
|
| 8801 |
+
"loss": 1.6257,
|
| 8802 |
+
"step": 112700
|
| 8803 |
+
},
|
| 8804 |
+
{
|
| 8805 |
+
"epoch": 0.02455919043977696,
|
| 8806 |
+
"grad_norm": 0.8760822415351868,
|
| 8807 |
+
"learning_rate": 3.937130351285452e-05,
|
| 8808 |
+
"loss": 1.6292,
|
| 8809 |
+
"step": 112800
|
| 8810 |
+
},
|
| 8811 |
+
{
|
| 8812 |
+
"epoch": 0.024838272149319877,
|
| 8813 |
+
"grad_norm": 0.9947307705879211,
|
| 8814 |
+
"learning_rate": 3.935318126848664e-05,
|
| 8815 |
+
"loss": 1.6358,
|
| 8816 |
+
"step": 112900
|
| 8817 |
+
},
|
| 8818 |
+
{
|
| 8819 |
+
"epoch": 0.0251173538588628,
|
| 8820 |
+
"grad_norm": 1.6038973331451416,
|
| 8821 |
+
"learning_rate": 3.933504776661102e-05,
|
| 8822 |
+
"loss": 1.8923,
|
| 8823 |
+
"step": 113000
|
| 8824 |
+
},
|
| 8825 |
+
{
|
| 8826 |
+
"epoch": 0.0251173538588628,
|
| 8827 |
+
"eval_loss": 2.267655849456787,
|
| 8828 |
+
"eval_runtime": 51.8322,
|
| 8829 |
+
"eval_samples_per_second": 196.673,
|
| 8830 |
+
"eval_steps_per_second": 1.543,
|
| 8831 |
+
"step": 113000
|
| 8832 |
+
},
|
| 8833 |
+
{
|
| 8834 |
+
"epoch": 0.025396435568405717,
|
| 8835 |
+
"grad_norm": 1.4726253747940063,
|
| 8836 |
+
"learning_rate": 3.931690302145014e-05,
|
| 8837 |
+
"loss": 1.9579,
|
| 8838 |
+
"step": 113100
|
| 8839 |
+
},
|
| 8840 |
+
{
|
| 8841 |
+
"epoch": 0.025675517277948638,
|
| 8842 |
+
"grad_norm": 1.4382061958312988,
|
| 8843 |
+
"learning_rate": 3.9298747047235327e-05,
|
| 8844 |
+
"loss": 1.9466,
|
| 8845 |
+
"step": 113200
|
| 8846 |
+
},
|
| 8847 |
+
{
|
| 8848 |
+
"epoch": 0.025954598987491556,
|
| 8849 |
+
"grad_norm": 1.5668888092041016,
|
| 8850 |
+
"learning_rate": 3.928057985820668e-05,
|
| 8851 |
+
"loss": 1.9479,
|
| 8852 |
+
"step": 113300
|
| 8853 |
+
},
|
| 8854 |
+
{
|
| 8855 |
+
"epoch": 0.026233680697034478,
|
| 8856 |
+
"grad_norm": 1.5239572525024414,
|
| 8857 |
+
"learning_rate": 3.926240146861314e-05,
|
| 8858 |
+
"loss": 1.9306,
|
| 8859 |
+
"step": 113400
|
| 8860 |
+
},
|
| 8861 |
+
{
|
| 8862 |
+
"epoch": 0.0265127624065774,
|
| 8863 |
+
"grad_norm": 1.6437016725540161,
|
| 8864 |
+
"learning_rate": 3.924421189271239e-05,
|
| 8865 |
+
"loss": 1.9364,
|
| 8866 |
+
"step": 113500
|
| 8867 |
+
},
|
| 8868 |
+
{
|
| 8869 |
+
"epoch": 0.026791844116120317,
|
| 8870 |
+
"grad_norm": 1.632081151008606,
|
| 8871 |
+
"learning_rate": 3.9226011144770904e-05,
|
| 8872 |
+
"loss": 1.9297,
|
| 8873 |
+
"step": 113600
|
| 8874 |
+
},
|
| 8875 |
+
{
|
| 8876 |
+
"epoch": 0.02707092582566324,
|
| 8877 |
+
"grad_norm": 1.577589750289917,
|
| 8878 |
+
"learning_rate": 3.920779923906393e-05,
|
| 8879 |
+
"loss": 1.9223,
|
| 8880 |
+
"step": 113700
|
| 8881 |
+
},
|
| 8882 |
+
{
|
| 8883 |
+
"epoch": 0.027350007535206156,
|
| 8884 |
+
"grad_norm": 1.5894604921340942,
|
| 8885 |
+
"learning_rate": 3.918957618987545e-05,
|
| 8886 |
+
"loss": 1.9263,
|
| 8887 |
+
"step": 113800
|
| 8888 |
+
},
|
| 8889 |
+
{
|
| 8890 |
+
"epoch": 0.027629089244749078,
|
| 8891 |
+
"grad_norm": 1.5186114311218262,
|
| 8892 |
+
"learning_rate": 3.9171342011498185e-05,
|
| 8893 |
+
"loss": 1.9261,
|
| 8894 |
+
"step": 113900
|
| 8895 |
+
},
|
| 8896 |
+
{
|
| 8897 |
+
"epoch": 0.027908170954292,
|
| 8898 |
+
"grad_norm": 1.639272689819336,
|
| 8899 |
+
"learning_rate": 3.9153096718233604e-05,
|
| 8900 |
+
"loss": 1.9166,
|
| 8901 |
+
"step": 114000
|
| 8902 |
+
},
|
| 8903 |
+
{
|
| 8904 |
+
"epoch": 0.027908170954292,
|
| 8905 |
+
"eval_loss": 2.2479372024536133,
|
| 8906 |
+
"eval_runtime": 51.8659,
|
| 8907 |
+
"eval_samples_per_second": 196.545,
|
| 8908 |
+
"eval_steps_per_second": 1.542,
|
| 8909 |
+
"step": 114000
|
| 8910 |
+
},
|
| 8911 |
+
{
|
| 8912 |
+
"epoch": 0.028187252663834917,
|
| 8913 |
+
"grad_norm": 1.5154080390930176,
|
| 8914 |
+
"learning_rate": 3.913484032439187e-05,
|
| 8915 |
+
"loss": 1.916,
|
| 8916 |
+
"step": 114100
|
| 8917 |
+
},
|
| 8918 |
+
{
|
| 8919 |
+
"epoch": 0.02846633437337784,
|
| 8920 |
+
"grad_norm": 1.5147068500518799,
|
| 8921 |
+
"learning_rate": 3.911657284429189e-05,
|
| 8922 |
+
"loss": 1.9153,
|
| 8923 |
+
"step": 114200
|
| 8924 |
+
},
|
| 8925 |
+
{
|
| 8926 |
+
"epoch": 0.028745416082920756,
|
| 8927 |
+
"grad_norm": 1.5977636575698853,
|
| 8928 |
+
"learning_rate": 3.9098294292261205e-05,
|
| 8929 |
+
"loss": 1.9027,
|
| 8930 |
+
"step": 114300
|
| 8931 |
+
},
|
| 8932 |
+
{
|
| 8933 |
+
"epoch": 0.029024497792463678,
|
| 8934 |
+
"grad_norm": 1.4557409286499023,
|
| 8935 |
+
"learning_rate": 3.908000468263609e-05,
|
| 8936 |
+
"loss": 1.9073,
|
| 8937 |
+
"step": 114400
|
| 8938 |
+
},
|
| 8939 |
+
{
|
| 8940 |
+
"epoch": 0.0293035795020066,
|
| 8941 |
+
"grad_norm": 1.5657786130905151,
|
| 8942 |
+
"learning_rate": 3.9061704029761495e-05,
|
| 8943 |
+
"loss": 1.9026,
|
| 8944 |
+
"step": 114500
|
| 8945 |
+
},
|
| 8946 |
+
{
|
| 8947 |
+
"epoch": 0.029582661211549517,
|
| 8948 |
+
"grad_norm": 1.5239813327789307,
|
| 8949 |
+
"learning_rate": 3.904339234799098e-05,
|
| 8950 |
+
"loss": 1.9092,
|
| 8951 |
+
"step": 114600
|
| 8952 |
+
},
|
| 8953 |
+
{
|
| 8954 |
+
"epoch": 0.02986174292109244,
|
| 8955 |
+
"grad_norm": 1.4764059782028198,
|
| 8956 |
+
"learning_rate": 3.9025069651686816e-05,
|
| 8957 |
+
"loss": 1.9019,
|
| 8958 |
+
"step": 114700
|
| 8959 |
+
},
|
| 8960 |
+
{
|
| 8961 |
+
"epoch": 0.030140824630635357,
|
| 8962 |
+
"grad_norm": 1.5580713748931885,
|
| 8963 |
+
"learning_rate": 3.9006735955219874e-05,
|
| 8964 |
+
"loss": 1.8958,
|
| 8965 |
+
"step": 114800
|
| 8966 |
+
},
|
| 8967 |
+
{
|
| 8968 |
+
"epoch": 0.030419906340178278,
|
| 8969 |
+
"grad_norm": 1.5598928928375244,
|
| 8970 |
+
"learning_rate": 3.898839127296968e-05,
|
| 8971 |
+
"loss": 1.8961,
|
| 8972 |
+
"step": 114900
|
| 8973 |
+
},
|
| 8974 |
+
{
|
| 8975 |
+
"epoch": 0.030698988049721196,
|
| 8976 |
+
"grad_norm": 1.4871125221252441,
|
| 8977 |
+
"learning_rate": 3.897003561932434e-05,
|
| 8978 |
+
"loss": 1.8978,
|
| 8979 |
+
"step": 115000
|
| 8980 |
+
},
|
| 8981 |
+
{
|
| 8982 |
+
"epoch": 0.030698988049721196,
|
| 8983 |
+
"eval_loss": 2.2493135929107666,
|
| 8984 |
+
"eval_runtime": 51.934,
|
| 8985 |
+
"eval_samples_per_second": 196.288,
|
| 8986 |
+
"eval_steps_per_second": 1.54,
|
| 8987 |
+
"step": 115000
|
| 8988 |
+
},
|
| 8989 |
+
{
|
| 8990 |
+
"epoch": 0.030978069759264117,
|
| 8991 |
+
"grad_norm": 1.4677025079727173,
|
| 8992 |
+
"learning_rate": 3.89516690086806e-05,
|
| 8993 |
+
"loss": 1.8905,
|
| 8994 |
+
"step": 115100
|
| 8995 |
+
},
|
| 8996 |
+
{
|
| 8997 |
+
"epoch": 0.03125715146880704,
|
| 8998 |
+
"grad_norm": 1.4988230466842651,
|
| 8999 |
+
"learning_rate": 3.8933291455443786e-05,
|
| 9000 |
+
"loss": 1.8935,
|
| 9001 |
+
"step": 115200
|
| 9002 |
+
},
|
| 9003 |
+
{
|
| 9004 |
+
"epoch": 0.03153623317834996,
|
| 9005 |
+
"grad_norm": 1.5289825201034546,
|
| 9006 |
+
"learning_rate": 3.891490297402781e-05,
|
| 9007 |
+
"loss": 1.8875,
|
| 9008 |
+
"step": 115300
|
| 9009 |
+
},
|
| 9010 |
+
{
|
| 9011 |
+
"epoch": 0.031815314887892875,
|
| 9012 |
+
"grad_norm": 1.480161428451538,
|
| 9013 |
+
"learning_rate": 3.889650357885514e-05,
|
| 9014 |
+
"loss": 1.8897,
|
| 9015 |
+
"step": 115400
|
| 9016 |
+
},
|
| 9017 |
+
{
|
| 9018 |
+
"epoch": 0.0320943965974358,
|
| 9019 |
+
"grad_norm": 1.6153870820999146,
|
| 9020 |
+
"learning_rate": 3.887809328435683e-05,
|
| 9021 |
+
"loss": 1.8878,
|
| 9022 |
+
"step": 115500
|
| 9023 |
+
},
|
| 9024 |
+
{
|
| 9025 |
+
"epoch": 0.03237347830697872,
|
| 9026 |
+
"grad_norm": 1.409119725227356,
|
| 9027 |
+
"learning_rate": 3.8859672104972454e-05,
|
| 9028 |
+
"loss": 1.8976,
|
| 9029 |
+
"step": 115600
|
| 9030 |
+
},
|
| 9031 |
+
{
|
| 9032 |
+
"epoch": 0.032652560016521635,
|
| 9033 |
+
"grad_norm": 1.5816664695739746,
|
| 9034 |
+
"learning_rate": 3.884124005515015e-05,
|
| 9035 |
+
"loss": 1.8876,
|
| 9036 |
+
"step": 115700
|
| 9037 |
+
},
|
| 9038 |
+
{
|
| 9039 |
+
"epoch": 0.03293164172606456,
|
| 9040 |
+
"grad_norm": 1.5077482461929321,
|
| 9041 |
+
"learning_rate": 3.882279714934657e-05,
|
| 9042 |
+
"loss": 1.888,
|
| 9043 |
+
"step": 115800
|
| 9044 |
+
},
|
| 9045 |
+
{
|
| 9046 |
+
"epoch": 0.03321072343560748,
|
| 9047 |
+
"grad_norm": 1.4968522787094116,
|
| 9048 |
+
"learning_rate": 3.880434340202686e-05,
|
| 9049 |
+
"loss": 1.8841,
|
| 9050 |
+
"step": 115900
|
| 9051 |
+
},
|
| 9052 |
+
{
|
| 9053 |
+
"epoch": 0.033489805145150396,
|
| 9054 |
+
"grad_norm": 1.467680811882019,
|
| 9055 |
+
"learning_rate": 3.878587882766472e-05,
|
| 9056 |
+
"loss": 1.8832,
|
| 9057 |
+
"step": 116000
|
| 9058 |
+
},
|
| 9059 |
+
{
|
| 9060 |
+
"epoch": 0.033489805145150396,
|
| 9061 |
+
"eval_loss": 2.2356553077697754,
|
| 9062 |
+
"eval_runtime": 51.966,
|
| 9063 |
+
"eval_samples_per_second": 196.167,
|
| 9064 |
+
"eval_steps_per_second": 1.539,
|
| 9065 |
+
"step": 116000
|
| 9066 |
+
},
|
| 9067 |
+
{
|
| 9068 |
+
"epoch": 0.033768886854693314,
|
| 9069 |
+
"grad_norm": 1.4993896484375,
|
| 9070 |
+
"learning_rate": 3.87674034407423e-05,
|
| 9071 |
+
"loss": 1.8783,
|
| 9072 |
+
"step": 116100
|
| 9073 |
+
},
|
| 9074 |
+
{
|
| 9075 |
+
"epoch": 0.03404796856423624,
|
| 9076 |
+
"grad_norm": 1.479200005531311,
|
| 9077 |
+
"learning_rate": 3.8748917255750225e-05,
|
| 9078 |
+
"loss": 1.8864,
|
| 9079 |
+
"step": 116200
|
| 9080 |
+
},
|
| 9081 |
+
{
|
| 9082 |
+
"epoch": 0.03432705027377916,
|
| 9083 |
+
"grad_norm": 1.4365732669830322,
|
| 9084 |
+
"learning_rate": 3.873042028718764e-05,
|
| 9085 |
+
"loss": 1.877,
|
| 9086 |
+
"step": 116300
|
| 9087 |
+
},
|
| 9088 |
+
{
|
| 9089 |
+
"epoch": 0.034606131983322075,
|
| 9090 |
+
"grad_norm": 1.4553158283233643,
|
| 9091 |
+
"learning_rate": 3.871191254956208e-05,
|
| 9092 |
+
"loss": 1.8873,
|
| 9093 |
+
"step": 116400
|
| 9094 |
+
},
|
| 9095 |
+
{
|
| 9096 |
+
"epoch": 0.034885213692865,
|
| 9097 |
+
"grad_norm": 1.505732774734497,
|
| 9098 |
+
"learning_rate": 3.8693394057389574e-05,
|
| 9099 |
+
"loss": 1.8737,
|
| 9100 |
+
"step": 116500
|
| 9101 |
+
},
|
| 9102 |
+
{
|
| 9103 |
+
"epoch": 0.03516429540240792,
|
| 9104 |
+
"grad_norm": 1.4430396556854248,
|
| 9105 |
+
"learning_rate": 3.8674864825194574e-05,
|
| 9106 |
+
"loss": 1.8743,
|
| 9107 |
+
"step": 116600
|
| 9108 |
+
},
|
| 9109 |
+
{
|
| 9110 |
+
"epoch": 0.035443377111950836,
|
| 9111 |
+
"grad_norm": 1.5058810710906982,
|
| 9112 |
+
"learning_rate": 3.865632486750996e-05,
|
| 9113 |
+
"loss": 1.8734,
|
| 9114 |
+
"step": 116700
|
| 9115 |
+
},
|
| 9116 |
+
{
|
| 9117 |
+
"epoch": 0.035722458821493754,
|
| 9118 |
+
"grad_norm": 1.551901936531067,
|
| 9119 |
+
"learning_rate": 3.8637774198877e-05,
|
| 9120 |
+
"loss": 1.8754,
|
| 9121 |
+
"step": 116800
|
| 9122 |
+
},
|
| 9123 |
+
{
|
| 9124 |
+
"epoch": 0.03600154053103668,
|
| 9125 |
+
"grad_norm": 1.5766918659210205,
|
| 9126 |
+
"learning_rate": 3.86192128338454e-05,
|
| 9127 |
+
"loss": 1.878,
|
| 9128 |
+
"step": 116900
|
| 9129 |
+
},
|
| 9130 |
+
{
|
| 9131 |
+
"epoch": 0.036280622240579596,
|
| 9132 |
+
"grad_norm": 1.5308256149291992,
|
| 9133 |
+
"learning_rate": 3.860064078697323e-05,
|
| 9134 |
+
"loss": 1.8833,
|
| 9135 |
+
"step": 117000
|
| 9136 |
+
},
|
| 9137 |
+
{
|
| 9138 |
+
"epoch": 0.036280622240579596,
|
| 9139 |
+
"eval_loss": 2.2324624061584473,
|
| 9140 |
+
"eval_runtime": 51.883,
|
| 9141 |
+
"eval_samples_per_second": 196.48,
|
| 9142 |
+
"eval_steps_per_second": 1.542,
|
| 9143 |
+
"step": 117000
|
| 9144 |
}
|
| 9145 |
],
|
| 9146 |
"logging_steps": 100,
|
|
|
|
| 9160 |
"attributes": {}
|
| 9161 |
}
|
| 9162 |
},
|
| 9163 |
+
"total_flos": 1.0210871976394752e+19,
|
| 9164 |
"train_batch_size": 128,
|
| 9165 |
"trial_name": null,
|
| 9166 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5777
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da5f5517b1675eb630da2afe2ee47f40a6f105aba3407f1e48d33a873836c026
|
| 3 |
size 5777
|