Upload 10 files

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +1095 -3
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a710773cfd7f93749b548b4dc475790d75538b97475d047166dceb50704eb746
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:f341e7c0d50547a5d48a2244cc30330ab7ed2ceaff5186455a531e8c69a77105
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fedf1b4c8a508947f08f4a98315b58cd6a43e2a1adda4f18d9617c092f6a8844
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:2b47ad41e6a351695e914e54e9b102721f48985ba894d88bae93aad1de73672f
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0412622810efe6fde95b3cfeff4557f637e942d79ee2fa68f136e7ee99e430b1
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:0065f5fa67d21a3e3251b9235347d2a9d93494140e986cefd3a276ca1160a3e0
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f5967d2fde5e8af8b726d755ee2aea2a1a3996cd4db019463bea602f6a5c353f
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:6ad1df73ab0092710b52025da1ad2250f73bf46d66d45d561b7da8dfce44525e
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.002796,
   "eval_steps": 1000,
-  "global_step": 368000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -28719,6 +28719,1098 @@
       "eval_samples_per_second": 196.867,
       "eval_steps_per_second": 1.545,
       "step": 368000
     }
   ],
   "logging_steps": 100,
@@ -28738,7 +29830,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.211620496844764e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 117.000154,
   "eval_steps": 1000,
+  "global_step": 382000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 196.867,
       "eval_steps_per_second": 1.545,
       "step": 368000
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 1.7583304643630981,
+      "learning_rate": 8.378246507831702e-06,
+      "loss": 2.0821,
+      "step": 368100
+    },
+    {
+      "epoch": 1.000162,
+      "grad_norm": 1.6928149461746216,
+      "learning_rate": 8.366398135029847e-06,
+      "loss": 1.9175,
+      "step": 368200
+    },
+    {
+      "epoch": 2.000124,
+      "grad_norm": 1.6941829919815063,
+      "learning_rate": 8.354556462240829e-06,
+      "loss": 1.8645,
+      "step": 368300
+    },
+    {
+      "epoch": 3.000086,
+      "grad_norm": 1.6606141328811646,
+      "learning_rate": 8.342721494234487e-06,
+      "loss": 1.8296,
+      "step": 368400
+    },
+    {
+      "epoch": 4.000048,
+      "grad_norm": 1.508047342300415,
+      "learning_rate": 8.330893235777929e-06,
+      "loss": 1.7982,
+      "step": 368500
+    },
+    {
+      "epoch": 5.00001,
+      "grad_norm": 1.6567221879959106,
+      "learning_rate": 8.31907169163558e-06,
+      "loss": 1.776,
+      "step": 368600
+    },
+    {
+      "epoch": 5.00021,
+      "grad_norm": 1.5388526916503906,
+      "learning_rate": 8.30725686656916e-06,
+      "loss": 1.7492,
+      "step": 368700
+    },
+    {
+      "epoch": 6.000172,
+      "grad_norm": 1.6148278713226318,
+      "learning_rate": 8.295448765337685e-06,
+      "loss": 1.7284,
+      "step": 368800
+    },
+    {
+      "epoch": 7.000134,
+      "grad_norm": 1.5249569416046143,
+      "learning_rate": 8.28364739269744e-06,
+      "loss": 1.7221,
+      "step": 368900
+    },
+    {
+      "epoch": 8.000096,
+      "grad_norm": 1.5550845861434937,
+      "learning_rate": 8.271852753402028e-06,
+      "loss": 1.7079,
+      "step": 369000
+    },
+    {
+      "epoch": 8.000096,
+      "eval_loss": 1.9369168281555176,
+      "eval_runtime": 55.0617,
+      "eval_samples_per_second": 185.138,
+      "eval_steps_per_second": 1.453,
+      "step": 369000
+    },
+    {
+      "epoch": 9.000058,
+      "grad_norm": 2.09401273727417,
+      "learning_rate": 8.260064852202329e-06,
+      "loss": 3.9424,
+      "step": 369100
+    },
+    {
+      "epoch": 10.00002,
+      "grad_norm": 1.9706476926803589,
+      "learning_rate": 8.248283693846509e-06,
+      "loss": 2.7687,
+      "step": 369200
+    },
+    {
+      "epoch": 10.00022,
+      "grad_norm": 2.0509135723114014,
+      "learning_rate": 8.23650928308001e-06,
+      "loss": 2.546,
+      "step": 369300
+    },
+    {
+      "epoch": 11.000182,
+      "grad_norm": 1.9125868082046509,
+      "learning_rate": 8.224741624645565e-06,
+      "loss": 2.4164,
+      "step": 369400
+    },
+    {
+      "epoch": 12.000144,
+      "grad_norm": 2.175070285797119,
+      "learning_rate": 8.212980723283186e-06,
+      "loss": 2.3405,
+      "step": 369500
+    },
+    {
+      "epoch": 13.000106,
+      "grad_norm": 1.9154648780822754,
+      "learning_rate": 8.201226583730175e-06,
+      "loss": 2.2729,
+      "step": 369600
+    },
+    {
+      "epoch": 14.000068,
+      "grad_norm": 2.021451711654663,
+      "learning_rate": 8.189479210721076e-06,
+      "loss": 2.2268,
+      "step": 369700
+    },
+    {
+      "epoch": 15.00003,
+      "grad_norm": 2.0009710788726807,
+      "learning_rate": 8.177738608987745e-06,
+      "loss": 2.1859,
+      "step": 369800
+    },
+    {
+      "epoch": 15.00023,
+      "grad_norm": 1.9311867952346802,
+      "learning_rate": 8.166004783259295e-06,
+      "loss": 2.1494,
+      "step": 369900
+    },
+    {
+      "epoch": 16.000192,
+      "grad_norm": 1.967115044593811,
+      "learning_rate": 8.154277738262097e-06,
+      "loss": 2.1181,
+      "step": 370000
+    },
+    {
+      "epoch": 16.000192,
+      "eval_loss": 2.407406806945801,
+      "eval_runtime": 54.9275,
+      "eval_samples_per_second": 185.59,
+      "eval_steps_per_second": 1.456,
+      "step": 370000
+    },
+    {
+      "epoch": 17.000154,
+      "grad_norm": 2.050703525543213,
+      "learning_rate": 8.142557478719814e-06,
+      "loss": 2.496,
+      "step": 370100
+    },
+    {
+      "epoch": 18.000116,
+      "grad_norm": 2.053346872329712,
+      "learning_rate": 8.130844009353362e-06,
+      "loss": 2.3323,
+      "step": 370200
+    },
+    {
+      "epoch": 19.000078,
+      "grad_norm": 1.9913196563720703,
+      "learning_rate": 8.119137334880933e-06,
+      "loss": 2.2625,
+      "step": 370300
+    },
+    {
+      "epoch": 20.00004,
+      "grad_norm": 2.018827438354492,
+      "learning_rate": 8.107437460017958e-06,
+      "loss": 2.2166,
+      "step": 370400
+    },
+    {
+      "epoch": 21.000002,
+      "grad_norm": 2.2157461643218994,
+      "learning_rate": 8.095744389477155e-06,
+      "loss": 2.1759,
+      "step": 370500
+    },
+    {
+      "epoch": 21.000202,
+      "grad_norm": 1.975722312927246,
+      "learning_rate": 8.084058127968497e-06,
+      "loss": 2.1349,
+      "step": 370600
+    },
+    {
+      "epoch": 22.000164,
+      "grad_norm": 2.118351459503174,
+      "learning_rate": 8.072378680199197e-06,
+      "loss": 2.1051,
+      "step": 370700
+    },
+    {
+      "epoch": 23.000126,
+      "grad_norm": 1.9632095098495483,
+      "learning_rate": 8.060706050873746e-06,
+      "loss": 2.0781,
+      "step": 370800
+    },
+    {
+      "epoch": 24.000088,
+      "grad_norm": 2.0141265392303467,
+      "learning_rate": 8.049040244693864e-06,
+      "loss": 2.0583,
+      "step": 370900
+    },
+    {
+      "epoch": 25.00005,
+      "grad_norm": 2.0297467708587646,
+      "learning_rate": 8.037381266358546e-06,
+      "loss": 2.0323,
+      "step": 371000
+    },
+    {
+      "epoch": 25.00005,
+      "eval_loss": 2.3608412742614746,
+      "eval_runtime": 55.107,
+      "eval_samples_per_second": 184.986,
+      "eval_steps_per_second": 1.452,
+      "step": 371000
+    },
+    {
+      "epoch": 26.000012,
+      "grad_norm": 2.0017659664154053,
+      "learning_rate": 8.025729120564025e-06,
+      "loss": 2.2111,
+      "step": 371100
+    },
+    {
+      "epoch": 26.000212,
+      "grad_norm": 2.087977409362793,
+      "learning_rate": 8.01408381200379e-06,
+      "loss": 2.1626,
+      "step": 371200
+    },
+    {
+      "epoch": 27.000174,
+      "grad_norm": 1.9115463495254517,
+      "learning_rate": 8.002445345368556e-06,
+      "loss": 2.1198,
+      "step": 371300
+    },
+    {
+      "epoch": 28.000136,
+      "grad_norm": 2.075347423553467,
+      "learning_rate": 7.990813725346307e-06,
+      "loss": 2.0987,
+      "step": 371400
+    },
+    {
+      "epoch": 29.000098,
+      "grad_norm": 2.004270553588867,
+      "learning_rate": 7.979188956622263e-06,
+      "loss": 2.0634,
+      "step": 371500
+    },
+    {
+      "epoch": 30.00006,
+      "grad_norm": 2.0730834007263184,
+      "learning_rate": 7.967571043878863e-06,
+      "loss": 2.0421,
+      "step": 371600
+    },
+    {
+      "epoch": 31.000022,
+      "grad_norm": 2.0204977989196777,
+      "learning_rate": 7.955959991795809e-06,
+      "loss": 2.0191,
+      "step": 371700
+    },
+    {
+      "epoch": 31.000222,
+      "grad_norm": 1.9809165000915527,
+      "learning_rate": 7.944355805050032e-06,
+      "loss": 1.9979,
+      "step": 371800
+    },
+    {
+      "epoch": 32.000184,
+      "grad_norm": 1.8896480798721313,
+      "learning_rate": 7.932758488315705e-06,
+      "loss": 1.9788,
+      "step": 371900
+    },
+    {
+      "epoch": 33.000146,
+      "grad_norm": 1.8905068635940552,
+      "learning_rate": 7.921168046264213e-06,
+      "loss": 1.9646,
+      "step": 372000
+    },
+    {
+      "epoch": 33.000146,
+      "eval_loss": 2.3312835693359375,
+      "eval_runtime": 55.0336,
+      "eval_samples_per_second": 185.232,
+      "eval_steps_per_second": 1.454,
+      "step": 372000
+    },
+    {
+      "epoch": 34.000108,
+      "grad_norm": 2.0993173122406006,
+      "learning_rate": 7.909584483564187e-06,
+      "loss": 2.0813,
+      "step": 372100
+    },
+    {
+      "epoch": 35.00007,
+      "grad_norm": 2.0958781242370605,
+      "learning_rate": 7.898007804881485e-06,
+      "loss": 2.0596,
+      "step": 372200
+    },
+    {
+      "epoch": 36.000032,
+      "grad_norm": 1.9180951118469238,
+      "learning_rate": 7.886438014879205e-06,
+      "loss": 2.0353,
+      "step": 372300
+    },
+    {
+      "epoch": 36.000232,
+      "grad_norm": 2.0129170417785645,
+      "learning_rate": 7.874875118217639e-06,
+      "loss": 2.007,
+      "step": 372400
+    },
+    {
+      "epoch": 37.000194,
+      "grad_norm": 1.9586989879608154,
+      "learning_rate": 7.863319119554325e-06,
+      "loss": 1.9911,
+      "step": 372500
+    },
+    {
+      "epoch": 38.000156,
+      "grad_norm": 2.0036728382110596,
+      "learning_rate": 7.851770023544022e-06,
+      "loss": 1.97,
+      "step": 372600
+    },
+    {
+      "epoch": 39.000118,
+      "grad_norm": 2.0655548572540283,
+      "learning_rate": 7.840227834838709e-06,
+      "loss": 1.9609,
+      "step": 372700
+    },
+    {
+      "epoch": 40.00008,
+      "grad_norm": 1.8536264896392822,
+      "learning_rate": 7.828692558087566e-06,
+      "loss": 1.9389,
+      "step": 372800
+    },
+    {
+      "epoch": 41.000042,
+      "grad_norm": 2.0123019218444824,
+      "learning_rate": 7.817164197937006e-06,
+      "loss": 1.9311,
+      "step": 372900
+    },
+    {
+      "epoch": 42.000004,
+      "grad_norm": 1.9356095790863037,
+      "learning_rate": 7.80564275903066e-06,
+      "loss": 1.9157,
+      "step": 373000
+    },
+    {
+      "epoch": 42.000004,
+      "eval_loss": 2.2908835411071777,
+      "eval_runtime": 54.8694,
+      "eval_samples_per_second": 185.787,
+      "eval_steps_per_second": 1.458,
+      "step": 373000
+    },
+    {
+      "epoch": 42.000204,
+      "grad_norm": 1.9983534812927246,
+      "learning_rate": 7.794128246009346e-06,
+      "loss": 1.9932,
+      "step": 373100
+    },
+    {
+      "epoch": 43.000166,
+      "grad_norm": 2.0036892890930176,
+      "learning_rate": 7.782620663511117e-06,
+      "loss": 1.9803,
+      "step": 373200
+    },
+    {
+      "epoch": 44.000128,
+      "grad_norm": 1.9349839687347412,
+      "learning_rate": 7.771120016171227e-06,
+      "loss": 1.9687,
+      "step": 373300
+    },
+    {
+      "epoch": 45.00009,
+      "grad_norm": 1.8848403692245483,
+      "learning_rate": 7.759626308622142e-06,
+      "loss": 1.9474,
+      "step": 373400
+    },
+    {
+      "epoch": 46.000052,
+      "grad_norm": 1.9943233728408813,
+      "learning_rate": 7.74813954549351e-06,
+      "loss": 1.9319,
+      "step": 373500
+    },
+    {
+      "epoch": 47.000014,
+      "grad_norm": 1.9002938270568848,
+      "learning_rate": 7.736659731412204e-06,
+      "loss": 1.9217,
+      "step": 373600
+    },
+    {
+      "epoch": 47.000214,
+      "grad_norm": 1.9708117246627808,
+      "learning_rate": 7.725186871002296e-06,
+      "loss": 1.9083,
+      "step": 373700
+    },
+    {
+      "epoch": 48.000176,
+      "grad_norm": 1.9721884727478027,
+      "learning_rate": 7.713720968885057e-06,
+      "loss": 1.8956,
+      "step": 373800
+    },
+    {
+      "epoch": 49.000138,
+      "grad_norm": 1.9223700761795044,
+      "learning_rate": 7.702262029678939e-06,
+      "loss": 1.8808,
+      "step": 373900
+    },
+    {
+      "epoch": 50.0001,
+      "grad_norm": 2.03428316116333,
+      "learning_rate": 7.690810057999607e-06,
+      "loss": 1.868,
+      "step": 374000
+    },
+    {
+      "epoch": 50.0001,
+      "eval_loss": 2.2805299758911133,
+      "eval_runtime": 55.0731,
+      "eval_samples_per_second": 185.099,
+      "eval_steps_per_second": 1.453,
+      "step": 374000
+    },
+    {
+      "epoch": 51.000062,
+      "grad_norm": 1.947739601135254,
+      "learning_rate": 7.67936505845991e-06,
+      "loss": 1.9356,
+      "step": 374100
+    },
+    {
+      "epoch": 52.000024,
+      "grad_norm": 1.939833164215088,
+      "learning_rate": 7.667927035669906e-06,
+      "loss": 1.9287,
+      "step": 374200
+    },
+    {
+      "epoch": 52.000224,
+      "grad_norm": 2.120412588119507,
+      "learning_rate": 7.656495994236813e-06,
+      "loss": 1.9083,
+      "step": 374300
+    },
+    {
+      "epoch": 53.000186,
+      "grad_norm": 1.9514408111572266,
+      "learning_rate": 7.645071938765055e-06,
+      "loss": 1.9005,
+      "step": 374400
+    },
+    {
+      "epoch": 54.000148,
+      "grad_norm": 1.9537405967712402,
+      "learning_rate": 7.633654873856258e-06,
+      "loss": 1.8885,
+      "step": 374500
+    },
+    {
+      "epoch": 55.00011,
+      "grad_norm": 1.9912673234939575,
+      "learning_rate": 7.6222448041091884e-06,
+      "loss": 1.8727,
+      "step": 374600
+    },
+    {
+      "epoch": 56.000072,
+      "grad_norm": 2.0160086154937744,
+      "learning_rate": 7.6108417341198366e-06,
+      "loss": 1.8652,
+      "step": 374700
+    },
+    {
+      "epoch": 57.000034,
+      "grad_norm": 1.962786078453064,
+      "learning_rate": 7.599445668481353e-06,
+      "loss": 1.8495,
+      "step": 374800
+    },
+    {
+      "epoch": 57.000234,
+      "grad_norm": 2.0677285194396973,
+      "learning_rate": 7.588056611784084e-06,
+      "loss": 1.8414,
+      "step": 374900
+    },
+    {
+      "epoch": 58.000196,
+      "grad_norm": 1.923409104347229,
+      "learning_rate": 7.576674568615519e-06,
+      "loss": 1.8278,
+      "step": 375000
+    },
+    {
+      "epoch": 58.000196,
+      "eval_loss": 2.2644314765930176,
+      "eval_runtime": 54.7576,
+      "eval_samples_per_second": 186.166,
+      "eval_steps_per_second": 1.461,
+      "step": 375000
+    },
+    {
+      "epoch": 59.000158,
+      "grad_norm": 2.0004312992095947,
+      "learning_rate": 7.565299543560353e-06,
+      "loss": 1.8848,
+      "step": 375100
+    },
+    {
+      "epoch": 60.00012,
+      "grad_norm": 2.0457980632781982,
+      "learning_rate": 7.553931541200448e-06,
+      "loss": 1.8788,
+      "step": 375200
+    },
+    {
+      "epoch": 61.000082,
+      "grad_norm": 1.9472349882125854,
+      "learning_rate": 7.54257056611484e-06,
+      "loss": 1.8666,
+      "step": 375300
+    },
+    {
+      "epoch": 62.000044,
+      "grad_norm": 2.019150733947754,
+      "learning_rate": 7.531216622879711e-06,
+      "loss": 1.8555,
+      "step": 375400
+    },
+    {
+      "epoch": 63.000006,
+      "grad_norm": 1.9674944877624512,
+      "learning_rate": 7.5198697160684365e-06,
+      "loss": 1.8495,
+      "step": 375500
+    },
+    {
+      "epoch": 63.000206,
+      "grad_norm": 1.959089756011963,
+      "learning_rate": 7.5085298502515525e-06,
+      "loss": 1.8353,
+      "step": 375600
+    },
+    {
+      "epoch": 64.000168,
+      "grad_norm": 1.9350240230560303,
+      "learning_rate": 7.4971970299967605e-06,
+      "loss": 1.8257,
+      "step": 375700
+    },
+    {
+      "epoch": 65.00013,
+      "grad_norm": 1.9134896993637085,
+      "learning_rate": 7.4858712598689014e-06,
+      "loss": 1.8124,
+      "step": 375800
+    },
+    {
+      "epoch": 66.000092,
+      "grad_norm": 2.0086705684661865,
+      "learning_rate": 7.474552544430008e-06,
+      "loss": 1.8052,
+      "step": 375900
+    },
+    {
+      "epoch": 67.000054,
+      "grad_norm": 1.9945427179336548,
+      "learning_rate": 7.4632408882392504e-06,
+      "loss": 1.8005,
+      "step": 376000
+    },
+    {
+      "epoch": 67.000054,
+      "eval_loss": 2.248349189758301,
+      "eval_runtime": 54.5876,
+      "eval_samples_per_second": 186.746,
+      "eval_steps_per_second": 1.466,
+      "step": 376000
+    },
+    {
+      "epoch": 68.000016,
+      "grad_norm": 1.9743598699569702,
+      "learning_rate": 7.451936295852976e-06,
+      "loss": 1.8454,
+      "step": 376100
+    },
+    {
+      "epoch": 68.000216,
+      "grad_norm": 1.898568034172058,
+      "learning_rate": 7.440638771824654e-06,
+      "loss": 1.8431,
+      "step": 376200
+    },
+    {
+      "epoch": 69.000178,
+      "grad_norm": 2.142463445663452,
+      "learning_rate": 7.429348320704935e-06,
+      "loss": 1.8277,
+      "step": 376300
+    },
+    {
+      "epoch": 70.00014,
+      "grad_norm": 1.9892468452453613,
+      "learning_rate": 7.41806494704162e-06,
+      "loss": 1.8119,
+      "step": 376400
+    },
+    {
+      "epoch": 71.000102,
+      "grad_norm": 2.005885601043701,
+      "learning_rate": 7.406788655379634e-06,
+      "loss": 1.8086,
+      "step": 376500
+    },
+    {
+      "epoch": 72.000064,
+      "grad_norm": 1.9385697841644287,
+      "learning_rate": 7.395519450261074e-06,
+      "loss": 1.8024,
+      "step": 376600
+    },
+    {
+      "epoch": 73.000026,
+      "grad_norm": 1.9773157835006714,
+      "learning_rate": 7.384257336225173e-06,
+      "loss": 1.7934,
+      "step": 376700
+    },
+    {
+      "epoch": 73.000226,
+      "grad_norm": 1.8618143796920776,
+      "learning_rate": 7.373002317808317e-06,
+      "loss": 1.7824,
+      "step": 376800
+    },
+    {
+      "epoch": 74.000188,
+      "grad_norm": 1.9531538486480713,
+      "learning_rate": 7.361754399544013e-06,
+      "loss": 1.7727,
+      "step": 376900
+    },
+    {
+      "epoch": 75.00015,
+      "grad_norm": 1.931515097618103,
+      "learning_rate": 7.350513585962926e-06,
+      "loss": 1.764,
+      "step": 377000
+    },
+    {
+      "epoch": 75.00015,
+      "eval_loss": 2.2430500984191895,
+      "eval_runtime": 54.6415,
+      "eval_samples_per_second": 186.561,
+      "eval_steps_per_second": 1.464,
+      "step": 377000
+    },
+    {
+      "epoch": 76.000112,
+      "grad_norm": 1.9521348476409912,
+      "learning_rate": 7.339279881592859e-06,
+      "loss": 1.8087,
+      "step": 377100
+    },
+    {
+      "epoch": 77.000074,
+      "grad_norm": 2.0013513565063477,
+      "learning_rate": 7.32805329095875e-06,
+      "loss": 1.8023,
+      "step": 377200
+    },
+    {
+      "epoch": 78.000036,
+      "grad_norm": 1.8955408334732056,
+      "learning_rate": 7.316833818582652e-06,
+      "loss": 1.7943,
+      "step": 377300
+    },
+    {
+      "epoch": 78.000236,
+      "grad_norm": 2.0025761127471924,
+      "learning_rate": 7.305621468983781e-06,
+      "loss": 1.7903,
+      "step": 377400
+    },
+    {
+      "epoch": 79.000198,
+      "grad_norm": 1.9769165515899658,
+      "learning_rate": 7.294416246678462e-06,
+      "loss": 1.7774,
+      "step": 377500
+    },
+    {
+      "epoch": 80.00016,
+      "grad_norm": 1.8650860786437988,
+      "learning_rate": 7.283218156180174e-06,
+      "loss": 1.7698,
+      "step": 377600
+    },
+    {
+      "epoch": 81.000122,
+      "grad_norm": 1.9133366346359253,
+      "learning_rate": 7.272027201999484e-06,
+      "loss": 1.7658,
+      "step": 377700
+    },
+    {
+      "epoch": 82.000084,
+      "grad_norm": 1.9629889726638794,
+      "learning_rate": 7.260843388644117e-06,
+      "loss": 1.7552,
+      "step": 377800
+    },
+    {
+      "epoch": 83.000046,
+      "grad_norm": 1.9844943284988403,
+      "learning_rate": 7.249666720618919e-06,
+      "loss": 1.7539,
+      "step": 377900
+    },
+    {
+      "epoch": 84.000008,
+      "grad_norm": 1.9470826387405396,
+      "learning_rate": 7.238497202425834e-06,
+      "loss": 1.7404,
+      "step": 378000
+    },
+    {
+      "epoch": 84.000008,
+      "eval_loss": 2.234076499938965,
+      "eval_runtime": 54.5427,
+      "eval_samples_per_second": 186.9,
+      "eval_steps_per_second": 1.467,
+      "step": 378000
+    },
+    {
+      "epoch": 84.000208,
+      "grad_norm": 2.091539144515991,
+      "learning_rate": 7.2273348385639535e-06,
+      "loss": 1.7783,
+      "step": 378100
+    },
+    {
+      "epoch": 85.00017,
+      "grad_norm": 1.9156265258789062,
+      "learning_rate": 7.216179633529477e-06,
+      "loss": 1.7714,
+      "step": 378200
+    },
+    {
+      "epoch": 86.000132,
+      "grad_norm": 2.0570554733276367,
+      "learning_rate": 7.205031591815723e-06,
+      "loss": 1.7658,
+      "step": 378300
+    },
+    {
+      "epoch": 87.000094,
+      "grad_norm": 2.0413947105407715,
+      "learning_rate": 7.193890717913107e-06,
+      "loss": 1.7564,
+      "step": 378400
+    },
+    {
+      "epoch": 88.000056,
+      "grad_norm": 1.91609787940979,
+      "learning_rate": 7.18275701630918e-06,
+      "loss": 1.7538,
+      "step": 378500
+    },
+    {
+      "epoch": 89.000018,
+      "grad_norm": 1.8070498704910278,
+      "learning_rate": 7.171630491488598e-06,
+      "loss": 1.7439,
+      "step": 378600
+    },
+    {
+      "epoch": 89.000218,
+      "grad_norm": 1.9066287279129028,
+      "learning_rate": 7.16051114793313e-06,
+      "loss": 1.7382,
+      "step": 378700
+    },
+    {
+      "epoch": 90.00018,
+      "grad_norm": 1.8805670738220215,
+      "learning_rate": 7.149398990121628e-06,
+      "loss": 1.7322,
+      "step": 378800
+    },
+    {
+      "epoch": 91.000142,
+      "grad_norm": 1.93112313747406,
+      "learning_rate": 7.138294022530081e-06,
+      "loss": 1.7221,
+      "step": 378900
+    },
+    {
+      "epoch": 92.000104,
+      "grad_norm": 1.9273699522018433,
+      "learning_rate": 7.127196249631565e-06,
+      "loss": 1.717,
+      "step": 379000
+    },
+    {
+      "epoch": 92.000104,
+      "eval_loss": 2.222762107849121,
+      "eval_runtime": 54.5793,
+      "eval_samples_per_second": 186.774,
+      "eval_steps_per_second": 1.466,
+      "step": 379000
+    },
+    {
+      "epoch": 93.000066,
+      "grad_norm": 1.9170584678649902,
+      "learning_rate": 7.116105675896276e-06,
+      "loss": 1.7486,
+      "step": 379100
+    },
+    {
+      "epoch": 94.000028,
+      "grad_norm": 1.886796474456787,
+      "learning_rate": 7.105022305791467e-06,
+      "loss": 1.7455,
+      "step": 379200
+    },
+    {
+      "epoch": 94.000228,
+      "grad_norm": 1.9963804483413696,
+      "learning_rate": 7.0939461437815354e-06,
+      "loss": 1.744,
+      "step": 379300
+    },
+    {
+      "epoch": 95.00019,
+      "grad_norm": 1.9092683792114258,
+      "learning_rate": 7.082877194327953e-06,
+      "loss": 1.7332,
+      "step": 379400
+    },
+    {
+      "epoch": 96.000152,
+      "grad_norm": 1.9792388677597046,
+      "learning_rate": 7.071815461889303e-06,
+      "loss": 1.728,
+      "step": 379500
+    },
+    {
+      "epoch": 97.000114,
+      "grad_norm": 1.9630019664764404,
+      "learning_rate": 7.060760950921233e-06,
+      "loss": 1.7224,
+      "step": 379600
+    },
+    {
+      "epoch": 98.000076,
+      "grad_norm": 1.9032080173492432,
+      "learning_rate": 7.049713665876509e-06,
+      "loss": 1.7176,
+      "step": 379700
+    },
+    {
+      "epoch": 99.000038,
+      "grad_norm": 1.9760445356369019,
+      "learning_rate": 7.038673611204971e-06,
+      "loss": 1.7142,
+      "step": 379800
+    },
+    {
+      "epoch": 99.000238,
+      "grad_norm": 2.5537993907928467,
+      "learning_rate": 7.027640791353562e-06,
+      "loss": 1.7043,
+      "step": 379900
+    },
+    {
+      "epoch": 100.0002,
+      "grad_norm": 1.9134443998336792,
+      "learning_rate": 7.016615210766287e-06,
+      "loss": 1.6935,
+      "step": 380000
+    },
+    {
+      "epoch": 100.0002,
+      "eval_loss": 2.2129366397857666,
+      "eval_runtime": 54.6255,
+      "eval_samples_per_second": 186.616,
+      "eval_steps_per_second": 1.465,
+      "step": 380000
+    },
+    {
+      "epoch": 101.000162,
+      "grad_norm": 1.8621317148208618,
+      "learning_rate": 7.005596873884254e-06,
+      "loss": 1.7287,
+      "step": 380100
+    },
+    {
+      "epoch": 102.000124,
+      "grad_norm": 2.0007071495056152,
+      "learning_rate": 6.994585785145647e-06,
+      "loss": 1.7216,
+      "step": 380200
+    },
+    {
+      "epoch": 103.000086,
+      "grad_norm": 1.981418490409851,
+      "learning_rate": 6.98358194898574e-06,
+      "loss": 1.7192,
+      "step": 380300
+    },
+    {
+      "epoch": 104.000048,
+      "grad_norm": 1.7912635803222656,
+      "learning_rate": 6.972585369836865e-06,
+      "loss": 1.7046,
+      "step": 380400
+    },
+    {
+      "epoch": 105.00001,
+      "grad_norm": 1.9558844566345215,
+      "learning_rate": 6.961596052128444e-06,
+      "loss": 1.708,
+      "step": 380500
+    },
+    {
+      "epoch": 105.00021,
+      "grad_norm": 1.9592783451080322,
+      "learning_rate": 6.9506140002869756e-06,
+      "loss": 1.699,
+      "step": 380600
+    },
+    {
+      "epoch": 106.000172,
+      "grad_norm": 1.9580655097961426,
+      "learning_rate": 6.939639218736041e-06,
+      "loss": 1.6912,
+      "step": 380700
+    },
+    {
+      "epoch": 107.000134,
+      "grad_norm": 1.9187573194503784,
+      "learning_rate": 6.928671711896259e-06,
+      "loss": 1.6864,
+      "step": 380800
+    },
+    {
+      "epoch": 108.000096,
+      "grad_norm": 2.0804340839385986,
+      "learning_rate": 6.917711484185349e-06,
+      "loss": 1.6843,
+      "step": 380900
+    },
+    {
+      "epoch": 109.000058,
+      "grad_norm": 1.9156286716461182,
+      "learning_rate": 6.906758540018099e-06,
+      "loss": 1.6788,
+      "step": 381000
+    },
+    {
+      "epoch": 109.000058,
+      "eval_loss": 2.2096140384674072,
+      "eval_runtime": 54.6776,
+      "eval_samples_per_second": 186.438,
+      "eval_steps_per_second": 1.463,
+      "step": 381000
+    },
+    {
+      "epoch": 110.00002,
+      "grad_norm": 1.8327763080596924,
+      "learning_rate": 6.895812883806341e-06,
+      "loss": 1.703,
+      "step": 381100
+    },
+    {
+      "epoch": 110.00022,
+      "grad_norm": 1.9155895709991455,
+      "learning_rate": 6.884874519958984e-06,
+      "loss": 1.6962,
+      "step": 381200
+    },
+    {
+      "epoch": 111.000182,
+      "grad_norm": 1.8222503662109375,
+      "learning_rate": 6.873943452882006e-06,
+      "loss": 1.6917,
+      "step": 381300
+    },
+    {
+      "epoch": 112.000144,
+      "grad_norm": 1.8987947702407837,
+      "learning_rate": 6.863019686978445e-06,
+      "loss": 1.6892,
+      "step": 381400
+    },
+    {
+      "epoch": 113.000106,
+      "grad_norm": 1.8653353452682495,
+      "learning_rate": 6.85210322664838e-06,
+      "loss": 1.6867,
+      "step": 381500
+    },
+    {
+      "epoch": 114.000068,
+      "grad_norm": 1.8713948726654053,
+      "learning_rate": 6.841194076288962e-06,
+      "loss": 1.6777,
+      "step": 381600
+    },
+    {
+      "epoch": 115.00003,
+      "grad_norm": 1.9354687929153442,
+      "learning_rate": 6.830292240294398e-06,
+      "loss": 1.6756,
+      "step": 381700
+    },
+    {
+      "epoch": 115.00023,
+      "grad_norm": 1.8539812564849854,
+      "learning_rate": 6.8193977230559565e-06,
+      "loss": 1.669,
+      "step": 381800
+    },
+    {
+      "epoch": 116.000192,
+      "grad_norm": 1.913901448249817,
+      "learning_rate": 6.808510528961928e-06,
+      "loss": 1.6632,
+      "step": 381900
+    },
+    {
+      "epoch": 117.000154,
+      "grad_norm": 1.8366894721984863,
+      "learning_rate": 6.797630662397683e-06,
+      "loss": 1.6619,
+      "step": 382000
+    },
+    {
+      "epoch": 117.000154,
+      "eval_loss": 2.1981077194213867,
+      "eval_runtime": 54.646,
+      "eval_samples_per_second": 186.546,
+      "eval_steps_per_second": 1.464,
+      "step": 382000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 3.333426940465899e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:04f252a64f6373afbaec36fc31e345451d91b06580ee09a9823282cc3866516c
 size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:a19fa79233fd468fcb689b7b8c5f704161aecb10646540b1133405c7c866d2ff
 size 5777