Upload 10 files

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +1953 -3
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:724be5ec56c8cea0a6bccb0fb0bcec03b849814458eb8b51ff9f3d953d0ed14c
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:a710773cfd7f93749b548b4dc475790d75538b97475d047166dceb50704eb746
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aae36e7eb1c7e8d3c5cc3aa77fc98b6aae23dbfbb8ba5dbcfe46c7087de864d3
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:fedf1b4c8a508947f08f4a98315b58cd6a43e2a1adda4f18d9617c092f6a8844
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d808ac48aeb2285a7d15fe96957631f4317dc7cd8cbbaa8b381b1638da837ef8
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:0412622810efe6fde95b3cfeff4557f637e942d79ee2fa68f136e7ee99e430b1
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ef58d5b955824dfbbc6cf55d8b7019f163372cbafcda9d38b4c7e503714eff0
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:f5967d2fde5e8af8b726d755ee2aea2a1a3996cd4db019463bea602f6a5c353f
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.034,
   "eval_steps": 1000,
-  "global_step": 343000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -26769,6 +26769,1956 @@
       "eval_samples_per_second": 195.089,
       "eval_steps_per_second": 1.531,
       "step": 343000
     }
   ],
   "logging_steps": 100,
@@ -26788,7 +28738,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.993443664874701e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.002796,
   "eval_steps": 1000,
+  "global_step": 368000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 195.089,
       "eval_steps_per_second": 1.531,
       "step": 343000
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 0.8649879693984985,
+      "learning_rate": 1.1537548189140518e-05,
+      "loss": 0.6746,
+      "step": 343100
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 0.8530526161193848,
+      "learning_rate": 1.1524181255002655e-05,
+      "loss": 0.6714,
+      "step": 343200
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.8391575813293457,
+      "learning_rate": 1.1510819748922983e-05,
+      "loss": 0.673,
+      "step": 343300
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.8824005126953125,
+      "learning_rate": 1.149746367628349e-05,
+      "loss": 0.6745,
+      "step": 343400
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 0.9381487965583801,
+      "learning_rate": 1.1484113042464018e-05,
+      "loss": 0.6775,
+      "step": 343500
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.8851874470710754,
+      "learning_rate": 1.1470767852842192e-05,
+      "loss": 0.6714,
+      "step": 343600
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 0.8769415616989136,
+      "learning_rate": 1.1457428112793467e-05,
+      "loss": 0.6649,
+      "step": 343700
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8536527156829834,
+      "learning_rate": 1.1444093827691072e-05,
+      "loss": 0.6689,
+      "step": 343800
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.8344665765762329,
+      "learning_rate": 1.143076500290606e-05,
+      "loss": 0.6714,
+      "step": 343900
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.857262372970581,
+      "learning_rate": 1.141744164380728e-05,
+      "loss": 0.668,
+      "step": 344000
+    },
+    {
+      "epoch": 0.002,
+      "eval_loss": 2.0636377334594727,
+      "eval_runtime": 52.1973,
+      "eval_samples_per_second": 195.297,
+      "eval_steps_per_second": 1.533,
+      "step": 344000
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.9240826964378357,
+      "learning_rate": 1.1404123755761394e-05,
+      "loss": 0.6738,
+      "step": 344100
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.864179790019989,
+      "learning_rate": 1.1390811344132823e-05,
+      "loss": 0.6675,
+      "step": 344200
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.9233891367912292,
+      "learning_rate": 1.1377504414283816e-05,
+      "loss": 0.6683,
+      "step": 344300
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.8253393769264221,
+      "learning_rate": 1.13642029715744e-05,
+      "loss": 0.6724,
+      "step": 344400
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.9402153491973877,
+      "learning_rate": 1.1350907021362409e-05,
+      "loss": 0.6686,
+      "step": 344500
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8452779054641724,
+      "learning_rate": 1.1337616569003425e-05,
+      "loss": 0.6776,
+      "step": 344600
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.8500985503196716,
+      "learning_rate": 1.1324331619850856e-05,
+      "loss": 0.6654,
+      "step": 344700
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.8803905248641968,
+      "learning_rate": 1.1311052179255871e-05,
+      "loss": 0.675,
+      "step": 344800
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.9099257588386536,
+      "learning_rate": 1.1297778252567443e-05,
+      "loss": 0.6569,
+      "step": 344900
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8804642558097839,
+      "learning_rate": 1.1284509845132297e-05,
+      "loss": 0.6655,
+      "step": 345000
+    },
+    {
+      "epoch": 0.004,
+      "eval_loss": 2.05592942237854,
+      "eval_runtime": 51.7883,
+      "eval_samples_per_second": 196.84,
+      "eval_steps_per_second": 1.545,
+      "step": 345000
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.8482286930084229,
+      "learning_rate": 1.1271246962294935e-05,
+      "loss": 0.6641,
+      "step": 345100
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.8636903166770935,
+      "learning_rate": 1.1257989609397654e-05,
+      "loss": 0.6632,
+      "step": 345200
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.8937559723854065,
+      "learning_rate": 1.1244737791780524e-05,
+      "loss": 0.6634,
+      "step": 345300
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8914988040924072,
+      "learning_rate": 1.123149151478136e-05,
+      "loss": 0.6693,
+      "step": 345400
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 1.0172580480575562,
+      "learning_rate": 1.1218250783735765e-05,
+      "loss": 0.6605,
+      "step": 345500
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.9080793857574463,
+      "learning_rate": 1.1205015603977107e-05,
+      "loss": 0.6706,
+      "step": 345600
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 0.8460882306098938,
+      "learning_rate": 1.1191785980836522e-05,
+      "loss": 0.6701,
+      "step": 345700
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8949432373046875,
+      "learning_rate": 1.1178561919642885e-05,
+      "loss": 0.6571,
+      "step": 345800
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.8934834599494934,
+      "learning_rate": 1.1165343425722851e-05,
+      "loss": 0.6621,
+      "step": 345900
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.8950237035751343,
+      "learning_rate": 1.1152130504400834e-05,
+      "loss": 0.6678,
+      "step": 346000
+    },
+    {
+      "epoch": 0.006,
+      "eval_loss": 2.0553648471832275,
+      "eval_runtime": 51.8108,
+      "eval_samples_per_second": 196.754,
+      "eval_steps_per_second": 1.544,
+      "step": 346000
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.9523611068725586,
+      "learning_rate": 1.1138923160999002e-05,
+      "loss": 0.673,
+      "step": 346100
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.874225914478302,
+      "learning_rate": 1.1125721400837255e-05,
+      "loss": 0.6609,
+      "step": 346200
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.9157487750053406,
+      "learning_rate": 1.1112525229233268e-05,
+      "loss": 0.6622,
+      "step": 346300
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.9365401864051819,
+      "learning_rate": 1.1099334651502466e-05,
+      "loss": 0.6603,
+      "step": 346400
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.9212621450424194,
+      "learning_rate": 1.1086149672957993e-05,
+      "loss": 0.6618,
+      "step": 346500
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.9013537168502808,
+      "learning_rate": 1.107297029891077e-05,
+      "loss": 0.6665,
+      "step": 346600
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.8723328709602356,
+      "learning_rate": 1.1059796534669447e-05,
+      "loss": 0.6548,
+      "step": 346700
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.8133809566497803,
+      "learning_rate": 1.1046628385540419e-05,
+      "loss": 0.6352,
+      "step": 346800
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 0.8866004347801208,
+      "learning_rate": 1.1033465856827802e-05,
+      "loss": 0.6679,
+      "step": 346900
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9575750231742859,
+      "learning_rate": 1.1020308953833467e-05,
+      "loss": 0.6658,
+      "step": 347000
+    },
+    {
+      "epoch": 0.008,
+      "eval_loss": 2.0689334869384766,
+      "eval_runtime": 51.6857,
+      "eval_samples_per_second": 197.231,
+      "eval_steps_per_second": 1.548,
+      "step": 347000
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 0.8472666144371033,
+      "learning_rate": 1.100715768185701e-05,
+      "loss": 0.6504,
+      "step": 347100
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.8880901336669922,
+      "learning_rate": 1.0994012046195779e-05,
+      "loss": 0.6706,
+      "step": 347200
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 0.8281514644622803,
+      "learning_rate": 1.0980872052144809e-05,
+      "loss": 0.6514,
+      "step": 347300
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8914335370063782,
+      "learning_rate": 1.09677377049969e-05,
+      "loss": 0.6526,
+      "step": 347400
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 0.9571097493171692,
+      "learning_rate": 1.0954609010042568e-05,
+      "loss": 0.6623,
+      "step": 347500
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 0.9575111865997314,
+      "learning_rate": 1.0941485972570053e-05,
+      "loss": 0.6526,
+      "step": 347600
+    },
+    {
+      "epoch": 0.0094,
+      "grad_norm": 0.7946931719779968,
+      "learning_rate": 1.0928368597865298e-05,
+      "loss": 0.6621,
+      "step": 347700
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.901408851146698,
+      "learning_rate": 1.0915256891211992e-05,
+      "loss": 0.6575,
+      "step": 347800
+    },
+    {
+      "epoch": 0.0098,
+      "grad_norm": 0.8669435977935791,
+      "learning_rate": 1.0902150857891532e-05,
+      "loss": 0.6603,
+      "step": 347900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8946738243103027,
+      "learning_rate": 1.0889050503183016e-05,
+      "loss": 0.6667,
+      "step": 348000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.0592565536499023,
+      "eval_runtime": 51.912,
+      "eval_samples_per_second": 196.371,
+      "eval_steps_per_second": 1.541,
+      "step": 348000
+    },
+    {
+      "epoch": 0.0102,
+      "grad_norm": 0.8748307228088379,
+      "learning_rate": 1.0875955832363266e-05,
+      "loss": 0.6613,
+      "step": 348100
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.846490740776062,
+      "learning_rate": 1.0862866850706818e-05,
+      "loss": 0.6577,
+      "step": 348200
+    },
+    {
+      "epoch": 0.0106,
+      "grad_norm": 0.860930323600769,
+      "learning_rate": 1.0849783563485921e-05,
+      "loss": 0.6552,
+      "step": 348300
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 0.8625341653823853,
+      "learning_rate": 1.0836705975970504e-05,
+      "loss": 0.6437,
+      "step": 348400
+    },
+    {
+      "epoch": 0.011,
+      "grad_norm": 0.8479413986206055,
+      "learning_rate": 1.0823634093428226e-05,
+      "loss": 0.664,
+      "step": 348500
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9355835914611816,
+      "learning_rate": 1.0810567921124436e-05,
+      "loss": 0.6606,
+      "step": 348600
+    },
+    {
+      "epoch": 0.0114,
+      "grad_norm": 0.9027217626571655,
+      "learning_rate": 1.0797507464322203e-05,
+      "loss": 0.6509,
+      "step": 348700
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 0.8765237927436829,
+      "learning_rate": 1.0784452728282257e-05,
+      "loss": 0.6564,
+      "step": 348800
+    },
+    {
+      "epoch": 0.0118,
+      "grad_norm": 0.9060245156288147,
+      "learning_rate": 1.0771403718263051e-05,
+      "loss": 0.6555,
+      "step": 348900
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.9202615022659302,
+      "learning_rate": 1.0758360439520727e-05,
+      "loss": 0.6522,
+      "step": 349000
+    },
+    {
+      "epoch": 0.012,
+      "eval_loss": 2.057035207748413,
+      "eval_runtime": 51.8702,
+      "eval_samples_per_second": 196.529,
+      "eval_steps_per_second": 1.542,
+      "step": 349000
+    },
+    {
+      "epoch": 0.0122,
+      "grad_norm": 0.8476743102073669,
+      "learning_rate": 1.0745322897309124e-05,
+      "loss": 0.6623,
+      "step": 349100
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 0.9493403434753418,
+      "learning_rate": 1.073229109687974e-05,
+      "loss": 0.6697,
+      "step": 349200
+    },
+    {
+      "epoch": 0.0126,
+      "grad_norm": 0.8388432860374451,
+      "learning_rate": 1.07192650434818e-05,
+      "loss": 0.6494,
+      "step": 349300
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9042513966560364,
+      "learning_rate": 1.0706244742362192e-05,
+      "loss": 0.6473,
+      "step": 349400
+    },
+    {
+      "epoch": 0.013,
+      "grad_norm": 0.8294413089752197,
+      "learning_rate": 1.06932301987655e-05,
+      "loss": 0.6652,
+      "step": 349500
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 0.9279148578643799,
+      "learning_rate": 1.0680221417933963e-05,
+      "loss": 0.6506,
+      "step": 349600
+    },
+    {
+      "epoch": 0.0134,
+      "grad_norm": 0.8778104782104492,
+      "learning_rate": 1.066721840510753e-05,
+      "loss": 0.663,
+      "step": 349700
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.8701128959655762,
+      "learning_rate": 1.0654221165523817e-05,
+      "loss": 0.6605,
+      "step": 349800
+    },
+    {
+      "epoch": 0.0138,
+      "grad_norm": 0.9396702647209167,
+      "learning_rate": 1.0641229704418093e-05,
+      "loss": 0.658,
+      "step": 349900
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 0.891123354434967,
+      "learning_rate": 1.0628244027023329e-05,
+      "loss": 0.6186,
+      "step": 350000
+    },
+    {
+      "epoch": 0.014,
+      "eval_loss": 2.059767961502075,
+      "eval_runtime": 51.9881,
+      "eval_samples_per_second": 196.083,
+      "eval_steps_per_second": 1.539,
+      "step": 350000
+    },
+    {
+      "epoch": 0.0142,
+      "grad_norm": 0.8995864391326904,
+      "learning_rate": 1.061526413857015e-05,
+      "loss": 0.6545,
+      "step": 350100
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8432427048683167,
+      "learning_rate": 1.0602290044286866e-05,
+      "loss": 0.6527,
+      "step": 350200
+    },
+    {
+      "epoch": 0.0146,
+      "grad_norm": 0.8539645671844482,
+      "learning_rate": 1.058932174939942e-05,
+      "loss": 0.66,
+      "step": 350300
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 0.8698434233665466,
+      "learning_rate": 1.0576359259131452e-05,
+      "loss": 0.6686,
+      "step": 350400
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.8616706728935242,
+      "learning_rate": 1.0563402578704248e-05,
+      "loss": 0.6605,
+      "step": 350500
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.891680121421814,
+      "learning_rate": 1.0550451713336768e-05,
+      "loss": 0.6471,
+      "step": 350600
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 0.9290798306465149,
+      "learning_rate": 1.05375066682456e-05,
+      "loss": 0.6575,
+      "step": 350700
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.8489027619361877,
+      "learning_rate": 1.0524567448645018e-05,
+      "loss": 0.6484,
+      "step": 350800
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 0.8927240371704102,
+      "learning_rate": 1.0511634059746935e-05,
+      "loss": 0.6637,
+      "step": 350900
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8975149393081665,
+      "learning_rate": 1.0498706506760933e-05,
+      "loss": 0.6729,
+      "step": 351000
+    },
+    {
+      "epoch": 0.016,
+      "eval_loss": 2.0625927448272705,
+      "eval_runtime": 52.1361,
+      "eval_samples_per_second": 195.527,
+      "eval_steps_per_second": 1.534,
+      "step": 351000
+    },
+    {
+      "epoch": 0.0162,
+      "grad_norm": 0.8605362176895142,
+      "learning_rate": 1.0485784794894205e-05,
+      "loss": 0.6494,
+      "step": 351100
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 0.9211152791976929,
+      "learning_rate": 1.0472868929351622e-05,
+      "loss": 0.6661,
+      "step": 351200
+    },
+    {
+      "epoch": 0.0166,
+      "grad_norm": 0.9342173337936401,
+      "learning_rate": 1.045995891533571e-05,
+      "loss": 0.6567,
+      "step": 351300
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.9137123227119446,
+      "learning_rate": 1.0447054758046598e-05,
+      "loss": 0.6396,
+      "step": 351400
+    },
+    {
+      "epoch": 0.017,
+      "grad_norm": 0.9604211449623108,
+      "learning_rate": 1.043415646268209e-05,
+      "loss": 0.6496,
+      "step": 351500
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 0.8666329979896545,
+      "learning_rate": 1.0421264034437616e-05,
+      "loss": 0.664,
+      "step": 351600
+    },
+    {
+      "epoch": 0.0174,
+      "grad_norm": 0.86720871925354,
+      "learning_rate": 1.0408377478506253e-05,
+      "loss": 0.657,
+      "step": 351700
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.9042288064956665,
+      "learning_rate": 1.0395496800078692e-05,
+      "loss": 0.6564,
+      "step": 351800
+    },
+    {
+      "epoch": 0.0178,
+      "grad_norm": 0.9693347811698914,
+      "learning_rate": 1.038262200434327e-05,
+      "loss": 0.644,
+      "step": 351900
+    },
+    {
+      "epoch": 0.018,
+      "grad_norm": 0.8999383449554443,
+      "learning_rate": 1.0369753096485957e-05,
+      "loss": 0.6534,
+      "step": 352000
+    },
+    {
+      "epoch": 0.018,
+      "eval_loss": 2.0669960975646973,
+      "eval_runtime": 52.2938,
+      "eval_samples_per_second": 194.937,
+      "eval_steps_per_second": 1.53,
+      "step": 352000
+    },
+    {
+      "epoch": 0.0182,
+      "grad_norm": 0.907943844795227,
+      "learning_rate": 1.0356890081690356e-05,
+      "loss": 0.6459,
+      "step": 352100
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.866569995880127,
+      "learning_rate": 1.034403296513767e-05,
+      "loss": 0.6519,
+      "step": 352200
+    },
+    {
+      "epoch": 0.0186,
+      "grad_norm": 0.904236376285553,
+      "learning_rate": 1.0331181752006755e-05,
+      "loss": 0.6554,
+      "step": 352300
+    },
+    {
+      "epoch": 0.0188,
+      "grad_norm": 0.9165827035903931,
+      "learning_rate": 1.0318336447474075e-05,
+      "loss": 0.6773,
+      "step": 352400
+    },
+    {
+      "epoch": 0.019,
+      "grad_norm": 0.8540114164352417,
+      "learning_rate": 1.0305497056713726e-05,
+      "loss": 0.6529,
+      "step": 352500
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9309752583503723,
+      "learning_rate": 1.0292663584897396e-05,
+      "loss": 0.6535,
+      "step": 352600
+    },
+    {
+      "epoch": 0.0194,
+      "grad_norm": 0.8861046433448792,
+      "learning_rate": 1.0279836037194417e-05,
+      "loss": 0.6607,
+      "step": 352700
+    },
+    {
+      "epoch": 0.0196,
+      "grad_norm": 0.9103682637214661,
+      "learning_rate": 1.026701441877173e-05,
+      "loss": 0.6708,
+      "step": 352800
+    },
+    {
+      "epoch": 0.0198,
+      "grad_norm": 0.9763253927230835,
+      "learning_rate": 1.0254198734793865e-05,
+      "loss": 0.6319,
+      "step": 352900
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8923797011375427,
+      "learning_rate": 1.0241388990422986e-05,
+      "loss": 0.6605,
+      "step": 353000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 2.066145658493042,
+      "eval_runtime": 52.3003,
+      "eval_samples_per_second": 194.913,
+      "eval_steps_per_second": 1.53,
+      "step": 353000
+    },
+    {
+      "epoch": 0.0202,
+      "grad_norm": 0.8869938850402832,
+      "learning_rate": 1.0228585190818857e-05,
+      "loss": 0.6594,
+      "step": 353100
+    },
+    {
+      "epoch": 0.0204,
+      "grad_norm": 0.8605444431304932,
+      "learning_rate": 1.0215787341138854e-05,
+      "loss": 0.664,
+      "step": 353200
+    },
+    {
+      "epoch": 0.0206,
+      "grad_norm": 1.001497745513916,
+      "learning_rate": 1.0202995446537933e-05,
+      "loss": 0.6574,
+      "step": 353300
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.8902758359909058,
+      "learning_rate": 1.0190209512168677e-05,
+      "loss": 0.6536,
+      "step": 353400
+    },
+    {
+      "epoch": 0.021,
+      "grad_norm": 0.9075655341148376,
+      "learning_rate": 1.017742954318127e-05,
+      "loss": 0.6545,
+      "step": 353500
+    },
+    {
+      "epoch": 0.0212,
+      "grad_norm": 0.9329447746276855,
+      "learning_rate": 1.016465554472346e-05,
+      "loss": 0.6589,
+      "step": 353600
+    },
+    {
+      "epoch": 0.0214,
+      "grad_norm": 0.8853082656860352,
+      "learning_rate": 1.0151887521940628e-05,
+      "loss": 0.6532,
+      "step": 353700
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.8958137631416321,
+      "learning_rate": 1.0139125479975722e-05,
+      "loss": 0.6563,
+      "step": 353800
+    },
+    {
+      "epoch": 0.0218,
+      "grad_norm": 0.865190863609314,
+      "learning_rate": 1.0126369423969293e-05,
+      "loss": 0.6585,
+      "step": 353900
+    },
+    {
+      "epoch": 0.022,
+      "grad_norm": 0.9948294162750244,
+      "learning_rate": 1.0113619359059482e-05,
+      "loss": 0.65,
+      "step": 354000
+    },
+    {
+      "epoch": 0.022,
+      "eval_loss": 2.085937976837158,
+      "eval_runtime": 52.093,
+      "eval_samples_per_second": 195.689,
+      "eval_steps_per_second": 1.536,
+      "step": 354000
+    },
+    {
+      "epoch": 0.0222,
+      "grad_norm": 0.9526733160018921,
+      "learning_rate": 1.0100875290382022e-05,
+      "loss": 0.6509,
+      "step": 354100
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8897534608840942,
+      "learning_rate": 1.0088137223070205e-05,
+      "loss": 0.6609,
+      "step": 354200
+    },
+    {
+      "epoch": 0.0226,
+      "grad_norm": 0.8177494406700134,
+      "learning_rate": 1.007540516225493e-05,
+      "loss": 0.6531,
+      "step": 354300
+    },
+    {
+      "epoch": 0.0228,
+      "grad_norm": 0.9328579306602478,
+      "learning_rate": 1.006267911306468e-05,
+      "loss": 0.7497,
+      "step": 354400
+    },
+    {
+      "epoch": 0.023,
+      "grad_norm": 0.8657885193824768,
+      "learning_rate": 1.004995908062549e-05,
+      "loss": 0.7346,
+      "step": 354500
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.8872801661491394,
+      "learning_rate": 1.0037245070060991e-05,
+      "loss": 0.7475,
+      "step": 354600
+    },
+    {
+      "epoch": 0.0234,
+      "grad_norm": 0.8421425223350525,
+      "learning_rate": 1.002453708649239e-05,
+      "loss": 0.7338,
+      "step": 354700
+    },
+    {
+      "epoch": 0.0236,
+      "grad_norm": 0.8456546068191528,
+      "learning_rate": 1.0011835135038469e-05,
+      "loss": 0.7163,
+      "step": 354800
+    },
+    {
+      "epoch": 0.0238,
+      "grad_norm": 0.9232527613639832,
+      "learning_rate": 9.999139220815554e-06,
+      "loss": 0.715,
+      "step": 354900
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.8569039702415466,
+      "learning_rate": 9.986449348937568e-06,
+      "loss": 0.7392,
+      "step": 355000
+    },
+    {
+      "epoch": 0.024,
+      "eval_loss": 2.056723117828369,
+      "eval_runtime": 52.2992,
+      "eval_samples_per_second": 194.917,
+      "eval_steps_per_second": 1.53,
+      "step": 355000
+    },
+    {
+      "epoch": 0.0242,
+      "grad_norm": 0.8463347554206848,
+      "learning_rate": 9.973765524515988e-06,
+      "loss": 0.719,
+      "step": 355100
+    },
+    {
+      "epoch": 0.0244,
+      "grad_norm": 0.9859148263931274,
+      "learning_rate": 9.961087752659866e-06,
+      "loss": 0.7161,
+      "step": 355200
+    },
+    {
+      "epoch": 0.0246,
+      "grad_norm": 0.8795856833457947,
+      "learning_rate": 9.94841603847579e-06,
+      "loss": 0.7211,
+      "step": 355300
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.8623588681221008,
+      "learning_rate": 9.935750387067935e-06,
+      "loss": 0.7134,
+      "step": 355400
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.8915929794311523,
+      "learning_rate": 9.923090803538021e-06,
+      "loss": 0.718,
+      "step": 355500
+    },
+    {
+      "epoch": 0.0252,
+      "grad_norm": 0.9230467081069946,
+      "learning_rate": 9.91043729298534e-06,
+      "loss": 0.7092,
+      "step": 355600
+    },
+    {
+      "epoch": 0.0254,
+      "grad_norm": 0.9159933924674988,
+      "learning_rate": 9.8977898605067e-06,
+      "loss": 0.7139,
+      "step": 355700
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.0485515594482422,
+      "learning_rate": 9.885148511196502e-06,
+      "loss": 0.7071,
+      "step": 355800
+    },
+    {
+      "epoch": 0.0258,
+      "grad_norm": 0.8589327335357666,
+      "learning_rate": 9.872513250146681e-06,
+      "loss": 0.7102,
+      "step": 355900
+    },
+    {
+      "epoch": 0.026,
+      "grad_norm": 0.9215981960296631,
+      "learning_rate": 9.859884082446707e-06,
+      "loss": 0.6789,
+      "step": 356000
+    },
+    {
+      "epoch": 0.026,
+      "eval_loss": 2.081296920776367,
+      "eval_runtime": 52.2111,
+      "eval_samples_per_second": 195.246,
+      "eval_steps_per_second": 1.532,
+      "step": 356000
+    },
+    {
+      "epoch": 0.0262,
+      "grad_norm": 0.8868950605392456,
+      "learning_rate": 9.847261013183615e-06,
+      "loss": 0.6801,
+      "step": 356100
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.9825394749641418,
+      "learning_rate": 9.834644047441974e-06,
+      "loss": 0.6582,
+      "step": 356200
+    },
+    {
+      "epoch": 0.0266,
+      "grad_norm": 0.8572143316268921,
+      "learning_rate": 9.822033190303906e-06,
+      "loss": 0.6731,
+      "step": 356300
+    },
+    {
+      "epoch": 0.0268,
+      "grad_norm": 0.8867204785346985,
+      "learning_rate": 9.809428446849044e-06,
+      "loss": 0.6634,
+      "step": 356400
+    },
+    {
+      "epoch": 0.027,
+      "grad_norm": 0.8682609796524048,
+      "learning_rate": 9.796829822154589e-06,
+      "loss": 0.6678,
+      "step": 356500
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.8932370543479919,
+      "learning_rate": 9.784237321295262e-06,
+      "loss": 0.6707,
+      "step": 356600
+    },
+    {
+      "epoch": 0.0274,
+      "grad_norm": 0.860748291015625,
+      "learning_rate": 9.771650949343331e-06,
+      "loss": 0.6604,
+      "step": 356700
+    },
+    {
+      "epoch": 0.0276,
+      "grad_norm": 0.8779944181442261,
+      "learning_rate": 9.759070711368568e-06,
+      "loss": 0.6639,
+      "step": 356800
+    },
+    {
+      "epoch": 0.0278,
+      "grad_norm": 0.9277738928794861,
+      "learning_rate": 9.746496612438299e-06,
+      "loss": 0.6617,
+      "step": 356900
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.8405406475067139,
+      "learning_rate": 9.733928657617373e-06,
+      "loss": 0.6663,
+      "step": 357000
+    },
+    {
+      "epoch": 0.028,
+      "eval_loss": 2.0634403228759766,
+      "eval_runtime": 52.3193,
+      "eval_samples_per_second": 194.842,
+      "eval_steps_per_second": 1.529,
+      "step": 357000
+    },
+    {
+      "epoch": 0.0282,
+      "grad_norm": 0.8827060461044312,
+      "learning_rate": 9.721366851968165e-06,
+      "loss": 0.6748,
+      "step": 357100
+    },
+    {
+      "epoch": 0.0284,
+      "grad_norm": 0.908746063709259,
+      "learning_rate": 9.708811200550552e-06,
+      "loss": 0.6614,
+      "step": 357200
+    },
+    {
+      "epoch": 0.0286,
+      "grad_norm": 0.8800754547119141,
+      "learning_rate": 9.69626170842196e-06,
+      "loss": 0.6661,
+      "step": 357300
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.9010385870933533,
+      "learning_rate": 9.68371838063733e-06,
+      "loss": 0.6466,
+      "step": 357400
+    },
+    {
+      "epoch": 0.029,
+      "grad_norm": 0.868073046207428,
+      "learning_rate": 9.671181222249099e-06,
+      "loss": 0.6561,
+      "step": 357500
+    },
+    {
+      "epoch": 0.0292,
+      "grad_norm": 0.982118546962738,
+      "learning_rate": 9.658650238307235e-06,
+      "loss": 0.6696,
+      "step": 357600
+    },
+    {
+      "epoch": 0.0294,
+      "grad_norm": 0.832084059715271,
+      "learning_rate": 9.646125433859221e-06,
+      "loss": 0.6513,
+      "step": 357700
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.9348160028457642,
+      "learning_rate": 9.633606813950055e-06,
+      "loss": 0.6558,
+      "step": 357800
+    },
+    {
+      "epoch": 0.0298,
+      "grad_norm": 0.8417104482650757,
+      "learning_rate": 9.621094383622217e-06,
+      "loss": 0.6621,
+      "step": 357900
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.8583792448043823,
+      "learning_rate": 9.608588147915726e-06,
+      "loss": 0.6572,
+      "step": 358000
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 2.086122512817383,
+      "eval_runtime": 52.2197,
+      "eval_samples_per_second": 195.214,
+      "eval_steps_per_second": 1.532,
+      "step": 358000
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 0.8814049959182739,
+      "learning_rate": 9.596088111868085e-06,
+      "loss": 0.653,
+      "step": 358100
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 0.8665258288383484,
+      "learning_rate": 9.583594280514318e-06,
+      "loss": 0.6518,
+      "step": 358200
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.9076094627380371,
+      "learning_rate": 9.571106658886925e-06,
+      "loss": 0.6583,
+      "step": 358300
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.9470544457435608,
+      "learning_rate": 9.558625252015924e-06,
+      "loss": 0.6539,
+      "step": 358400
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 0.9310306310653687,
+      "learning_rate": 9.546150064928824e-06,
+      "loss": 0.661,
+      "step": 358500
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.8882910013198853,
+      "learning_rate": 9.53368110265064e-06,
+      "loss": 0.6644,
+      "step": 358600
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 0.912969172000885,
+      "learning_rate": 9.52121837020385e-06,
+      "loss": 0.6477,
+      "step": 358700
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9159826040267944,
+      "learning_rate": 9.50876187260845e-06,
+      "loss": 0.6581,
+      "step": 358800
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.8334347605705261,
+      "learning_rate": 9.49631161488192e-06,
+      "loss": 0.6605,
+      "step": 358900
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.9216808676719666,
+      "learning_rate": 9.483867602039212e-06,
+      "loss": 0.6609,
+      "step": 359000
+    },
+    {
+      "epoch": 0.002,
+      "eval_loss": 2.071388006210327,
+      "eval_runtime": 52.0422,
+      "eval_samples_per_second": 195.879,
+      "eval_steps_per_second": 1.537,
+      "step": 359000
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.9010413289070129,
+      "learning_rate": 9.471429839092777e-06,
+      "loss": 0.6428,
+      "step": 359100
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.8659740686416626,
+      "learning_rate": 9.458998331052546e-06,
+      "loss": 0.6462,
+      "step": 359200
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.9039402604103088,
+      "learning_rate": 9.446573082925938e-06,
+      "loss": 0.6413,
+      "step": 359300
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.9015378952026367,
+      "learning_rate": 9.434154099717824e-06,
+      "loss": 0.6521,
+      "step": 359400
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.8885050415992737,
+      "learning_rate": 9.421741386430575e-06,
+      "loss": 0.647,
+      "step": 359500
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8669450879096985,
+      "learning_rate": 9.409334948064033e-06,
+      "loss": 0.6564,
+      "step": 359600
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.9445268511772156,
+      "learning_rate": 9.396934789615519e-06,
+      "loss": 0.6683,
+      "step": 359700
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.8911668062210083,
+      "learning_rate": 9.384540916079798e-06,
+      "loss": 0.6713,
+      "step": 359800
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.8700185418128967,
+      "learning_rate": 9.372153332449127e-06,
+      "loss": 0.6621,
+      "step": 359900
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8949635028839111,
+      "learning_rate": 9.359772043713226e-06,
+      "loss": 0.6468,
+      "step": 360000
+    },
+    {
+      "epoch": 0.004,
+      "eval_loss": 2.0606133937835693,
+      "eval_runtime": 51.5712,
+      "eval_samples_per_second": 197.668,
+      "eval_steps_per_second": 1.551,
+      "step": 360000
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.875957190990448,
+      "learning_rate": 9.347397054859283e-06,
+      "loss": 0.6823,
+      "step": 360100
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.8829663395881653,
+      "learning_rate": 9.335028370871925e-06,
+      "loss": 0.6758,
+      "step": 360200
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.8770716786384583,
+      "learning_rate": 9.322665996733268e-06,
+      "loss": 0.6601,
+      "step": 360300
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9599934220314026,
+      "learning_rate": 9.310309937422873e-06,
+      "loss": 0.666,
+      "step": 360400
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.8904752135276794,
+      "learning_rate": 9.297960197917766e-06,
+      "loss": 0.662,
+      "step": 360500
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.9215303659439087,
+      "learning_rate": 9.285616783192404e-06,
+      "loss": 0.6637,
+      "step": 360600
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 0.9662516117095947,
+      "learning_rate": 9.273279698218726e-06,
+      "loss": 0.6735,
+      "step": 360700
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.9039230346679688,
+      "learning_rate": 9.260948947966111e-06,
+      "loss": 0.682,
+      "step": 360800
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.914978563785553,
+      "learning_rate": 9.248624537401368e-06,
+      "loss": 0.6691,
+      "step": 360900
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.8637982606887817,
+      "learning_rate": 9.236306471488779e-06,
+      "loss": 0.6775,
+      "step": 361000
+    },
+    {
+      "epoch": 0.006,
+      "eval_loss": 2.0751538276672363,
+      "eval_runtime": 51.7366,
+      "eval_samples_per_second": 197.037,
+      "eval_steps_per_second": 1.546,
+      "step": 361000
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.8795140981674194,
+      "learning_rate": 9.223994755190058e-06,
+      "loss": 0.683,
+      "step": 361100
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9144249558448792,
+      "learning_rate": 9.21168939346437e-06,
+      "loss": 0.7081,
+      "step": 361200
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.8885230422019958,
+      "learning_rate": 9.199390391268301e-06,
+      "loss": 0.6968,
+      "step": 361300
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.8315828442573547,
+      "learning_rate": 9.18709775355589e-06,
+      "loss": 0.6809,
+      "step": 361400
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.8375496864318848,
+      "learning_rate": 9.174811485278614e-06,
+      "loss": 0.686,
+      "step": 361500
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.9053453207015991,
+      "learning_rate": 9.162531591385387e-06,
+      "loss": 0.6921,
+      "step": 361600
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.8914540410041809,
+      "learning_rate": 9.150258076822535e-06,
+      "loss": 0.6832,
+      "step": 361700
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.8982157707214355,
+      "learning_rate": 9.13799094653383e-06,
+      "loss": 0.6969,
+      "step": 361800
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 1.0123343467712402,
+      "learning_rate": 9.125730205460478e-06,
+      "loss": 0.6915,
+      "step": 361900
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.904523491859436,
+      "learning_rate": 9.113475858541118e-06,
+      "loss": 0.6884,
+      "step": 362000
+    },
+    {
+      "epoch": 0.008,
+      "eval_loss": 2.0824785232543945,
+      "eval_runtime": 51.6588,
+      "eval_samples_per_second": 197.333,
+      "eval_steps_per_second": 1.549,
+      "step": 362000
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 0.8671389818191528,
+      "learning_rate": 9.101227910711765e-06,
+      "loss": 0.706,
+      "step": 362100
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.8754188418388367,
+      "learning_rate": 9.088986366905908e-06,
+      "loss": 0.6918,
+      "step": 362200
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 0.8821722865104675,
+      "learning_rate": 9.076751232054439e-06,
+      "loss": 0.6902,
+      "step": 362300
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8519936800003052,
+      "learning_rate": 9.064522511085677e-06,
+      "loss": 0.6897,
+      "step": 362400
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 0.9249884486198425,
+      "learning_rate": 9.052300208925335e-06,
+      "loss": 0.6762,
+      "step": 362500
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 0.9254834651947021,
+      "learning_rate": 9.040084330496562e-06,
+      "loss": 0.6836,
+      "step": 362600
+    },
+    {
+      "epoch": 0.0094,
+      "grad_norm": 0.907455325126648,
+      "learning_rate": 9.027874880719911e-06,
+      "loss": 0.6816,
+      "step": 362700
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8891639709472656,
+      "learning_rate": 9.015671864513356e-06,
+      "loss": 0.6493,
+      "step": 362800
+    },
+    {
+      "epoch": 0.0098,
+      "grad_norm": 0.9093591570854187,
+      "learning_rate": 9.003475286792257e-06,
+      "loss": 0.659,
+      "step": 362900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8426594138145447,
+      "learning_rate": 8.991285152469395e-06,
+      "loss": 0.6498,
+      "step": 363000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.0885329246520996,
+      "eval_runtime": 51.6994,
+      "eval_samples_per_second": 197.178,
+      "eval_steps_per_second": 1.547,
+      "step": 363000
+    },
+    {
+      "epoch": 0.0102,
+      "grad_norm": 0.9149935245513916,
+      "learning_rate": 8.979101466454962e-06,
+      "loss": 0.6595,
+      "step": 363100
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.893366277217865,
+      "learning_rate": 8.966924233656552e-06,
+      "loss": 0.6622,
+      "step": 363200
+    },
+    {
+      "epoch": 0.0106,
+      "grad_norm": 0.8946834206581116,
+      "learning_rate": 8.954753458979132e-06,
+      "loss": 0.6639,
+      "step": 363300
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 0.8848134279251099,
+      "learning_rate": 8.9425891473251e-06,
+      "loss": 0.6623,
+      "step": 363400
+    },
+    {
+      "epoch": 0.011,
+      "grad_norm": 0.8674115538597107,
+      "learning_rate": 8.93043130359425e-06,
+      "loss": 0.6483,
+      "step": 363500
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8136773109436035,
+      "learning_rate": 8.91827993268374e-06,
+      "loss": 0.6598,
+      "step": 363600
+    },
+    {
+      "epoch": 0.0114,
+      "grad_norm": 0.9210416674613953,
+      "learning_rate": 8.906135039488148e-06,
+      "loss": 0.6427,
+      "step": 363700
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 0.8708541393280029,
+      "learning_rate": 8.89399662889944e-06,
+      "loss": 0.6523,
+      "step": 363800
+    },
+    {
+      "epoch": 0.0118,
+      "grad_norm": 0.8490440845489502,
+      "learning_rate": 8.881864705806971e-06,
+      "loss": 0.6571,
+      "step": 363900
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.8714786767959595,
+      "learning_rate": 8.869739275097464e-06,
+      "loss": 0.6535,
+      "step": 364000
+    },
+    {
+      "epoch": 0.012,
+      "eval_loss": 2.0917515754699707,
+      "eval_runtime": 51.7459,
+      "eval_samples_per_second": 197.001,
+      "eval_steps_per_second": 1.546,
+      "step": 364000
+    },
+    {
+      "epoch": 0.0122,
+      "grad_norm": 0.8995687961578369,
+      "learning_rate": 8.857620341655045e-06,
+      "loss": 0.6561,
+      "step": 364100
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 0.9087790846824646,
+      "learning_rate": 8.845507910361223e-06,
+      "loss": 0.6506,
+      "step": 364200
+    },
+    {
+      "epoch": 0.0126,
+      "grad_norm": 0.9006063342094421,
+      "learning_rate": 8.833401986094893e-06,
+      "loss": 0.6628,
+      "step": 364300
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9575886726379395,
+      "learning_rate": 8.821302573732302e-06,
+      "loss": 0.6563,
+      "step": 364400
+    },
+    {
+      "epoch": 0.013,
+      "grad_norm": 0.8845739960670471,
+      "learning_rate": 8.809209678147095e-06,
+      "loss": 0.649,
+      "step": 364500
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 0.8682934641838074,
+      "learning_rate": 8.797123304210298e-06,
+      "loss": 0.6513,
+      "step": 364600
+    },
+    {
+      "epoch": 0.0134,
+      "grad_norm": 0.8966580033302307,
+      "learning_rate": 8.785043456790302e-06,
+      "loss": 0.6443,
+      "step": 364700
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.8867930769920349,
+      "learning_rate": 8.772970140752854e-06,
+      "loss": 0.6473,
+      "step": 364800
+    },
+    {
+      "epoch": 0.0138,
+      "grad_norm": 0.8712829351425171,
+      "learning_rate": 8.760903360961096e-06,
+      "loss": 0.6428,
+      "step": 364900
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 0.8830559253692627,
+      "learning_rate": 8.748843122275519e-06,
+      "loss": 0.657,
+      "step": 365000
+    },
+    {
+      "epoch": 0.014,
+      "eval_loss": 2.077829122543335,
+      "eval_runtime": 51.6249,
+      "eval_samples_per_second": 197.463,
+      "eval_steps_per_second": 1.55,
+      "step": 365000
+    },
+    {
+      "epoch": 0.0142,
+      "grad_norm": 0.9168245792388916,
+      "learning_rate": 8.736789429553998e-06,
+      "loss": 0.6542,
+      "step": 365100
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9041379690170288,
+      "learning_rate": 8.724742287651741e-06,
+      "loss": 0.6422,
+      "step": 365200
+    },
+    {
+      "epoch": 0.0146,
+      "grad_norm": 0.8760838508605957,
+      "learning_rate": 8.712701701421344e-06,
+      "loss": 0.6532,
+      "step": 365300
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 0.8739610910415649,
+      "learning_rate": 8.700667675712764e-06,
+      "loss": 0.6485,
+      "step": 365400
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.9175285696983337,
+      "learning_rate": 8.688640215373287e-06,
+      "loss": 0.6433,
+      "step": 365500
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.8679957985877991,
+      "learning_rate": 8.676619325247578e-06,
+      "loss": 0.627,
+      "step": 365600
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 0.9219822287559509,
+      "learning_rate": 8.664605010177653e-06,
+      "loss": 0.6342,
+      "step": 365700
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.8707392811775208,
+      "learning_rate": 8.652597275002888e-06,
+      "loss": 0.6441,
+      "step": 365800
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 0.8975892663002014,
+      "learning_rate": 8.640596124559975e-06,
+      "loss": 0.6119,
+      "step": 365900
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8921619057655334,
+      "learning_rate": 8.628601563682986e-06,
+      "loss": 0.6493,
+      "step": 366000
+    },
+    {
+      "epoch": 0.016,
+      "eval_loss": 2.0901429653167725,
+      "eval_runtime": 51.9763,
+      "eval_samples_per_second": 196.128,
+      "eval_steps_per_second": 1.539,
+      "step": 366000
+    },
+    {
+      "epoch": 0.0162,
+      "grad_norm": 0.9101726412773132,
+      "learning_rate": 8.616613597203333e-06,
+      "loss": 0.6456,
+      "step": 366100
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 0.9642266035079956,
+      "learning_rate": 8.604632229949768e-06,
+      "loss": 0.6411,
+      "step": 366200
+    },
+    {
+      "epoch": 0.0166,
+      "grad_norm": 0.8600582480430603,
+      "learning_rate": 8.592657466748372e-06,
+      "loss": 0.635,
+      "step": 366300
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.9204874038696289,
+      "learning_rate": 8.580689312422587e-06,
+      "loss": 0.6456,
+      "step": 366400
+    },
+    {
+      "epoch": 0.017,
+      "grad_norm": 0.857318103313446,
+      "learning_rate": 8.568727771793186e-06,
+      "loss": 0.6385,
+      "step": 366500
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 0.9361177682876587,
+      "learning_rate": 8.55677284967828e-06,
+      "loss": 0.6299,
+      "step": 366600
+    },
+    {
+      "epoch": 1.000196,
+      "grad_norm": 0.9187692999839783,
+      "learning_rate": 8.544824550893294e-06,
+      "loss": 0.6425,
+      "step": 366700
+    },
+    {
+      "epoch": 1.000396,
+      "grad_norm": 0.8672967553138733,
+      "learning_rate": 8.532882880251011e-06,
+      "loss": 0.6341,
+      "step": 366800
+    },
+    {
+      "epoch": 1.000596,
+      "grad_norm": 0.888131320476532,
+      "learning_rate": 8.520947842561544e-06,
+      "loss": 0.6451,
+      "step": 366900
+    },
+    {
+      "epoch": 1.000796,
+      "grad_norm": 0.8518761992454529,
+      "learning_rate": 8.509019442632308e-06,
+      "loss": 0.637,
+      "step": 367000
+    },
+    {
+      "epoch": 1.000796,
+      "eval_loss": 2.082726240158081,
+      "eval_runtime": 51.6098,
+      "eval_samples_per_second": 197.521,
+      "eval_steps_per_second": 1.55,
+      "step": 367000
+    },
+    {
+      "epoch": 1.000996,
+      "grad_norm": 0.9279243350028992,
+      "learning_rate": 8.497097685268068e-06,
+      "loss": 0.6471,
+      "step": 367100
+    },
+    {
+      "epoch": 1.001196,
+      "grad_norm": 0.9042778611183167,
+      "learning_rate": 8.485182575270905e-06,
+      "loss": 0.6494,
+      "step": 367200
+    },
+    {
+      "epoch": 1.001396,
+      "grad_norm": 0.9116953611373901,
+      "learning_rate": 8.473274117440235e-06,
+      "loss": 0.6333,
+      "step": 367300
+    },
+    {
+      "epoch": 1.001596,
+      "grad_norm": 0.9247483611106873,
+      "learning_rate": 8.461372316572765e-06,
+      "loss": 0.6432,
+      "step": 367400
+    },
+    {
+      "epoch": 1.001796,
+      "grad_norm": 0.8390426635742188,
+      "learning_rate": 8.44947717746255e-06,
+      "loss": 0.6492,
+      "step": 367500
+    },
+    {
+      "epoch": 1.001996,
+      "grad_norm": 0.8003919720649719,
+      "learning_rate": 8.437588704900948e-06,
+      "loss": 0.6472,
+      "step": 367600
+    },
+    {
+      "epoch": 1.002196,
+      "grad_norm": 0.8807201981544495,
+      "learning_rate": 8.425706903676645e-06,
+      "loss": 0.6338,
+      "step": 367700
+    },
+    {
+      "epoch": 1.002396,
+      "grad_norm": 0.8409605622291565,
+      "learning_rate": 8.41383177857561e-06,
+      "loss": 0.6371,
+      "step": 367800
+    },
+    {
+      "epoch": 1.002596,
+      "grad_norm": 0.8772279024124146,
+      "learning_rate": 8.401963334381149e-06,
+      "loss": 0.6305,
+      "step": 367900
+    },
+    {
+      "epoch": 1.002796,
+      "grad_norm": 0.921270489692688,
+      "learning_rate": 8.390101575873871e-06,
+      "loss": 0.6414,
+      "step": 368000
+    },
+    {
+      "epoch": 1.002796,
+      "eval_loss": 2.0858559608459473,
+      "eval_runtime": 51.7813,
+      "eval_samples_per_second": 196.867,
+      "eval_steps_per_second": 1.545,
+      "step": 368000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 3.211620496844764e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:58ce66db74e88b1f68194d485c23157f7d0c8a9d6b255f56a99102bd66b1a145
 size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:04f252a64f6373afbaec36fc31e345451d91b06580ee09a9823282cc3866516c
 size 5777