Upload 10 files

Browse files

Files changed (8) hide show

config.json +2 -2
model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
tokenizer_config.json +1 -1
trainer_state.json +1329 -3
training_args.bin +2 -2

config.json CHANGED Viewed

@@ -14,7 +14,6 @@
   "cls_token_id": 50281,
   "decoder_bias": true,
   "deterministic_flash_attn": false,
-  "dtype": "float32",
   "embedding_dropout": 0.0,
   "eos_token_id": 50282,
   "global_attn_every_n_layers": 3,
@@ -43,6 +42,7 @@
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
-  "transformers_version": "4.56.1",
   "vocab_size": 50368
 }

   "cls_token_id": 50281,
   "decoder_bias": true,
   "deterministic_flash_attn": false,
   "embedding_dropout": 0.0,
   "eos_token_id": 50282,
   "global_attn_every_n_layers": 3,
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
   "vocab_size": 50368
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8fe5d6868ff2c81227fd2c969c46af4fb1f58973ee4e0966c10979962f78982d
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:50accac5965491b2be88f637936f4d01b489952706122fe0d135ecab30a39e80
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:47b3c36d9ca1786cd8d0139a8f5f16980ca80bc74296c03c792d092074a01113
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:aab4c8f0590b9324c064c0ed46942e320d9a59713b8f281918dcd2d85799abe0
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:de790fba095fbd174e2cbec99a78f74e882cac29a5ff6c7320d627732b666d8d
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:7a0c558605c8d82b7652608db5c6f7c38916ecd8a2b4b556a203e1542326fcd7
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18105fb0a6af67c0e74a6e195673b14e0b259cf81cc485bb090b6d14e24299a1
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:a4e558df6977b3affd18572e35774f9ec32c672991e5e862c77249ff31a4e9ad
 size 1465

tokenizer_config.json CHANGED Viewed

@@ -940,6 +940,6 @@
   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }

   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "tokenizer_class": "PreTrainedTokenizer",
   "unk_token": "[UNK]"
 }

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.036280622240579596,
   "eval_steps": 1000,
-  "global_step": 117000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -9141,6 +9141,1332 @@
       "eval_samples_per_second": 196.48,
       "eval_steps_per_second": 1.542,
       "step": 117000
     }
   ],
   "logging_steps": 100,
@@ -9160,7 +10486,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.0210871976394752e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.0027908170954292,
   "eval_steps": 1000,
+  "global_step": 134000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 196.48,
       "eval_steps_per_second": 1.542,
       "step": 117000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 1.2410722970962524,
+      "learning_rate": 3.858205807282694e-05,
+      "loss": 1.9933,
+      "step": 117100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 1.6566898822784424,
+      "learning_rate": 3.8563464705981354e-05,
+      "loss": 1.9959,
+      "step": 117200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 1.6581664085388184,
+      "learning_rate": 3.854486070101965e-05,
+      "loss": 1.9696,
+      "step": 117300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 1.6906877756118774,
+      "learning_rate": 3.8526246072533345e-05,
+      "loss": 1.9822,
+      "step": 117400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 1.5992618799209595,
+      "learning_rate": 3.850762083512229e-05,
+      "loss": 1.9615,
+      "step": 117500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 1.3387641906738281,
+      "learning_rate": 3.848898500339466e-05,
+      "loss": 1.973,
+      "step": 117600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 1.6966856718063354,
+      "learning_rate": 3.847033859196694e-05,
+      "loss": 1.9695,
+      "step": 117700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 1.525177240371704,
+      "learning_rate": 3.8451681615463915e-05,
+      "loss": 1.9705,
+      "step": 117800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 1.657475233078003,
+      "learning_rate": 3.843301408851864e-05,
+      "loss": 1.9697,
+      "step": 117900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 1.721798300743103,
+      "learning_rate": 3.8414336025772456e-05,
+      "loss": 1.9546,
+      "step": 118000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.221956968307495,
+      "eval_runtime": 52.1068,
+      "eval_samples_per_second": 195.637,
+      "eval_steps_per_second": 1.535,
+      "step": 118000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 1.5945574045181274,
+      "learning_rate": 3.839564744187498e-05,
+      "loss": 1.9359,
+      "step": 118100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 1.7270615100860596,
+      "learning_rate": 3.837694835148406e-05,
+      "loss": 1.9548,
+      "step": 118200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 1.653337836265564,
+      "learning_rate": 3.835823876926579e-05,
+      "loss": 1.9536,
+      "step": 118300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 1.7248106002807617,
+      "learning_rate": 3.833951870989451e-05,
+      "loss": 1.9573,
+      "step": 118400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 1.1816054582595825,
+      "learning_rate": 3.832078818805275e-05,
+      "loss": 1.9473,
+      "step": 118500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 1.6775774955749512,
+      "learning_rate": 3.8302047218431266e-05,
+      "loss": 1.9735,
+      "step": 118600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 1.046647071838379,
+      "learning_rate": 3.8283295815729e-05,
+      "loss": 1.9687,
+      "step": 118700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 1.624248743057251,
+      "learning_rate": 3.8264533994653087e-05,
+      "loss": 1.9574,
+      "step": 118800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 1.0700417757034302,
+      "learning_rate": 3.824576176991882e-05,
+      "loss": 1.9535,
+      "step": 118900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 1.6102774143218994,
+      "learning_rate": 3.8226979156249655e-05,
+      "loss": 1.9551,
+      "step": 119000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.2151541709899902,
+      "eval_runtime": 51.66,
+      "eval_samples_per_second": 197.329,
+      "eval_steps_per_second": 1.549,
+      "step": 119000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 1.7988970279693604,
+      "learning_rate": 3.820818616837719e-05,
+      "loss": 1.9406,
+      "step": 119100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 1.6762808561325073,
+      "learning_rate": 3.818938282104119e-05,
+      "loss": 1.9413,
+      "step": 119200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 1.2559093236923218,
+      "learning_rate": 3.817056912898951e-05,
+      "loss": 1.9393,
+      "step": 119300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 1.659754991531372,
+      "learning_rate": 3.815174510697813e-05,
+      "loss": 1.9473,
+      "step": 119400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 1.2323046922683716,
+      "learning_rate": 3.813291076977114e-05,
+      "loss": 1.9449,
+      "step": 119500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 1.651207685470581,
+      "learning_rate": 3.811406613214071e-05,
+      "loss": 1.9452,
+      "step": 119600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 1.6981886625289917,
+      "learning_rate": 3.80952112088671e-05,
+      "loss": 1.9506,
+      "step": 119700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 1.7373100519180298,
+      "learning_rate": 3.807634601473862e-05,
+      "loss": 1.9405,
+      "step": 119800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 1.8006949424743652,
+      "learning_rate": 3.805747056455166e-05,
+      "loss": 1.9428,
+      "step": 119900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 1.7427655458450317,
+      "learning_rate": 3.803858487311063e-05,
+      "loss": 1.9296,
+      "step": 120000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.2108328342437744,
+      "eval_runtime": 51.7291,
+      "eval_samples_per_second": 197.065,
+      "eval_steps_per_second": 1.547,
+      "step": 120000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 2.748073101043701,
+      "learning_rate": 3.8019688955227974e-05,
+      "loss": 3.8624,
+      "step": 120100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 3.3290884494781494,
+      "learning_rate": 3.800078282572419e-05,
+      "loss": 3.8983,
+      "step": 120200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 3.4201247692108154,
+      "learning_rate": 3.798186649942774e-05,
+      "loss": 3.8567,
+      "step": 120300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 3.4081761837005615,
+      "learning_rate": 3.796293999117511e-05,
+      "loss": 3.8651,
+      "step": 120400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 3.281001329421997,
+      "learning_rate": 3.7944003315810776e-05,
+      "loss": 3.8587,
+      "step": 120500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 2.8812153339385986,
+      "learning_rate": 3.792505648818715e-05,
+      "loss": 3.8612,
+      "step": 120600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 3.5344903469085693,
+      "learning_rate": 3.790609952316467e-05,
+      "loss": 3.8711,
+      "step": 120700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 2.995441436767578,
+      "learning_rate": 3.7887132435611677e-05,
+      "loss": 3.8788,
+      "step": 120800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 3.382432699203491,
+      "learning_rate": 3.786815524040446e-05,
+      "loss": 3.8611,
+      "step": 120900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 3.4205808639526367,
+      "learning_rate": 3.784916795242724e-05,
+      "loss": 3.855,
+      "step": 121000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.2230899333953857,
+      "eval_runtime": 52.6219,
+      "eval_samples_per_second": 193.722,
+      "eval_steps_per_second": 1.52,
+      "step": 121000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 3.3155264854431152,
+      "learning_rate": 3.783017058657215e-05,
+      "loss": 3.8192,
+      "step": 121100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 3.595088243484497,
+      "learning_rate": 3.7811163157739246e-05,
+      "loss": 3.8689,
+      "step": 121200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 3.4232356548309326,
+      "learning_rate": 3.7792145680836453e-05,
+      "loss": 3.8547,
+      "step": 121300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 3.4015252590179443,
+      "learning_rate": 3.7773118170779584e-05,
+      "loss": 3.8624,
+      "step": 121400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 2.724720001220703,
+      "learning_rate": 3.775408064249233e-05,
+      "loss": 3.8415,
+      "step": 121500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 3.4392824172973633,
+      "learning_rate": 3.773503311090622e-05,
+      "loss": 3.8783,
+      "step": 121600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 1.9879323244094849,
+      "learning_rate": 3.771597559096066e-05,
+      "loss": 3.873,
+      "step": 121700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 3.4553592205047607,
+      "learning_rate": 3.7696908097602844e-05,
+      "loss": 3.8727,
+      "step": 121800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 2.3339455127716064,
+      "learning_rate": 3.767783064578784e-05,
+      "loss": 3.8546,
+      "step": 121900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 3.121299982070923,
+      "learning_rate": 3.7658743250478495e-05,
+      "loss": 3.859,
+      "step": 122000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.219958782196045,
+      "eval_runtime": 51.8398,
+      "eval_samples_per_second": 196.644,
+      "eval_steps_per_second": 1.543,
+      "step": 122000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 3.5799753665924072,
+      "learning_rate": 3.763964592664546e-05,
+      "loss": 3.8475,
+      "step": 122100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 3.5003957748413086,
+      "learning_rate": 3.7620538689267186e-05,
+      "loss": 3.8556,
+      "step": 122200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 2.886596202850342,
+      "learning_rate": 3.7601421553329876e-05,
+      "loss": 3.8463,
+      "step": 122300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 3.3040432929992676,
+      "learning_rate": 3.758229453382751e-05,
+      "loss": 3.868,
+      "step": 122400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 2.8264660835266113,
+      "learning_rate": 3.756315764576183e-05,
+      "loss": 3.8631,
+      "step": 122500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 3.332125186920166,
+      "learning_rate": 3.754401090414229e-05,
+      "loss": 3.8392,
+      "step": 122600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 3.4241065979003906,
+      "learning_rate": 3.75248543239861e-05,
+      "loss": 3.8693,
+      "step": 122700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 3.448194980621338,
+      "learning_rate": 3.750568792031819e-05,
+      "loss": 3.8463,
+      "step": 122800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 3.5221004486083984,
+      "learning_rate": 3.748651170817116e-05,
+      "loss": 3.8652,
+      "step": 122900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 3.5034878253936768,
+      "learning_rate": 3.746732570258533e-05,
+      "loss": 3.8299,
+      "step": 123000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.2181193828582764,
+      "eval_runtime": 51.8674,
+      "eval_samples_per_second": 196.54,
+      "eval_steps_per_second": 1.542,
+      "step": 123000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 2.127269744873047,
+      "learning_rate": 3.7448129918608706e-05,
+      "loss": 3.8731,
+      "step": 123100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 3.29091739654541,
+      "learning_rate": 3.7428924371296935e-05,
+      "loss": 3.9066,
+      "step": 123200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 2.9820969104766846,
+      "learning_rate": 3.740970907571336e-05,
+      "loss": 3.8654,
+      "step": 123300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 3.3646039962768555,
+      "learning_rate": 3.739048404692893e-05,
+      "loss": 3.8893,
+      "step": 123400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 2.2552099227905273,
+      "learning_rate": 3.737124930002226e-05,
+      "loss": 3.8794,
+      "step": 123500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 3.449066638946533,
+      "learning_rate": 3.735200485007957e-05,
+      "loss": 3.8869,
+      "step": 123600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 3.5163280963897705,
+      "learning_rate": 3.733275071219469e-05,
+      "loss": 3.8858,
+      "step": 123700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 2.634568452835083,
+      "learning_rate": 3.731348690146906e-05,
+      "loss": 3.878,
+      "step": 123800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 3.3576948642730713,
+      "learning_rate": 3.72942134330117e-05,
+      "loss": 3.8731,
+      "step": 123900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 3.476285219192505,
+      "learning_rate": 3.7274930321939205e-05,
+      "loss": 3.887,
+      "step": 124000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.2249643802642822,
+      "eval_runtime": 51.7986,
+      "eval_samples_per_second": 196.801,
+      "eval_steps_per_second": 1.544,
+      "step": 124000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 3.3951408863067627,
+      "learning_rate": 3.7255637583375725e-05,
+      "loss": 3.8708,
+      "step": 124100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 3.3581931591033936,
+      "learning_rate": 3.7236335232452977e-05,
+      "loss": 3.8622,
+      "step": 124200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 3.3235626220703125,
+      "learning_rate": 3.7217023284310196e-05,
+      "loss": 3.8526,
+      "step": 124300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 3.508228063583374,
+      "learning_rate": 3.719770175409417e-05,
+      "loss": 3.848,
+      "step": 124400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 3.0591793060302734,
+      "learning_rate": 3.717837065695918e-05,
+      "loss": 3.8698,
+      "step": 124500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 3.3803138732910156,
+      "learning_rate": 3.715903000806703e-05,
+      "loss": 3.8825,
+      "step": 124600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 3.422788143157959,
+      "learning_rate": 3.7139679822586996e-05,
+      "loss": 3.856,
+      "step": 124700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 3.3941519260406494,
+      "learning_rate": 3.7120320115695857e-05,
+      "loss": 3.8594,
+      "step": 124800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 2.929302453994751,
+      "learning_rate": 3.710095090257782e-05,
+      "loss": 3.8679,
+      "step": 124900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 3.4934051036834717,
+      "learning_rate": 3.708157219842461e-05,
+      "loss": 3.8595,
+      "step": 125000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.225058078765869,
+      "eval_runtime": 51.8264,
+      "eval_samples_per_second": 196.695,
+      "eval_steps_per_second": 1.544,
+      "step": 125000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 2.81558895111084,
+      "learning_rate": 3.706218401843532e-05,
+      "loss": 3.8671,
+      "step": 125100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 3.2801120281219482,
+      "learning_rate": 3.704278637781655e-05,
+      "loss": 3.8591,
+      "step": 125200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 3.1782455444335938,
+      "learning_rate": 3.702337929178226e-05,
+      "loss": 3.8703,
+      "step": 125300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 3.1894006729125977,
+      "learning_rate": 3.7003962775553866e-05,
+      "loss": 3.8597,
+      "step": 125400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 3.327129602432251,
+      "learning_rate": 3.698453684436014e-05,
+      "loss": 3.859,
+      "step": 125500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 3.5265657901763916,
+      "learning_rate": 3.6965101513437267e-05,
+      "loss": 3.8468,
+      "step": 125600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 3.223062038421631,
+      "learning_rate": 3.6945656798028785e-05,
+      "loss": 3.8544,
+      "step": 125700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 3.4239840507507324,
+      "learning_rate": 3.6926202713385606e-05,
+      "loss": 3.8502,
+      "step": 125800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 1.897346019744873,
+      "learning_rate": 3.6906739274765986e-05,
+      "loss": 3.6361,
+      "step": 125900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 1.675214171409607,
+      "learning_rate": 3.6887266497435516e-05,
+      "loss": 3.3334,
+      "step": 126000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.207582950592041,
+      "eval_runtime": 52.021,
+      "eval_samples_per_second": 195.959,
+      "eval_steps_per_second": 1.538,
+      "step": 126000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 2.446707248687744,
+      "learning_rate": 3.686778439666712e-05,
+      "loss": 3.2678,
+      "step": 126100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 1.7556930780410767,
+      "learning_rate": 3.6848292987741006e-05,
+      "loss": 3.2757,
+      "step": 126200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 2.2318058013916016,
+      "learning_rate": 3.682879228594472e-05,
+      "loss": 3.2595,
+      "step": 126300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 2.536383867263794,
+      "learning_rate": 3.680928230657308e-05,
+      "loss": 3.2332,
+      "step": 126400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 2.480672836303711,
+      "learning_rate": 3.678976306492819e-05,
+      "loss": 3.2357,
+      "step": 126500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 2.3426804542541504,
+      "learning_rate": 3.677023457631939e-05,
+      "loss": 3.2118,
+      "step": 126600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 2.1600797176361084,
+      "learning_rate": 3.6750696856063304e-05,
+      "loss": 3.2129,
+      "step": 126700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 1.6937828063964844,
+      "learning_rate": 3.673114991948379e-05,
+      "loss": 3.2046,
+      "step": 126800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 2.2766456604003906,
+      "learning_rate": 3.671159378191191e-05,
+      "loss": 3.1943,
+      "step": 126900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 1.9862797260284424,
+      "learning_rate": 3.669202845868597e-05,
+      "loss": 3.1908,
+      "step": 127000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.2138376235961914,
+      "eval_runtime": 51.9698,
+      "eval_samples_per_second": 196.152,
+      "eval_steps_per_second": 1.539,
+      "step": 127000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 1.726231336593628,
+      "learning_rate": 3.6672453965151485e-05,
+      "loss": 3.1654,
+      "step": 127100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 2.486175537109375,
+      "learning_rate": 3.6652870316661133e-05,
+      "loss": 3.1584,
+      "step": 127200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 1.9903950691223145,
+      "learning_rate": 3.663327752857481e-05,
+      "loss": 3.1698,
+      "step": 127300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 1.7355810403823853,
+      "learning_rate": 3.661367561625954e-05,
+      "loss": 3.119,
+      "step": 127400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 2.132373809814453,
+      "learning_rate": 3.6594064595089534e-05,
+      "loss": 3.1671,
+      "step": 127500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 1.8258633613586426,
+      "learning_rate": 3.657444448044612e-05,
+      "loss": 3.1271,
+      "step": 127600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 2.02746844291687,
+      "learning_rate": 3.65548152877178e-05,
+      "loss": 3.1182,
+      "step": 127700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 2.4161229133605957,
+      "learning_rate": 3.6535177032300144e-05,
+      "loss": 3.1113,
+      "step": 127800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 1.707440733909607,
+      "learning_rate": 3.651552972959588e-05,
+      "loss": 3.0544,
+      "step": 127900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 1.7457507848739624,
+      "learning_rate": 3.649587339501479e-05,
+      "loss": 3.1207,
+      "step": 128000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.2254478931427,
+      "eval_runtime": 52.0855,
+      "eval_samples_per_second": 195.717,
+      "eval_steps_per_second": 1.536,
+      "step": 128000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 1.717077612876892,
+      "learning_rate": 3.647620804397378e-05,
+      "loss": 3.0992,
+      "step": 128100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 1.6803147792816162,
+      "learning_rate": 3.6456533691896785e-05,
+      "loss": 3.095,
+      "step": 128200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 2.2515881061553955,
+      "learning_rate": 3.643685035421483e-05,
+      "loss": 3.1254,
+      "step": 128300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 2.9855501651763916,
+      "learning_rate": 3.641715804636598e-05,
+      "loss": 3.5587,
+      "step": 128400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 3.2831578254699707,
+      "learning_rate": 3.6397456783795336e-05,
+      "loss": 3.7799,
+      "step": 128500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 3.2432754039764404,
+      "learning_rate": 3.637774658195501e-05,
+      "loss": 3.7901,
+      "step": 128600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 3.28851056098938,
+      "learning_rate": 3.6358027456304144e-05,
+      "loss": 3.7778,
+      "step": 128700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 3.192739963531494,
+      "learning_rate": 3.633829942230888e-05,
+      "loss": 3.7389,
+      "step": 128800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 3.0515987873077393,
+      "learning_rate": 3.6318562495442315e-05,
+      "loss": 3.742,
+      "step": 128900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 2.9723589420318604,
+      "learning_rate": 3.629881669118456e-05,
+      "loss": 3.7461,
+      "step": 129000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.2262234687805176,
+      "eval_runtime": 52.1567,
+      "eval_samples_per_second": 195.449,
+      "eval_steps_per_second": 1.534,
+      "step": 129000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 3.1359448432922363,
+      "learning_rate": 3.627906202502267e-05,
+      "loss": 3.7374,
+      "step": 129100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 3.2023541927337646,
+      "learning_rate": 3.6259298512450645e-05,
+      "loss": 3.7531,
+      "step": 129200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 2.9798941612243652,
+      "learning_rate": 3.623952616896945e-05,
+      "loss": 3.7509,
+      "step": 129300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 3.104867458343506,
+      "learning_rate": 3.621974501008695e-05,
+      "loss": 3.7258,
+      "step": 129400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 2.8948721885681152,
+      "learning_rate": 3.6199955051317914e-05,
+      "loss": 3.7291,
+      "step": 129500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 3.4071433544158936,
+      "learning_rate": 3.618015630818406e-05,
+      "loss": 3.7334,
+      "step": 129600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 3.0882623195648193,
+      "learning_rate": 3.6160348796213936e-05,
+      "loss": 3.7099,
+      "step": 129700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 3.026120185852051,
+      "learning_rate": 3.6140532530943025e-05,
+      "loss": 3.7309,
+      "step": 129800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 3.150139570236206,
+      "learning_rate": 3.612070752791363e-05,
+      "loss": 3.7094,
+      "step": 129900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 3.0325634479522705,
+      "learning_rate": 3.610087380267495e-05,
+      "loss": 3.7265,
+      "step": 130000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.2089452743530273,
+      "eval_runtime": 52.1784,
+      "eval_samples_per_second": 195.368,
+      "eval_steps_per_second": 1.533,
+      "step": 130000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 2.9735958576202393,
+      "learning_rate": 3.6081031370782974e-05,
+      "loss": 3.7094,
+      "step": 130100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 2.9261887073516846,
+      "learning_rate": 3.6061180247800564e-05,
+      "loss": 3.7091,
+      "step": 130200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 2.927654266357422,
+      "learning_rate": 3.604132044929736e-05,
+      "loss": 3.7146,
+      "step": 130300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 3.1641175746917725,
+      "learning_rate": 3.602145199084986e-05,
+      "loss": 3.706,
+      "step": 130400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 3.108304023742676,
+      "learning_rate": 3.600157488804129e-05,
+      "loss": 3.7051,
+      "step": 130500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 3.076998710632324,
+      "learning_rate": 3.598168915646171e-05,
+      "loss": 3.7033,
+      "step": 130600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 3.1134989261627197,
+      "learning_rate": 3.5961794811707915e-05,
+      "loss": 3.6995,
+      "step": 130700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 3.0833628177642822,
+      "learning_rate": 3.5941891869383474e-05,
+      "loss": 3.7156,
+      "step": 130800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 2.9580423831939697,
+      "learning_rate": 3.592198034509868e-05,
+      "loss": 3.6818,
+      "step": 130900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 3.1217994689941406,
+      "learning_rate": 3.590206025447058e-05,
+      "loss": 3.6902,
+      "step": 131000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.215585708618164,
+      "eval_runtime": 52.1786,
+      "eval_samples_per_second": 195.368,
+      "eval_steps_per_second": 1.533,
+      "step": 131000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 3.0639188289642334,
+      "learning_rate": 3.588213161312291e-05,
+      "loss": 3.6862,
+      "step": 131100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 2.920884132385254,
+      "learning_rate": 3.5862194436686156e-05,
+      "loss": 3.6815,
+      "step": 131200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 3.0907158851623535,
+      "learning_rate": 3.584224874079745e-05,
+      "loss": 3.6951,
+      "step": 131300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 3.105325222015381,
+      "learning_rate": 3.582229454110065e-05,
+      "loss": 3.7043,
+      "step": 131400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 3.2944116592407227,
+      "learning_rate": 3.5802331853246245e-05,
+      "loss": 3.6847,
+      "step": 131500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 3.0563266277313232,
+      "learning_rate": 3.578236069289141e-05,
+      "loss": 3.692,
+      "step": 131600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 3.1431596279144287,
+      "learning_rate": 3.576238107569994e-05,
+      "loss": 3.6776,
+      "step": 131700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 2.9235949516296387,
+      "learning_rate": 3.5742393017342294e-05,
+      "loss": 3.6924,
+      "step": 131800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 2.9574053287506104,
+      "learning_rate": 3.572239653349552e-05,
+      "loss": 3.6733,
+      "step": 131900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 3.0737438201904297,
+      "learning_rate": 3.570239163984331e-05,
+      "loss": 3.6787,
+      "step": 132000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.210517644882202,
+      "eval_runtime": 52.1762,
+      "eval_samples_per_second": 195.376,
+      "eval_steps_per_second": 1.533,
+      "step": 132000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 2.946190595626831,
+      "learning_rate": 3.568237835207591e-05,
+      "loss": 3.6731,
+      "step": 132100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 2.947497844696045,
+      "learning_rate": 3.566235668589017e-05,
+      "loss": 3.6757,
+      "step": 132200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 3.0203778743743896,
+      "learning_rate": 3.5642326656989525e-05,
+      "loss": 3.6767,
+      "step": 132300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 3.1348791122436523,
+      "learning_rate": 3.562228828108396e-05,
+      "loss": 3.6904,
+      "step": 132400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 3.3154730796813965,
+      "learning_rate": 3.5602241573889984e-05,
+      "loss": 3.6876,
+      "step": 132500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 2.9193050861358643,
+      "learning_rate": 3.558218655113066e-05,
+      "loss": 3.6869,
+      "step": 132600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 2.8012895584106445,
+      "learning_rate": 3.5562123228535594e-05,
+      "loss": 3.6905,
+      "step": 132700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 3.0849194526672363,
+      "learning_rate": 3.554205162184087e-05,
+      "loss": 3.6691,
+      "step": 132800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 3.2069287300109863,
+      "learning_rate": 3.552197174678907e-05,
+      "loss": 3.6841,
+      "step": 132900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 2.993010997772217,
+      "learning_rate": 3.550188361912927e-05,
+      "loss": 3.6776,
+      "step": 133000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.211658477783203,
+      "eval_runtime": 52.2134,
+      "eval_samples_per_second": 195.237,
+      "eval_steps_per_second": 1.532,
+      "step": 133000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 3.175614833831787,
+      "learning_rate": 3.548178725461704e-05,
+      "loss": 3.6787,
+      "step": 133100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 3.2140350341796875,
+      "learning_rate": 3.546168266901436e-05,
+      "loss": 3.6835,
+      "step": 133200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 3.0778603553771973,
+      "learning_rate": 3.544156987808971e-05,
+      "loss": 3.662,
+      "step": 133300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 2.9668691158294678,
+      "learning_rate": 3.542144889761798e-05,
+      "loss": 3.67,
+      "step": 133400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 2.9988768100738525,
+      "learning_rate": 3.5401319743380477e-05,
+      "loss": 3.6596,
+      "step": 133500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 3.003577470779419,
+      "learning_rate": 3.538118243116494e-05,
+      "loss": 3.6823,
+      "step": 133600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 3.0805444717407227,
+      "learning_rate": 3.536103697676548e-05,
+      "loss": 3.656,
+      "step": 133700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 3.2663848400115967,
+      "learning_rate": 3.5340883395982617e-05,
+      "loss": 3.6776,
+      "step": 133800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 3.1434547901153564,
+      "learning_rate": 3.532072170462324e-05,
+      "loss": 3.6624,
+      "step": 133900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 3.08821964263916,
+      "learning_rate": 3.53005519185006e-05,
+      "loss": 3.6923,
+      "step": 134000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.2045140266418457,
+      "eval_runtime": 51.8625,
+      "eval_samples_per_second": 196.558,
+      "eval_steps_per_second": 1.543,
+      "step": 134000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 1.1694502947323904e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da5f5517b1675eb630da2afe2ee47f40a6f105aba3407f1e48d33a873836c026
-size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:66d39cf86390d3a1f1bf05e9571d4d0939bf6f5fc60ae060f7397e9b450ea61c
+size 5713