Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

.gitattributes +1 -0
README.md +1 -1
adapter_config.json +71 -95
adapter_model.safetensors +2 -2
latest +1 -1
ocr.json +3 -0
scheduler.pt +1 -1
trainer_state.json +901 -306
training_args.bin +1 -1

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ocr.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-base_model: Qwen/Qwen2.5-VL-3B-Instruct
 library_name: peft
 ---

 ---
+base_model: Qwen/Qwen2.5-VL-7B-Instruct
 library_name: peft
 ---

adapter_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "Qwen/Qwen2.5-VL-3B-Instruct",
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
@@ -24,118 +24,94 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "layers.28.mlp.up_proj",
-    "layers.24.mlp.up_proj",
-    "layers.18.mlp.down_proj",
-    "layers.21.mlp.down_proj",
-    "layers.24.mlp.gate_proj",
     "layers.25.mlp.up_proj",
-    "layers.28.mlp.gate_proj",
-    "32.mlp.gate_proj",
-    "layers.3.mlp.down_proj",
-    "layers.14.mlp.gate_proj",
-    "layers.30.mlp.gate_proj",
-    "layers.22.mlp.up_proj",
-    "32.mlp.up_proj",
-    "layers.15.mlp.down_proj",
-    "layers.7.mlp.gate_proj",
-    "layers.29.mlp.down_proj",
-    "34.mlp.down_proj",
-    "layers.2.mlp.up_proj",
     "v_proj",
-    "layers.17.mlp.down_proj",
     "layers.18.mlp.up_proj",
-    "layers.5.mlp.gate_proj",
-    "layers.26.mlp.gate_proj",
-    "layers.3.mlp.gate_proj",
     "layers.24.mlp.down_proj",
-    "layers.11.mlp.up_proj",
-    "layers.2.mlp.down_proj",
     "layers.27.mlp.down_proj",
-    "layers.25.mlp.gate_proj",
     "layers.2.mlp.gate_proj",
     "layers.20.mlp.up_proj",
     "layers.20.mlp.gate_proj",
-    "layers.0.mlp.gate_proj",
-    "33.mlp.up_proj",
-    "32.mlp.down_proj",
-    "layers.25.mlp.down_proj",
-    "layers.28.mlp.down_proj",
-    "layers.0.mlp.down_proj",
-    "35.mlp.down_proj",
-    "layers.6.mlp.down_proj",
-    "o_proj",
-    "layers.29.mlp.up_proj",
-    "layers.3.mlp.up_proj",
-    "35.mlp.gate_proj",
     "layers.20.mlp.down_proj",
-    "layers.31.mlp.down_proj",
-    "layers.13.mlp.gate_proj",
-    "layers.7.mlp.down_proj",
-    "layers.5.mlp.down_proj",
-    "layers.5.mlp.up_proj",
-    "q_proj",
-    "layers.21.mlp.up_proj",
-    "layers.10.mlp.down_proj",
-    "layers.10.mlp.gate_proj",
-    "layers.31.mlp.up_proj",
-    "layers.0.mlp.up_proj",
-    "layers.14.mlp.up_proj",
-    "33.mlp.gate_proj",
-    "35.mlp.up_proj",
-    "layers.17.mlp.up_proj",
-    "layers.11.mlp.down_proj",
-    "layers.30.mlp.down_proj",
-    "layers.6.mlp.up_proj",
-    "layers.19.mlp.up_proj",
-    "layers.4.mlp.up_proj",
     "layers.10.mlp.up_proj",
-    "34.mlp.up_proj",
-    "33.mlp.down_proj",
-    "layers.22.mlp.gate_proj",
-    "k_proj",
-    "layers.9.mlp.gate_proj",
-    "layers.9.mlp.up_proj",
-    "layers.11.mlp.gate_proj",
-    "layers.23.mlp.down_proj",
-    "layers.7.mlp.up_proj",
     "layers.9.mlp.down_proj",
-    "layers.21.mlp.gate_proj",
-    "layers.17.mlp.gate_proj",
-    "layers.29.mlp.gate_proj",
-    "layers.13.mlp.down_proj",
-    "layers.19.mlp.gate_proj",
-    "layers.6.mlp.gate_proj",
-    "layers.13.mlp.up_proj",
-    "layers.26.mlp.down_proj",
-    "layers.15.mlp.gate_proj",
-    "layers.22.mlp.down_proj",
-    "layers.30.mlp.up_proj",
-    "layers.16.mlp.gate_proj",
-    "layers.23.mlp.gate_proj",
-    "layers.31.mlp.gate_proj",
-    "layers.12.mlp.down_proj",
-    "layers.1.mlp.up_proj",
-    "layers.8.mlp.up_proj",
     "layers.4.mlp.down_proj",
-    "layers.27.mlp.gate_proj",
     "layers.8.mlp.gate_proj",
     "layers.19.mlp.down_proj",
-    "layers.8.mlp.down_proj",
-    "34.mlp.gate_proj",
-    "layers.16.mlp.down_proj",
     "layers.16.mlp.up_proj",
-    "layers.18.mlp.gate_proj",
     "layers.12.mlp.up_proj",
-    "layers.1.mlp.gate_proj",
-    "layers.23.mlp.up_proj",
-    "layers.15.mlp.up_proj",
-    "layers.26.mlp.up_proj",
     "layers.1.mlp.down_proj",
-    "layers.14.mlp.down_proj",
-    "layers.12.mlp.gate_proj",
     "layers.4.mlp.gate_proj",
-    "layers.27.mlp.up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

 {
   "alpha_pattern": {},
   "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "layers.16.mlp.down_proj",
+    "layers.12.mlp.gate_proj",
+    "layers.17.mlp.up_proj",
+    "layers.3.mlp.up_proj",
+    "layers.5.mlp.gate_proj",
+    "k_proj",
     "layers.25.mlp.up_proj",
     "v_proj",
+    "layers.8.mlp.down_proj",
+    "layers.24.mlp.up_proj",
+    "layers.15.mlp.up_proj",
+    "layers.7.mlp.gate_proj",
     "layers.18.mlp.up_proj",
+    "layers.13.mlp.gate_proj",
+    "layers.6.mlp.gate_proj",
+    "layers.9.mlp.gate_proj",
+    "layers.16.mlp.gate_proj",
+    "layers.22.mlp.up_proj",
+    "layers.21.mlp.up_proj",
+    "layers.27.mlp.gate_proj",
+    "layers.23.mlp.down_proj",
+    "layers.24.mlp.gate_proj",
+    "layers.23.mlp.up_proj",
     "layers.24.mlp.down_proj",
+    "layers.0.mlp.down_proj",
+    "layers.0.mlp.up_proj",
+    "layers.26.mlp.gate_proj",
     "layers.27.mlp.down_proj",
+    "q_proj",
+    "layers.1.mlp.up_proj",
+    "layers.21.mlp.down_proj",
     "layers.2.mlp.gate_proj",
+    "layers.11.mlp.gate_proj",
+    "layers.14.mlp.up_proj",
+    "layers.22.mlp.down_proj",
+    "layers.5.mlp.down_proj",
+    "layers.19.mlp.gate_proj",
+    "layers.17.mlp.down_proj",
+    "layers.9.mlp.up_proj",
     "layers.20.mlp.up_proj",
+    "layers.3.mlp.gate_proj",
+    "layers.1.mlp.gate_proj",
+    "layers.13.mlp.up_proj",
     "layers.20.mlp.gate_proj",
+    "layers.26.mlp.down_proj",
     "layers.20.mlp.down_proj",
+    "layers.14.mlp.down_proj",
     "layers.10.mlp.up_proj",
     "layers.9.mlp.down_proj",
+    "layers.6.mlp.up_proj",
+    "layers.11.mlp.up_proj",
+    "layers.10.mlp.gate_proj",
+    "layers.4.mlp.up_proj",
+    "layers.14.mlp.gate_proj",
+    "layers.2.mlp.up_proj",
     "layers.4.mlp.down_proj",
+    "layers.7.mlp.down_proj",
+    "o_proj",
+    "layers.0.mlp.gate_proj",
+    "layers.17.mlp.gate_proj",
     "layers.8.mlp.gate_proj",
     "layers.19.mlp.down_proj",
+    "layers.25.mlp.gate_proj",
+    "layers.11.mlp.down_proj",
     "layers.16.mlp.up_proj",
+    "layers.2.mlp.down_proj",
+    "layers.19.mlp.up_proj",
+    "layers.15.mlp.gate_proj",
+    "layers.22.mlp.gate_proj",
+    "layers.23.mlp.gate_proj",
+    "layers.6.mlp.down_proj",
+    "layers.10.mlp.down_proj",
     "layers.12.mlp.up_proj",
+    "layers.8.mlp.up_proj",
+    "layers.5.mlp.up_proj",
+    "layers.3.mlp.down_proj",
     "layers.1.mlp.down_proj",
+    "layers.21.mlp.gate_proj",
+    "layers.26.mlp.up_proj",
     "layers.4.mlp.gate_proj",
+    "layers.27.mlp.up_proj",
+    "layers.12.mlp.down_proj",
+    "layers.15.mlp.down_proj",
+    "layers.18.mlp.gate_proj",
+    "layers.18.mlp.down_proj",
+    "layers.7.mlp.up_proj",
+    "layers.25.mlp.down_proj",
+    "layers.13.mlp.down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:07f01f8f0b4027eaf41807cbd6678cfc2044054280411ce3145aec238302c90d
-size 239536776

 version https://git-lfs.github.com/spec/v1
+oid sha256:c65869936ca775b0b7e34856b673e70c7b88167d895229d6337d74ee4f2a74aa
+size 323014560

latest CHANGED Viewed

	@@ -1 +1 @@
1	- ~~global_step375~~


1	+ global_step800

ocr.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ad6da340f2f6403e305dac3b8deef53107b7c5de50f73da9c490b8b174d6e40
+size 34218525

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dcffc811ab0ada587a376eecb6fc27cadcbc02597d7ed30d159fba6bf764c2b2
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:bbb214732a7d2d242635ceda6bf2c890011d7f28b61b690994eae9c7558a5c03
 size 1401

trainer_state.json CHANGED Viewed

@@ -2,543 +2,1138 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 375,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.013333333333333334,
-      "grad_norm": 0.1966240406036377,
-      "learning_rate": 2.105263157894737e-05,
-      "loss": 0.0919,
       "step": 5
     },
     {
-      "epoch": 0.02666666666666667,
-      "grad_norm": 0.109957255423069,
-      "learning_rate": 4.736842105263158e-05,
-      "loss": 0.0647,
       "step": 10
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 0.06744097173213959,
-      "learning_rate": 7.368421052631579e-05,
-      "loss": 0.053,
       "step": 15
     },
     {
-      "epoch": 0.05333333333333334,
-      "grad_norm": 0.054616399109363556,
-      "learning_rate": 0.0001,
-      "loss": 0.0415,
       "step": 20
     },
     {
-      "epoch": 0.06666666666666667,
-      "grad_norm": 0.07561139017343521,
-      "learning_rate": 9.995133583167832e-05,
-      "loss": 0.0447,
       "step": 25
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 0.048080697655677795,
-      "learning_rate": 9.980543805476446e-05,
-      "loss": 0.0345,
       "step": 30
     },
     {
-      "epoch": 0.09333333333333334,
-      "grad_norm": 0.052209969609975815,
-      "learning_rate": 9.956259066901733e-05,
-      "loss": 0.0316,
       "step": 35
     },
     {
-      "epoch": 0.10666666666666667,
-      "grad_norm": 0.04528443142771721,
-      "learning_rate": 9.922326639307917e-05,
-      "loss": 0.0338,
       "step": 40
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 0.06078013405203819,
-      "learning_rate": 9.878812574429721e-05,
-      "loss": 0.0338,
       "step": 45
     },
     {
-      "epoch": 0.13333333333333333,
-      "grad_norm": 0.04745374619960785,
-      "learning_rate": 9.825801575298248e-05,
-      "loss": 0.0284,
       "step": 50
     },
     {
-      "epoch": 0.14666666666666667,
-      "grad_norm": 0.051620353013277054,
-      "learning_rate": 9.763396831360884e-05,
-      "loss": 0.0345,
       "step": 55
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 0.049457110464572906,
-      "learning_rate": 9.691719817616147e-05,
-      "loss": 0.0317,
       "step": 60
     },
     {
-      "epoch": 0.17333333333333334,
-      "grad_norm": 0.04445967078208923,
-      "learning_rate": 9.61091005815451e-05,
-      "loss": 0.0278,
       "step": 65
     },
     {
-      "epoch": 0.18666666666666668,
-      "grad_norm": 0.04890437796711922,
-      "learning_rate": 9.521124854565425e-05,
-      "loss": 0.0316,
       "step": 70
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.030447857454419136,
-      "learning_rate": 9.422538979739307e-05,
-      "loss": 0.0304,
       "step": 75
     },
     {
-      "epoch": 0.21333333333333335,
-      "grad_norm": 0.052393049001693726,
-      "learning_rate": 9.315344337660421e-05,
-      "loss": 0.032,
       "step": 80
     },
     {
-      "epoch": 0.22666666666666666,
-      "grad_norm": 0.05007480829954147,
-      "learning_rate": 9.19974958985298e-05,
-      "loss": 0.0259,
       "step": 85
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 0.05939432606101036,
-      "learning_rate": 9.075979749207561e-05,
-      "loss": 0.0286,
       "step": 90
     },
     {
-      "epoch": 0.25333333333333335,
-      "grad_norm": 0.04563208296895027,
-      "learning_rate": 8.944275741978493e-05,
-      "loss": 0.025,
       "step": 95
     },
     {
-      "epoch": 0.26666666666666666,
-      "grad_norm": 0.05097965523600578,
-      "learning_rate": 8.80489393880484e-05,
-      "loss": 0.0267,
       "step": 100
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 0.03556825965642929,
-      "learning_rate": 8.65810565566782e-05,
-      "loss": 0.0216,
       "step": 105
     },
     {
-      "epoch": 0.29333333333333333,
-      "grad_norm": 0.04471671208739281,
-      "learning_rate": 8.504196625756166e-05,
-      "loss": 0.0213,
       "step": 110
     },
     {
-      "epoch": 0.30666666666666664,
-      "grad_norm": 0.038890305906534195,
-      "learning_rate": 8.343466443267391e-05,
-      "loss": 0.0238,
       "step": 115
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 0.04462519288063049,
-      "learning_rate": 8.176227980227694e-05,
-      "loss": 0.024,
       "step": 120
     },
     {
-      "epoch": 0.3333333333333333,
-      "grad_norm": 0.0567193366587162,
-      "learning_rate": 8.002806777465685e-05,
-      "loss": 0.0309,
       "step": 125
     },
     {
-      "epoch": 0.3466666666666667,
-      "grad_norm": 0.047946710139513016,
-      "learning_rate": 7.823540410925435e-05,
-      "loss": 0.0254,
       "step": 130
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 0.050420425832271576,
-      "learning_rate": 7.63877783455237e-05,
-      "loss": 0.0259,
       "step": 135
     },
     {
-      "epoch": 0.37333333333333335,
-      "grad_norm": 0.044385556131601334,
-      "learning_rate": 7.448878701031142e-05,
-      "loss": 0.0256,
       "step": 140
     },
     {
-      "epoch": 0.38666666666666666,
-      "grad_norm": 0.038669321686029434,
-      "learning_rate": 7.254212661697659e-05,
-      "loss": 0.0221,
       "step": 145
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.039255157113075256,
-      "learning_rate": 7.055158646988109e-05,
-      "loss": 0.0209,
       "step": 150
     },
     {
-      "epoch": 0.41333333333333333,
-      "grad_norm": 0.052665840834379196,
-      "learning_rate": 6.85210412882557e-05,
-      "loss": 0.0254,
       "step": 155
     },
     {
-      "epoch": 0.4266666666666667,
-      "grad_norm": 0.05023692920804024,
-      "learning_rate": 6.64544436638005e-05,
-      "loss": 0.0328,
       "step": 160
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 0.06168365851044655,
-      "learning_rate": 6.435581636670154e-05,
-      "loss": 0.0275,
       "step": 165
     },
     {
-      "epoch": 0.4533333333333333,
-      "grad_norm": 0.04346410557627678,
-      "learning_rate": 6.222924451504001e-05,
-      "loss": 0.0199,
       "step": 170
     },
     {
-      "epoch": 0.4666666666666667,
-      "grad_norm": 0.04977494478225708,
-      "learning_rate": 6.0078867622837395e-05,
-      "loss": 0.0217,
       "step": 175
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 0.048214834183454514,
-      "learning_rate": 5.79088715422152e-05,
-      "loss": 0.0217,
       "step": 180
     },
     {
-      "epoch": 0.49333333333333335,
-      "grad_norm": 0.04549489915370941,
-      "learning_rate": 5.572348031535441e-05,
-      "loss": 0.0243,
       "step": 185
     },
     {
-      "epoch": 0.5066666666666667,
-      "grad_norm": 0.03898506984114647,
-      "learning_rate": 5.352694795211555e-05,
-      "loss": 0.0225,
       "step": 190
     },
     {
-      "epoch": 0.52,
-      "grad_norm": 0.04560156539082527,
-      "learning_rate": 5.132355014932455e-05,
-      "loss": 0.0306,
       "step": 195
     },
     {
-      "epoch": 0.5333333333333333,
-      "grad_norm": 0.04489945247769356,
-      "learning_rate": 4.911757596784357e-05,
-      "loss": 0.0196,
       "step": 200
     },
     {
-      "epoch": 0.5466666666666666,
-      "grad_norm": 0.04646344482898712,
-      "learning_rate": 4.691331948362789e-05,
-      "loss": 0.022,
       "step": 205
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 0.04694559797644615,
-      "learning_rate": 4.471507142902036e-05,
-      "loss": 0.0267,
       "step": 210
     },
     {
-      "epoch": 0.5733333333333334,
-      "grad_norm": 0.07199534773826599,
-      "learning_rate": 4.252711084055467e-05,
-      "loss": 0.0207,
       "step": 215
     },
     {
-      "epoch": 0.5866666666666667,
-      "grad_norm": 0.035585273057222366,
-      "learning_rate": 4.035369672952516e-05,
-      "loss": 0.0174,
       "step": 220
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 0.04401592165231705,
-      "learning_rate": 3.81990597915371e-05,
-      "loss": 0.0215,
       "step": 225
     },
     {
-      "epoch": 0.6133333333333333,
-      "grad_norm": 0.05258890613913536,
-      "learning_rate": 3.6067394171175394e-05,
-      "loss": 0.0302,
       "step": 230
     },
     {
-      "epoch": 0.6266666666666667,
-      "grad_norm": 0.03661968186497688,
-      "learning_rate": 3.3962849297822226e-05,
-      "loss": 0.024,
       "step": 235
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 0.05009055882692337,
-      "learning_rate": 3.188952180851589e-05,
-      "loss": 0.0206,
       "step": 240
     },
     {
-      "epoch": 0.6533333333333333,
-      "grad_norm": 0.043911464512348175,
-      "learning_rate": 2.9851447573573384e-05,
-      "loss": 0.0237,
       "step": 245
     },
     {
-      "epoch": 0.6666666666666666,
-      "grad_norm": 0.041712645441293716,
-      "learning_rate": 2.785259384049959e-05,
-      "loss": 0.0209,
       "step": 250
     },
     {
-      "epoch": 0.68,
-      "grad_norm": 0.046845823526382446,
-      "learning_rate": 2.5896851511475186e-05,
-      "loss": 0.0269,
       "step": 255
     },
     {
-      "epoch": 0.6933333333333334,
-      "grad_norm": 0.03639785200357437,
-      "learning_rate": 2.3988027569455895e-05,
-      "loss": 0.0224,
       "step": 260
     },
     {
-      "epoch": 0.7066666666666667,
-      "grad_norm": 0.03725459799170494,
-      "learning_rate": 2.2129837667626145e-05,
-      "loss": 0.0224,
       "step": 265
     },
     {
-      "epoch": 0.72,
-      "grad_norm": 0.04400669410824776,
-      "learning_rate": 2.0325898896632177e-05,
-      "loss": 0.0202,
       "step": 270
     },
     {
-      "epoch": 0.7333333333333333,
-      "grad_norm": 0.04929959774017334,
-      "learning_rate": 1.8579722743673773e-05,
-      "loss": 0.027,
       "step": 275
     },
     {
-      "epoch": 0.7466666666666667,
-      "grad_norm": 0.05031820759177208,
-      "learning_rate": 1.689470825715998e-05,
-      "loss": 0.023,
       "step": 280
     },
     {
-      "epoch": 0.76,
-      "grad_norm": 0.03400196135044098,
-      "learning_rate": 1.5274135430234654e-05,
-      "loss": 0.0222,
       "step": 285
     },
     {
-      "epoch": 0.7733333333333333,
-      "grad_norm": 0.04948483780026436,
-      "learning_rate": 1.3721158816050873e-05,
-      "loss": 0.0237,
       "step": 290
     },
     {
-      "epoch": 0.7866666666666666,
-      "grad_norm": 0.036222368478775024,
-      "learning_rate": 1.2238801387222714e-05,
-      "loss": 0.017,
       "step": 295
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.05248803272843361,
-      "learning_rate": 1.0829948651407374e-05,
-      "loss": 0.0225,
       "step": 300
     },
     {
-      "epoch": 0.8133333333333334,
-      "grad_norm": 0.04245099052786827,
-      "learning_rate": 9.497343034471895e-06,
-      "loss": 0.0206,
       "step": 305
     },
     {
-      "epoch": 0.8266666666666667,
-      "grad_norm": 0.054338328540325165,
-      "learning_rate": 8.243578542178226e-06,
-      "loss": 0.0263,
       "step": 310
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 0.053041160106658936,
-      "learning_rate": 7.071095710777925e-06,
-      "loss": 0.0261,
       "step": 315
     },
     {
-      "epoch": 0.8533333333333334,
-      "grad_norm": 0.04103247821331024,
-      "learning_rate": 5.982176856345445e-06,
-      "loss": 0.0277,
       "step": 320
     },
     {
-      "epoch": 0.8666666666666667,
-      "grad_norm": 0.038258735090494156,
-      "learning_rate": 4.978941632097611e-06,
-      "loss": 0.0196,
       "step": 325
     },
     {
-      "epoch": 0.88,
-      "grad_norm": 0.034113574773073196,
-      "learning_rate": 4.0633429023472e-06,
-      "loss": 0.0228,
       "step": 330
     },
     {
-      "epoch": 0.8933333333333333,
-      "grad_norm": 0.037502411752939224,
-      "learning_rate": 3.2371629411221848e-06,
-      "loss": 0.0216,
       "step": 335
     },
     {
-      "epoch": 0.9066666666666666,
-      "grad_norm": 0.041906435042619705,
-      "learning_rate": 2.50200996285046e-06,
-      "loss": 0.0245,
       "step": 340
     },
     {
-      "epoch": 0.92,
-      "grad_norm": 0.03619668632745743,
-      "learning_rate": 1.8593149918630925e-06,
-      "loss": 0.0193,
       "step": 345
     },
     {
-      "epoch": 0.9333333333333333,
-      "grad_norm": 0.053711794316768646,
-      "learning_rate": 1.3103290768099797e-06,
-      "loss": 0.0209,
       "step": 350
     },
     {
-      "epoch": 0.9466666666666667,
-      "grad_norm": 0.04455176740884781,
-      "learning_rate": 8.561208554101863e-07,
-      "loss": 0.0211,
       "step": 355
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 0.04045643284916878,
-      "learning_rate": 4.975744742772848e-07,
-      "loss": 0.0214,
       "step": 360
     },
     {
-      "epoch": 0.9733333333333334,
-      "grad_norm": 0.039244893938302994,
-      "learning_rate": 2.3538786786896915e-07,
-      "loss": 0.0194,
       "step": 365
     },
     {
-      "epoch": 0.9866666666666667,
-      "grad_norm": 0.03764765337109566,
-      "learning_rate": 7.007139991108135e-08,
-      "loss": 0.0246,
       "step": 370
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.04596920311450958,
-      "learning_rate": 1.9468699405444934e-09,
-      "loss": 0.0253,
       "step": 375
     }
   ],
   "logging_steps": 5,
-  "max_steps": 375,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -547,12 +1142,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 5.816262106860749e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.8921103986618344,
   "eval_steps": 500,
+  "global_step": 800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.005575689991636465,
+      "grad_norm": 0.16284966468811035,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 0.0626,
       "step": 5
     },
     {
+      "epoch": 0.01115137998327293,
+      "grad_norm": 0.10163773596286774,
+      "learning_rate": 1.6071428571428572e-05,
+      "loss": 0.051,
       "step": 10
     },
     {
+      "epoch": 0.016727069974909393,
+      "grad_norm": 0.12601175904273987,
+      "learning_rate": 2.5e-05,
+      "loss": 0.041,
       "step": 15
     },
     {
+      "epoch": 0.02230275996654586,
+      "grad_norm": 0.04396357387304306,
+      "learning_rate": 3.392857142857143e-05,
+      "loss": 0.0336,
       "step": 20
     },
     {
+      "epoch": 0.027878449958182325,
+      "grad_norm": 0.04541896656155586,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 0.03,
       "step": 25
     },
     {
+      "epoch": 0.03345413994981879,
+      "grad_norm": 0.035561174154281616,
+      "learning_rate": 5.1785714285714296e-05,
+      "loss": 0.026,
       "step": 30
     },
     {
+      "epoch": 0.039029829941455256,
+      "grad_norm": 0.04301896691322327,
+      "learning_rate": 6.0714285714285715e-05,
+      "loss": 0.0232,
       "step": 35
     },
     {
+      "epoch": 0.04460551993309172,
+      "grad_norm": 0.04101714491844177,
+      "learning_rate": 6.964285714285715e-05,
+      "loss": 0.0234,
       "step": 40
     },
     {
+      "epoch": 0.05018120992472819,
+      "grad_norm": 0.03225620836019516,
+      "learning_rate": 7.857142857142858e-05,
+      "loss": 0.0199,
       "step": 45
     },
     {
+      "epoch": 0.05575689991636465,
+      "grad_norm": 0.03156769648194313,
+      "learning_rate": 8.75e-05,
+      "loss": 0.02,
       "step": 50
     },
     {
+      "epoch": 0.06133258990800112,
+      "grad_norm": 0.03408223018050194,
+      "learning_rate": 9.642857142857143e-05,
+      "loss": 0.0227,
       "step": 55
     },
     {
+      "epoch": 0.06690827989963757,
+      "grad_norm": 0.031089385971426964,
+      "learning_rate": 9.999803846452024e-05,
+      "loss": 0.02,
       "step": 60
     },
     {
+      "epoch": 0.07248396989127405,
+      "grad_norm": 0.033274080604314804,
+      "learning_rate": 9.998605186060137e-05,
+      "loss": 0.014,
       "step": 65
     },
     {
+      "epoch": 0.07805965988291051,
+      "grad_norm": 0.02827683836221695,
+      "learning_rate": 9.996317100396068e-05,
+      "loss": 0.0202,
       "step": 70
     },
     {
+      "epoch": 0.08363534987454697,
+      "grad_norm": 0.037661969661712646,
+      "learning_rate": 9.992940088138597e-05,
+      "loss": 0.0222,
       "step": 75
     },
     {
+      "epoch": 0.08921103986618344,
+      "grad_norm": 0.04279659315943718,
+      "learning_rate": 9.988474885293544e-05,
+      "loss": 0.0186,
       "step": 80
     },
     {
+      "epoch": 0.0947867298578199,
+      "grad_norm": 0.026112260296940804,
+      "learning_rate": 9.98292246503335e-05,
+      "loss": 0.02,
       "step": 85
     },
     {
+      "epoch": 0.10036241984945637,
+      "grad_norm": 0.029179614037275314,
+      "learning_rate": 9.976284037484988e-05,
+      "loss": 0.0175,
       "step": 90
     },
     {
+      "epoch": 0.10593810984109284,
+      "grad_norm": 0.035265687853097916,
+      "learning_rate": 9.968561049466214e-05,
+      "loss": 0.0163,
       "step": 95
     },
     {
+      "epoch": 0.1115137998327293,
+      "grad_norm": 0.02659149281680584,
+      "learning_rate": 9.95975518417024e-05,
+      "loss": 0.0134,
       "step": 100
     },
     {
+      "epoch": 0.11708948982436576,
+      "grad_norm": 0.03535303473472595,
+      "learning_rate": 9.949868360798893e-05,
+      "loss": 0.0174,
       "step": 105
     },
     {
+      "epoch": 0.12266517981600224,
+      "grad_norm": 0.028047222644090652,
+      "learning_rate": 9.938902734144326e-05,
+      "loss": 0.014,
       "step": 110
     },
     {
+      "epoch": 0.12824086980763869,
+      "grad_norm": 0.025049524381756783,
+      "learning_rate": 9.926860694119398e-05,
+      "loss": 0.014,
       "step": 115
     },
     {
+      "epoch": 0.13381655979927515,
+      "grad_norm": 0.03536759689450264,
+      "learning_rate": 9.913744865236798e-05,
+      "loss": 0.0161,
       "step": 120
     },
     {
+      "epoch": 0.13939224979091164,
+      "grad_norm": 0.03142830356955528,
+      "learning_rate": 9.899558106037039e-05,
+      "loss": 0.016,
       "step": 125
     },
     {
+      "epoch": 0.1449679397825481,
+      "grad_norm": 0.029837962239980698,
+      "learning_rate": 9.884303508465463e-05,
+      "loss": 0.017,
       "step": 130
     },
     {
+      "epoch": 0.15054362977418456,
+      "grad_norm": 0.050843242555856705,
+      "learning_rate": 9.867984397198348e-05,
+      "loss": 0.0209,
       "step": 135
     },
     {
+      "epoch": 0.15611931976582102,
+      "grad_norm": 0.022020680829882622,
+      "learning_rate": 9.85060432891833e-05,
+      "loss": 0.013,
       "step": 140
     },
     {
+      "epoch": 0.1616950097574575,
+      "grad_norm": 0.03178994357585907,
+      "learning_rate": 9.832167091539214e-05,
+      "loss": 0.0144,
       "step": 145
     },
     {
+      "epoch": 0.16727069974909395,
+      "grad_norm": 0.030585451051592827,
+      "learning_rate": 9.812676703380433e-05,
+      "loss": 0.0144,
       "step": 150
     },
     {
+      "epoch": 0.1728463897407304,
+      "grad_norm": 0.02348562888801098,
+      "learning_rate": 9.792137412291265e-05,
+      "loss": 0.0137,
       "step": 155
     },
     {
+      "epoch": 0.17842207973236687,
+      "grad_norm": 0.028541121631860733,
+      "learning_rate": 9.770553694725028e-05,
+      "loss": 0.0169,
       "step": 160
     },
     {
+      "epoch": 0.18399776972400333,
+      "grad_norm": 0.02496708557009697,
+      "learning_rate": 9.747930254763467e-05,
+      "loss": 0.013,
       "step": 165
     },
     {
+      "epoch": 0.1895734597156398,
+      "grad_norm": 0.03572074696421623,
+      "learning_rate": 9.724272023091503e-05,
+      "loss": 0.0164,
       "step": 170
     },
     {
+      "epoch": 0.1951491497072763,
+      "grad_norm": 0.02730608731508255,
+      "learning_rate": 9.699584155922625e-05,
+      "loss": 0.0135,
       "step": 175
     },
     {
+      "epoch": 0.20072483969891275,
+      "grad_norm": 0.03099130466580391,
+      "learning_rate": 9.673872033875109e-05,
+      "loss": 0.0157,
       "step": 180
     },
     {
+      "epoch": 0.2063005296905492,
+      "grad_norm": 0.031458914279937744,
+      "learning_rate": 9.64714126079933e-05,
+      "loss": 0.0138,
       "step": 185
     },
     {
+      "epoch": 0.21187621968218567,
+      "grad_norm": 0.03125375509262085,
+      "learning_rate": 9.619397662556435e-05,
+      "loss": 0.0114,
       "step": 190
     },
     {
+      "epoch": 0.21745190967382214,
+      "grad_norm": 0.031778380274772644,
+      "learning_rate": 9.590647285748613e-05,
+      "loss": 0.0117,
       "step": 195
     },
     {
+      "epoch": 0.2230275996654586,
+      "grad_norm": 0.019305897876620293,
+      "learning_rate": 9.56089639640127e-05,
+      "loss": 0.0143,
       "step": 200
     },
     {
+      "epoch": 0.22860328965709506,
+      "grad_norm": 0.02124331146478653,
+      "learning_rate": 9.530151478597366e-05,
+      "loss": 0.0135,
       "step": 205
     },
     {
+      "epoch": 0.23417897964873152,
+      "grad_norm": 0.033200375735759735,
+      "learning_rate": 9.498419233064246e-05,
+      "loss": 0.0143,
       "step": 210
     },
     {
+      "epoch": 0.23975466964036798,
+      "grad_norm": 0.03514528647065163,
+      "learning_rate": 9.465706575713236e-05,
+      "loss": 0.0204,
       "step": 215
     },
     {
+      "epoch": 0.24533035963200447,
+      "grad_norm": 0.029311848804354668,
+      "learning_rate": 9.432020636132354e-05,
+      "loss": 0.0176,
       "step": 220
     },
     {
+      "epoch": 0.25090604962364094,
+      "grad_norm": 0.030977483838796616,
+      "learning_rate": 9.397368756032445e-05,
+      "loss": 0.0146,
       "step": 225
     },
     {
+      "epoch": 0.25648173961527737,
+      "grad_norm": 0.025566186755895615,
+      "learning_rate": 9.361758487647082e-05,
+      "loss": 0.0136,
       "step": 230
     },
     {
+      "epoch": 0.26205742960691386,
+      "grad_norm": 0.02457290142774582,
+      "learning_rate": 9.32519759208659e-05,
+      "loss": 0.0152,
       "step": 235
     },
     {
+      "epoch": 0.2676331195985503,
+      "grad_norm": 0.023812102153897285,
+      "learning_rate": 9.287694037646548e-05,
+      "loss": 0.0148,
       "step": 240
     },
     {
+      "epoch": 0.2732088095901868,
+      "grad_norm": 0.023294365033507347,
+      "learning_rate": 9.249255998071126e-05,
+      "loss": 0.0123,
       "step": 245
     },
     {
+      "epoch": 0.2787844995818233,
+      "grad_norm": 0.018993759527802467,
+      "learning_rate": 9.209891850771657e-05,
+      "loss": 0.0099,
       "step": 250
     },
     {
+      "epoch": 0.2843601895734597,
+      "grad_norm": 0.034646522253751755,
+      "learning_rate": 9.169610175000812e-05,
+      "loss": 0.0139,
       "step": 255
     },
     {
+      "epoch": 0.2899358795650962,
+      "grad_norm": 0.029509609565138817,
+      "learning_rate": 9.12841974998278e-05,
+      "loss": 0.0117,
       "step": 260
     },
     {
+      "epoch": 0.29551156955673263,
+      "grad_norm": 0.024864595383405685,
+      "learning_rate": 9.086329552999891e-05,
+      "loss": 0.0146,
       "step": 265
     },
     {
+      "epoch": 0.3010872595483691,
+      "grad_norm": 0.023953670635819435,
+      "learning_rate": 9.043348757436037e-05,
+      "loss": 0.0131,
       "step": 270
     },
     {
+      "epoch": 0.30666294954000556,
+      "grad_norm": 0.019653376191854477,
+      "learning_rate": 8.99948673077738e-05,
+      "loss": 0.0107,
       "step": 275
     },
     {
+      "epoch": 0.31223863953164205,
+      "grad_norm": 0.03767814487218857,
+      "learning_rate": 8.954753032570742e-05,
+      "loss": 0.0143,
       "step": 280
     },
     {
+      "epoch": 0.3178143295232785,
+      "grad_norm": 0.021987926214933395,
+      "learning_rate": 8.90915741234015e-05,
+      "loss": 0.0098,
       "step": 285
     },
     {
+      "epoch": 0.323390019514915,
+      "grad_norm": 0.021244384348392487,
+      "learning_rate": 8.862709807461956e-05,
+      "loss": 0.0106,
       "step": 290
     },
     {
+      "epoch": 0.32896570950655146,
+      "grad_norm": 0.02844163216650486,
+      "learning_rate": 8.815420340999033e-05,
+      "loss": 0.0162,
       "step": 295
     },
     {
+      "epoch": 0.3345413994981879,
+      "grad_norm": 0.02485722117125988,
+      "learning_rate": 8.767299319494503e-05,
+      "loss": 0.0164,
       "step": 300
     },
     {
+      "epoch": 0.3401170894898244,
+      "grad_norm": 0.019627615809440613,
+      "learning_rate": 8.718357230725449e-05,
+      "loss": 0.0139,
       "step": 305
     },
     {
+      "epoch": 0.3456927794814608,
+      "grad_norm": 0.02517726831138134,
+      "learning_rate": 8.668604741417171e-05,
+      "loss": 0.0128,
       "step": 310
     },
     {
+      "epoch": 0.3512684694730973,
+      "grad_norm": 0.02175074815750122,
+      "learning_rate": 8.618052694918399e-05,
+      "loss": 0.0111,
       "step": 315
     },
     {
+      "epoch": 0.35684415946473375,
+      "grad_norm": 0.021222930401563644,
+      "learning_rate": 8.566712108838042e-05,
+      "loss": 0.0111,
       "step": 320
     },
     {
+      "epoch": 0.36241984945637024,
+      "grad_norm": 0.024494808167219162,
+      "learning_rate": 8.514594172643934e-05,
+      "loss": 0.0138,
       "step": 325
     },
     {
+      "epoch": 0.36799553944800667,
+      "grad_norm": 0.022174010053277016,
+      "learning_rate": 8.461710245224148e-05,
+      "loss": 0.0134,
       "step": 330
     },
     {
+      "epoch": 0.37357122943964316,
+      "grad_norm": 0.01959528774023056,
+      "learning_rate": 8.40807185241137e-05,
+      "loss": 0.0102,
       "step": 335
     },
     {
+      "epoch": 0.3791469194312796,
+      "grad_norm": 0.017945902422070503,
+      "learning_rate": 8.353690684470884e-05,
+      "loss": 0.0143,
       "step": 340
     },
     {
+      "epoch": 0.3847226094229161,
+      "grad_norm": 0.020864926278591156,
+      "learning_rate": 8.298578593552737e-05,
+      "loss": 0.0179,
       "step": 345
     },
     {
+      "epoch": 0.3902982994145526,
+      "grad_norm": 0.027325566858053207,
+      "learning_rate": 8.242747591108605e-05,
+      "loss": 0.0133,
       "step": 350
     },
     {
+      "epoch": 0.395873989406189,
+      "grad_norm": 0.019658569246530533,
+      "learning_rate": 8.186209845273954e-05,
+      "loss": 0.0139,
       "step": 355
     },
     {
+      "epoch": 0.4014496793978255,
+      "grad_norm": 0.02014886401593685,
+      "learning_rate": 8.128977678216039e-05,
+      "loss": 0.009,
       "step": 360
     },
     {
+      "epoch": 0.40702536938946193,
+      "grad_norm": 0.02425803802907467,
+      "learning_rate": 8.07106356344834e-05,
+      "loss": 0.0125,
       "step": 365
     },
     {
+      "epoch": 0.4126010593810984,
+      "grad_norm": 0.030235106125473976,
+      "learning_rate": 8.012480123112014e-05,
+      "loss": 0.0171,
       "step": 370
     },
     {
+      "epoch": 0.41817674937273486,
+      "grad_norm": 0.022229960188269615,
+      "learning_rate": 7.953240125224948e-05,
+      "loss": 0.0116,
       "step": 375
+    },
+    {
+      "epoch": 0.42375243936437135,
+      "grad_norm": 0.025448938831686974,
+      "learning_rate": 7.89335648089903e-05,
+      "loss": 0.0142,
+      "step": 380
+    },
+    {
+      "epoch": 0.4293281293560078,
+      "grad_norm": 0.023552658036351204,
+      "learning_rate": 7.832842241526212e-05,
+      "loss": 0.0147,
+      "step": 385
+    },
+    {
+      "epoch": 0.43490381934764427,
+      "grad_norm": 0.019487692043185234,
+      "learning_rate": 7.77171059593403e-05,
+      "loss": 0.0115,
+      "step": 390
+    },
+    {
+      "epoch": 0.44047950933928076,
+      "grad_norm": 0.021791953593492508,
+      "learning_rate": 7.709974867511138e-05,
+      "loss": 0.012,
+      "step": 395
+    },
+    {
+      "epoch": 0.4460551993309172,
+      "grad_norm": 0.02281327173113823,
+      "learning_rate": 7.647648511303544e-05,
+      "loss": 0.0126,
+      "step": 400
+    },
+    {
+      "epoch": 0.4516308893225537,
+      "grad_norm": 0.02623576670885086,
+      "learning_rate": 7.584745111082127e-05,
+      "loss": 0.0128,
+      "step": 405
+    },
+    {
+      "epoch": 0.4572065793141901,
+      "grad_norm": 0.019894316792488098,
+      "learning_rate": 7.521278376382123e-05,
+      "loss": 0.0092,
+      "step": 410
+    },
+    {
+      "epoch": 0.4627822693058266,
+      "grad_norm": 0.022427916526794434,
+      "learning_rate": 7.457262139515171e-05,
+      "loss": 0.0111,
+      "step": 415
+    },
+    {
+      "epoch": 0.46835795929746304,
+      "grad_norm": 0.021067511290311813,
+      "learning_rate": 7.392710352554641e-05,
+      "loss": 0.0099,
+      "step": 420
+    },
+    {
+      "epoch": 0.47393364928909953,
+      "grad_norm": 0.019623806700110435,
+      "learning_rate": 7.327637084294817e-05,
+      "loss": 0.012,
+      "step": 425
+    },
+    {
+      "epoch": 0.47950933928073597,
+      "grad_norm": 0.02039971947669983,
+      "learning_rate": 7.262056517184669e-05,
+      "loss": 0.0138,
+      "step": 430
+    },
+    {
+      "epoch": 0.48508502927237246,
+      "grad_norm": 0.021388281136751175,
+      "learning_rate": 7.195982944236851e-05,
+      "loss": 0.0123,
+      "step": 435
+    },
+    {
+      "epoch": 0.49066071926400895,
+      "grad_norm": 0.022272834554314613,
+      "learning_rate": 7.1294307659126e-05,
+      "loss": 0.0126,
+      "step": 440
+    },
+    {
+      "epoch": 0.4962364092556454,
+      "grad_norm": 0.02803129144012928,
+      "learning_rate": 7.062414486983197e-05,
+      "loss": 0.0118,
+      "step": 445
+    },
+    {
+      "epoch": 0.5018120992472819,
+      "grad_norm": 0.025339094921946526,
+      "learning_rate": 6.994948713368737e-05,
+      "loss": 0.0147,
+      "step": 450
+    },
+    {
+      "epoch": 0.5073877892389184,
+      "grad_norm": 0.024465398862957954,
+      "learning_rate": 6.927048148954812e-05,
+      "loss": 0.0118,
+      "step": 455
+    },
+    {
+      "epoch": 0.5129634792305547,
+      "grad_norm": 0.025315098464488983,
+      "learning_rate": 6.858727592387867e-05,
+      "loss": 0.0165,
+      "step": 460
+    },
+    {
+      "epoch": 0.5185391692221912,
+      "grad_norm": 0.020109234377741814,
+      "learning_rate": 6.790001933849899e-05,
+      "loss": 0.0108,
+      "step": 465
+    },
+    {
+      "epoch": 0.5241148592138277,
+      "grad_norm": 0.01888495869934559,
+      "learning_rate": 6.720886151813194e-05,
+      "loss": 0.0097,
+      "step": 470
+    },
+    {
+      "epoch": 0.5296905492054642,
+      "grad_norm": 0.023433005437254906,
+      "learning_rate": 6.651395309775837e-05,
+      "loss": 0.0122,
+      "step": 475
+    },
+    {
+      "epoch": 0.5352662391971006,
+      "grad_norm": 0.019298607483506203,
+      "learning_rate": 6.581544552978687e-05,
+      "loss": 0.0134,
+      "step": 480
+    },
+    {
+      "epoch": 0.5408419291887371,
+      "grad_norm": 0.025588713586330414,
+      "learning_rate": 6.511349105104534e-05,
+      "loss": 0.0108,
+      "step": 485
+    },
+    {
+      "epoch": 0.5464176191803736,
+      "grad_norm": 0.03148540109395981,
+      "learning_rate": 6.440824264960157e-05,
+      "loss": 0.0115,
+      "step": 490
+    },
+    {
+      "epoch": 0.5519933091720101,
+      "grad_norm": 0.01748904027044773,
+      "learning_rate": 6.369985403142014e-05,
+      "loss": 0.0112,
+      "step": 495
+    },
+    {
+      "epoch": 0.5575689991636466,
+      "grad_norm": 0.027883267030119896,
+      "learning_rate": 6.298847958686283e-05,
+      "loss": 0.0125,
+      "step": 500
+    },
+    {
+      "epoch": 0.5631446891552829,
+      "grad_norm": 0.02129966951906681,
+      "learning_rate": 6.227427435703997e-05,
+      "loss": 0.0149,
+      "step": 505
+    },
+    {
+      "epoch": 0.5687203791469194,
+      "grad_norm": 0.02562125027179718,
+      "learning_rate": 6.15573940000197e-05,
+      "loss": 0.0136,
+      "step": 510
+    },
+    {
+      "epoch": 0.5742960691385559,
+      "grad_norm": 0.02446940541267395,
+      "learning_rate": 6.083799475690309e-05,
+      "loss": 0.0112,
+      "step": 515
+    },
+    {
+      "epoch": 0.5798717591301924,
+      "grad_norm": 0.024061646312475204,
+      "learning_rate": 6.0116233417771994e-05,
+      "loss": 0.0115,
+      "step": 520
+    },
+    {
+      "epoch": 0.5854474491218288,
+      "grad_norm": 0.01640748232603073,
+      "learning_rate": 5.9392267287517325e-05,
+      "loss": 0.0103,
+      "step": 525
+    },
+    {
+      "epoch": 0.5910231391134653,
+      "grad_norm": 0.023191062733530998,
+      "learning_rate": 5.8666254151554976e-05,
+      "loss": 0.0113,
+      "step": 530
+    },
+    {
+      "epoch": 0.5965988291051018,
+      "grad_norm": 0.017516452819108963,
+      "learning_rate": 5.7938352241437366e-05,
+      "loss": 0.0093,
+      "step": 535
+    },
+    {
+      "epoch": 0.6021745190967382,
+      "grad_norm": 0.019351812079548836,
+      "learning_rate": 5.720872020036734e-05,
+      "loss": 0.0125,
+      "step": 540
+    },
+    {
+      "epoch": 0.6077502090883747,
+      "grad_norm": 0.029706666246056557,
+      "learning_rate": 5.647751704862263e-05,
+      "loss": 0.008,
+      "step": 545
+    },
+    {
+      "epoch": 0.6133258990800111,
+      "grad_norm": 0.016750017181038857,
+      "learning_rate": 5.5744902148898005e-05,
+      "loss": 0.0118,
+      "step": 550
+    },
+    {
+      "epoch": 0.6189015890716476,
+      "grad_norm": 0.022833596915006638,
+      "learning_rate": 5.501103517157288e-05,
+      "loss": 0.0088,
+      "step": 555
+    },
+    {
+      "epoch": 0.6244772790632841,
+      "grad_norm": 0.03475171700119972,
+      "learning_rate": 5.427607605991176e-05,
+      "loss": 0.0136,
+      "step": 560
+    },
+    {
+      "epoch": 0.6300529690549206,
+      "grad_norm": 0.021340183913707733,
+      "learning_rate": 5.354018499520536e-05,
+      "loss": 0.0103,
+      "step": 565
+    },
+    {
+      "epoch": 0.635628659046557,
+      "grad_norm": 0.024497641250491142,
+      "learning_rate": 5.2803522361859594e-05,
+      "loss": 0.011,
+      "step": 570
+    },
+    {
+      "epoch": 0.6412043490381935,
+      "grad_norm": 0.01924068294465542,
+      "learning_rate": 5.2066248712440656e-05,
+      "loss": 0.0125,
+      "step": 575
+    },
+    {
+      "epoch": 0.64678003902983,
+      "grad_norm": 0.017638731747865677,
+      "learning_rate": 5.1328524732683134e-05,
+      "loss": 0.0104,
+      "step": 580
+    },
+    {
+      "epoch": 0.6523557290214664,
+      "grad_norm": 0.022175751626491547,
+      "learning_rate": 5.059051120646924e-05,
+      "loss": 0.0128,
+      "step": 585
+    },
+    {
+      "epoch": 0.6579314190131029,
+      "grad_norm": 0.022414250299334526,
+      "learning_rate": 4.985236898078658e-05,
+      "loss": 0.0128,
+      "step": 590
+    },
+    {
+      "epoch": 0.6635071090047393,
+      "grad_norm": 0.020940134301781654,
+      "learning_rate": 4.911425893067239e-05,
+      "loss": 0.0124,
+      "step": 595
+    },
+    {
+      "epoch": 0.6690827989963758,
+      "grad_norm": 0.021777737885713577,
+      "learning_rate": 4.837634192415128e-05,
+      "loss": 0.0126,
+      "step": 600
+    },
+    {
+      "epoch": 0.6746584889880123,
+      "grad_norm": 0.01768389716744423,
+      "learning_rate": 4.763877878717484e-05,
+      "loss": 0.0095,
+      "step": 605
+    },
+    {
+      "epoch": 0.6802341789796488,
+      "grad_norm": 0.02011968567967415,
+      "learning_rate": 4.6901730268570275e-05,
+      "loss": 0.0093,
+      "step": 610
+    },
+    {
+      "epoch": 0.6858098689712852,
+      "grad_norm": 0.02239886298775673,
+      "learning_rate": 4.616535700500583e-05,
+      "loss": 0.0126,
+      "step": 615
+    },
+    {
+      "epoch": 0.6913855589629216,
+      "grad_norm": 0.022233402356505394,
+      "learning_rate": 4.542981948598071e-05,
+      "loss": 0.0107,
+      "step": 620
+    },
+    {
+      "epoch": 0.6969612489545581,
+      "grad_norm": 0.027380308136343956,
+      "learning_rate": 4.4695278018847105e-05,
+      "loss": 0.0142,
+      "step": 625
+    },
+    {
+      "epoch": 0.7025369389461946,
+      "grad_norm": 0.025442643091082573,
+      "learning_rate": 4.396189269387176e-05,
+      "loss": 0.0153,
+      "step": 630
+    },
+    {
+      "epoch": 0.708112628937831,
+      "grad_norm": 0.01852184161543846,
+      "learning_rate": 4.322982334934509e-05,
+      "loss": 0.0102,
+      "step": 635
+    },
+    {
+      "epoch": 0.7136883189294675,
+      "grad_norm": 0.01528929267078638,
+      "learning_rate": 4.2499229536744986e-05,
+      "loss": 0.0097,
+      "step": 640
+    },
+    {
+      "epoch": 0.719264008921104,
+      "grad_norm": 0.026008352637290955,
+      "learning_rate": 4.17702704859633e-05,
+      "loss": 0.0159,
+      "step": 645
+    },
+    {
+      "epoch": 0.7248396989127405,
+      "grad_norm": 0.018146734684705734,
+      "learning_rate": 4.104310507060234e-05,
+      "loss": 0.0095,
+      "step": 650
+    },
+    {
+      "epoch": 0.730415388904377,
+      "grad_norm": 0.022718293592333794,
+      "learning_rate": 4.0317891773348946e-05,
+      "loss": 0.0095,
+      "step": 655
+    },
+    {
+      "epoch": 0.7359910788960133,
+      "grad_norm": 0.02410387434065342,
+      "learning_rate": 3.959478865143397e-05,
+      "loss": 0.0109,
+      "step": 660
+    },
+    {
+      "epoch": 0.7415667688876498,
+      "grad_norm": 0.017437651753425598,
+      "learning_rate": 3.887395330218429e-05,
+      "loss": 0.0107,
+      "step": 665
+    },
+    {
+      "epoch": 0.7471424588792863,
+      "grad_norm": 0.020500419661402702,
+      "learning_rate": 3.815554282867513e-05,
+      "loss": 0.0107,
+      "step": 670
+    },
+    {
+      "epoch": 0.7527181488709228,
+      "grad_norm": 0.01553898025304079,
+      "learning_rate": 3.743971380549008e-05,
+      "loss": 0.0083,
+      "step": 675
+    },
+    {
+      "epoch": 0.7582938388625592,
+      "grad_norm": 0.020166153088212013,
+      "learning_rate": 3.67266222445964e-05,
+      "loss": 0.0111,
+      "step": 680
+    },
+    {
+      "epoch": 0.7638695288541957,
+      "grad_norm": 0.023076798766851425,
+      "learning_rate": 3.6016423561342706e-05,
+      "loss": 0.0128,
+      "step": 685
+    },
+    {
+      "epoch": 0.7694452188458322,
+      "grad_norm": 0.02140919119119644,
+      "learning_rate": 3.5309272540587e-05,
+      "loss": 0.0104,
+      "step": 690
+    },
+    {
+      "epoch": 0.7750209088374687,
+      "grad_norm": 0.016010567545890808,
+      "learning_rate": 3.4605323302961854e-05,
+      "loss": 0.0135,
+      "step": 695
+    },
+    {
+      "epoch": 0.7805965988291051,
+      "grad_norm": 0.017510782927274704,
+      "learning_rate": 3.3904729271284473e-05,
+      "loss": 0.0115,
+      "step": 700
+    },
+    {
+      "epoch": 0.7861722888207415,
+      "grad_norm": 0.024468230083584785,
+      "learning_rate": 3.3207643137118874e-05,
+      "loss": 0.01,
+      "step": 705
+    },
+    {
+      "epoch": 0.791747978812378,
+      "grad_norm": 0.020976202562451363,
+      "learning_rate": 3.251421682749732e-05,
+      "loss": 0.0114,
+      "step": 710
+    },
+    {
+      "epoch": 0.7973236688040145,
+      "grad_norm": 0.02256660722196102,
+      "learning_rate": 3.18246014718085e-05,
+      "loss": 0.0108,
+      "step": 715
+    },
+    {
+      "epoch": 0.802899358795651,
+      "grad_norm": 0.030024701729416847,
+      "learning_rate": 3.113894736885953e-05,
+      "loss": 0.0104,
+      "step": 720
+    },
+    {
+      "epoch": 0.8084750487872874,
+      "grad_norm": 0.019312310963869095,
+      "learning_rate": 3.0457403954118856e-05,
+      "loss": 0.0082,
+      "step": 725
+    },
+    {
+      "epoch": 0.8140507387789239,
+      "grad_norm": 0.02008041925728321,
+      "learning_rate": 2.978011976714753e-05,
+      "loss": 0.0099,
+      "step": 730
+    },
+    {
+      "epoch": 0.8196264287705604,
+      "grad_norm": 0.021896323189139366,
+      "learning_rate": 2.9107242419225577e-05,
+      "loss": 0.0143,
+      "step": 735
+    },
+    {
+      "epoch": 0.8252021187621968,
+      "grad_norm": 0.019579166546463966,
+      "learning_rate": 2.8438918561180634e-05,
+      "loss": 0.0106,
+      "step": 740
+    },
+    {
+      "epoch": 0.8307778087538333,
+      "grad_norm": 0.02049921080470085,
+      "learning_rate": 2.7775293851426232e-05,
+      "loss": 0.0115,
+      "step": 745
+    },
+    {
+      "epoch": 0.8363534987454697,
+      "grad_norm": 0.014969157055020332,
+      "learning_rate": 2.711651292421593e-05,
+      "loss": 0.0101,
+      "step": 750
+    },
+    {
+      "epoch": 0.8419291887371062,
+      "grad_norm": 0.020416075363755226,
+      "learning_rate": 2.646271935812098e-05,
+      "loss": 0.0098,
+      "step": 755
+    },
+    {
+      "epoch": 0.8475048787287427,
+      "grad_norm": 0.018367785960435867,
+      "learning_rate": 2.581405564473801e-05,
+      "loss": 0.0165,
+      "step": 760
+    },
+    {
+      "epoch": 0.8530805687203792,
+      "grad_norm": 0.0190111193805933,
+      "learning_rate": 2.5170663157633477e-05,
+      "loss": 0.0135,
+      "step": 765
+    },
+    {
+      "epoch": 0.8586562587120156,
+      "grad_norm": 0.024806899949908257,
+      "learning_rate": 2.45326821215319e-05,
+      "loss": 0.0116,
+      "step": 770
+    },
+    {
+      "epoch": 0.864231948703652,
+      "grad_norm": 0.02073819749057293,
+      "learning_rate": 2.390025158175458e-05,
+      "loss": 0.0129,
+      "step": 775
+    },
+    {
+      "epoch": 0.8698076386952885,
+      "grad_norm": 0.02042596973478794,
+      "learning_rate": 2.3273509373915093e-05,
+      "loss": 0.0088,
+      "step": 780
+    },
+    {
+      "epoch": 0.875383328686925,
+      "grad_norm": 0.015911240130662918,
+      "learning_rate": 2.2652592093878666e-05,
+      "loss": 0.01,
+      "step": 785
+    },
+    {
+      "epoch": 0.8809590186785615,
+      "grad_norm": 0.023589760065078735,
+      "learning_rate": 2.2037635067991663e-05,
+      "loss": 0.0107,
+      "step": 790
+    },
+    {
+      "epoch": 0.8865347086701979,
+      "grad_norm": 0.017796384170651436,
+      "learning_rate": 2.1428772323587827e-05,
+      "loss": 0.0103,
+      "step": 795
+    },
+    {
+      "epoch": 0.8921103986618344,
+      "grad_norm": 0.018958481028676033,
+      "learning_rate": 2.082613655977745e-05,
+      "loss": 0.0079,
+      "step": 800
     }
   ],
   "logging_steps": 5,
+  "max_steps": 1120,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 2.728813755050754e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51c4eab5dcfa36e42ea1871a443badd7333ec1d0f3b46f9485f900e1f6e3db2c
 size 7825

 version https://git-lfs.github.com/spec/v1
+oid sha256:8568d83effc9ae0e86eecad81d4c2bc1c32496e167d72cebf2a94b86d0aa123c
 size 7825