Model save

Browse files

Files changed (9) hide show

README.md +1 -1
all_results.json +5 -5
model.safetensors +1 -1
runs/Feb14_06-44-19_yekyung-nah-0/events.out.tfevents.1739483078.yekyung-nah-0.3735356.0 +2 -2
runs/Feb14_06-47-41_yekyung-nah-0/events.out.tfevents.1739483278.yekyung-nah-0.3747757.0 +3 -0
runs/Feb14_06-47-47_yekyung-nah-0/events.out.tfevents.1739483287.yekyung-nah-0.3748246.0 +3 -0
train_results.json +5 -5
trainer_state.json +444 -493
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -27,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yknxxh-seoul-national-university/huggingface/runs/zn4j6jmk)
 This model was trained with SFT.

 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yknxxh-seoul-national-university/huggingface/runs/qk077co9)
 This model was trained with SFT.

all_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "total_flos": 2.7685804472008704e+16,
-    "train_loss": 1.0384218392182714,
-    "train_runtime": 242.5709,
     "train_samples": 10000,
-    "train_samples_per_second": 38.179,
-    "train_steps_per_second": 2.387
 }

 {
     "epoch": 3.0,
+    "total_flos": 2.5856227500097536e+16,
+    "train_loss": 1.017678025019103,
+    "train_runtime": 224.2528,
     "train_samples": 10000,
+    "train_samples_per_second": 38.568,
+    "train_steps_per_second": 2.421
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:95a541f3397f3ea70e4797acd153ad3d579b4b029fa3dbbe7c2edb68c03c5457
 size 2471645608

 version https://git-lfs.github.com/spec/v1
+oid sha256:6e1d80a91a84e41f750f86c8e62de9533f001784a64e4505162111138a38af21
 size 2471645608

runs/Feb14_06-44-19_yekyung-nah-0/events.out.tfevents.1739483078.yekyung-nah-0.3735356.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:35ab42525d078c3a6d2274c74a43ae0a90a88dfc1bc107d7fa43542431819df7
-size 16266

 version https://git-lfs.github.com/spec/v1
+oid sha256:0619ad61ddd9702259f41551a76d2a8d20c1d373543c7fe2cfc85b3cc05f2882
+size 28858

runs/Feb14_06-47-41_yekyung-nah-0/events.out.tfevents.1739483278.yekyung-nah-0.3747757.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1356a3b87766e684961517e4490bb72c4798f8f3eb9a3c37bd1f1e92a3fc45f6
+size 10577

runs/Feb14_06-47-47_yekyung-nah-0/events.out.tfevents.1739483287.yekyung-nah-0.3748246.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f9a74a3bb1e50421899df919e0ee24f13c88d5b1c4097baaf2aed112704f7ef
+size 9956

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "total_flos": 2.7685804472008704e+16,
-    "train_loss": 1.0384218392182714,
-    "train_runtime": 242.5709,
     "train_samples": 10000,
-    "train_samples_per_second": 38.179,
-    "train_steps_per_second": 2.387
 }

 {
     "epoch": 3.0,
+    "total_flos": 2.5856227500097536e+16,
+    "train_loss": 1.017678025019103,
+    "train_runtime": 224.2528,
     "train_samples": 10000,
+    "train_samples_per_second": 38.568,
+    "train_steps_per_second": 2.421
 }

trainer_state.json CHANGED Viewed

@@ -3,835 +3,786 @@
   "best_model_checkpoint": null,
   "epoch": 3.0,
   "eval_steps": 500,
-  "global_step": 579,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0051813471502590676,
-      "grad_norm": 7.28125,
-      "learning_rate": 3.4482758620689656e-07,
-      "loss": 2.3287,
       "step": 1
     },
     {
-      "epoch": 0.025906735751295335,
-      "grad_norm": 7.125,
-      "learning_rate": 1.724137931034483e-06,
-      "loss": 2.3778,
       "step": 5
     },
     {
-      "epoch": 0.05181347150259067,
-      "grad_norm": 7.34375,
-      "learning_rate": 3.448275862068966e-06,
-      "loss": 2.3462,
       "step": 10
     },
     {
-      "epoch": 0.07772020725388601,
-      "grad_norm": 7.625,
-      "learning_rate": 5.172413793103449e-06,
-      "loss": 2.303,
       "step": 15
     },
     {
-      "epoch": 0.10362694300518134,
-      "grad_norm": 6.0625,
-      "learning_rate": 6.896551724137932e-06,
-      "loss": 2.2303,
       "step": 20
     },
     {
-      "epoch": 0.12953367875647667,
-      "grad_norm": 6.03125,
-      "learning_rate": 8.620689655172414e-06,
-      "loss": 2.1691,
       "step": 25
     },
     {
-      "epoch": 0.15544041450777202,
-      "grad_norm": 3.921875,
-      "learning_rate": 1.0344827586206898e-05,
-      "loss": 1.9911,
       "step": 30
     },
     {
-      "epoch": 0.18134715025906736,
-      "grad_norm": 3.3125,
-      "learning_rate": 1.206896551724138e-05,
-      "loss": 1.8287,
       "step": 35
     },
     {
-      "epoch": 0.20725388601036268,
-      "grad_norm": 2.5,
-      "learning_rate": 1.3793103448275863e-05,
-      "loss": 1.7025,
       "step": 40
     },
     {
-      "epoch": 0.23316062176165803,
-      "grad_norm": 2.3125,
-      "learning_rate": 1.5517241379310346e-05,
-      "loss": 1.5651,
       "step": 45
     },
     {
-      "epoch": 0.25906735751295334,
-      "grad_norm": 1.875,
-      "learning_rate": 1.7241379310344828e-05,
-      "loss": 1.4727,
       "step": 50
     },
     {
-      "epoch": 0.2849740932642487,
-      "grad_norm": 2.03125,
-      "learning_rate": 1.896551724137931e-05,
-      "loss": 1.3068,
       "step": 55
     },
     {
-      "epoch": 0.31088082901554404,
-      "grad_norm": 1.2578125,
-      "learning_rate": 1.9999272808103276e-05,
-      "loss": 1.1946,
       "step": 60
     },
     {
-      "epoch": 0.33678756476683935,
       "grad_norm": 1.0078125,
-      "learning_rate": 1.9991093113822542e-05,
-      "loss": 1.1095,
       "step": 65
     },
     {
-      "epoch": 0.3626943005181347,
-      "grad_norm": 0.91015625,
-      "learning_rate": 1.997383219496441e-05,
-      "loss": 1.0579,
       "step": 70
     },
     {
-      "epoch": 0.38860103626943004,
-      "grad_norm": 0.73046875,
-      "learning_rate": 1.9947505740530947e-05,
-      "loss": 1.0251,
       "step": 75
     },
     {
-      "epoch": 0.41450777202072536,
-      "grad_norm": 0.7421875,
-      "learning_rate": 1.991213767947991e-05,
-      "loss": 1.0027,
       "step": 80
     },
     {
-      "epoch": 0.44041450777202074,
-      "grad_norm": 0.71875,
-      "learning_rate": 1.986776015897494e-05,
-      "loss": 1.0183,
       "step": 85
     },
     {
-      "epoch": 0.46632124352331605,
-      "grad_norm": 0.67578125,
-      "learning_rate": 1.9814413515165974e-05,
-      "loss": 1.0133,
       "step": 90
     },
     {
-      "epoch": 0.49222797927461137,
-      "grad_norm": 0.640625,
-      "learning_rate": 1.975214623652643e-05,
-      "loss": 0.9781,
       "step": 95
     },
     {
-      "epoch": 0.5181347150259067,
-      "grad_norm": 0.6640625,
-      "learning_rate": 1.968101491978049e-05,
-      "loss": 0.9952,
       "step": 100
     },
     {
-      "epoch": 0.5440414507772021,
-      "grad_norm": 0.6875,
-      "learning_rate": 1.9601084218460494e-05,
-      "loss": 0.9748,
       "step": 105
     },
     {
-      "epoch": 0.5699481865284974,
-      "grad_norm": 0.6171875,
-      "learning_rate": 1.9512426784141307e-05,
-      "loss": 0.9606,
       "step": 110
     },
     {
-      "epoch": 0.5958549222797928,
-      "grad_norm": 0.62890625,
-      "learning_rate": 1.9415123200404962e-05,
-      "loss": 0.985,
       "step": 115
     },
     {
-      "epoch": 0.6217616580310881,
-      "grad_norm": 0.671875,
-      "learning_rate": 1.930926190959566e-05,
-      "loss": 0.9631,
       "step": 120
     },
     {
-      "epoch": 0.6476683937823834,
-      "grad_norm": 0.6328125,
-      "learning_rate": 1.9194939132431682e-05,
-      "loss": 0.9482,
       "step": 125
     },
     {
-      "epoch": 0.6735751295336787,
-      "grad_norm": 0.609375,
-      "learning_rate": 1.9072258780547317e-05,
-      "loss": 0.9733,
       "step": 130
     },
     {
-      "epoch": 0.6994818652849741,
-      "grad_norm": 0.63671875,
-      "learning_rate": 1.894133236204423e-05,
-      "loss": 0.9539,
       "step": 135
     },
     {
-      "epoch": 0.7253886010362695,
-      "grad_norm": 0.70703125,
-      "learning_rate": 1.880227888013818e-05,
-      "loss": 0.9892,
       "step": 140
     },
     {
-      "epoch": 0.7512953367875648,
-      "grad_norm": 0.6171875,
-      "learning_rate": 1.8655224724993202e-05,
-      "loss": 0.9594,
       "step": 145
     },
     {
-      "epoch": 0.7772020725388601,
-      "grad_norm": 0.609375,
-      "learning_rate": 1.850030355884151e-05,
-      "loss": 0.9257,
       "step": 150
     },
     {
-      "epoch": 0.8031088082901554,
-      "grad_norm": 0.62109375,
-      "learning_rate": 1.8337656194493637e-05,
-      "loss": 0.9688,
       "step": 155
     },
     {
-      "epoch": 0.8290155440414507,
-      "grad_norm": 0.65625,
-      "learning_rate": 1.8167430467349145e-05,
-      "loss": 0.9853,
       "step": 160
     },
     {
-      "epoch": 0.8549222797927462,
-      "grad_norm": 0.62109375,
-      "learning_rate": 1.7989781101024305e-05,
-      "loss": 0.9375,
       "step": 165
     },
     {
-      "epoch": 0.8808290155440415,
-      "grad_norm": 0.70703125,
-      "learning_rate": 1.7804869566718833e-05,
-      "loss": 0.9712,
       "step": 170
     },
     {
-      "epoch": 0.9067357512953368,
-      "grad_norm": 0.6953125,
-      "learning_rate": 1.7612863936449568e-05,
-      "loss": 0.951,
       "step": 175
     },
     {
-      "epoch": 0.9326424870466321,
-      "grad_norm": 0.6875,
-      "learning_rate": 1.7413938730284404e-05,
-      "loss": 0.9801,
       "step": 180
     },
     {
-      "epoch": 0.9585492227979274,
-      "grad_norm": 0.640625,
-      "learning_rate": 1.7208274757715425e-05,
-      "loss": 0.9527,
       "step": 185
     },
     {
-      "epoch": 0.9844559585492227,
-      "grad_norm": 0.61328125,
-      "learning_rate": 1.6996058953315372e-05,
-      "loss": 0.9285,
       "step": 190
     },
     {
-      "epoch": 1.0103626943005182,
-      "grad_norm": 0.6171875,
-      "learning_rate": 1.6777484206826793e-05,
-      "loss": 0.9499,
       "step": 195
     },
     {
-      "epoch": 1.0362694300518134,
-      "grad_norm": 0.69921875,
-      "learning_rate": 1.6552749187838425e-05,
-      "loss": 0.9556,
       "step": 200
     },
     {
-      "epoch": 1.0621761658031088,
-      "grad_norm": 0.640625,
-      "learning_rate": 1.632205816520799e-05,
-      "loss": 0.9324,
       "step": 205
     },
     {
-      "epoch": 1.0880829015544042,
-      "grad_norm": 0.7265625,
-      "learning_rate": 1.6085620821395722e-05,
-      "loss": 0.9479,
       "step": 210
     },
     {
-      "epoch": 1.1139896373056994,
-      "grad_norm": 0.57421875,
-      "learning_rate": 1.5843652061877245e-05,
-      "loss": 0.9215,
       "step": 215
     },
     {
-      "epoch": 1.1398963730569949,
-      "grad_norm": 0.5859375,
-      "learning_rate": 1.5596371819809104e-05,
-      "loss": 0.9367,
       "step": 220
     },
     {
-      "epoch": 1.16580310880829,
-      "grad_norm": 0.5703125,
-      "learning_rate": 1.534400485612449e-05,
-      "loss": 0.91,
       "step": 225
     },
     {
-      "epoch": 1.1917098445595855,
-      "grad_norm": 0.63671875,
-      "learning_rate": 1.5086780555240803e-05,
-      "loss": 0.9297,
       "step": 230
     },
     {
-      "epoch": 1.2176165803108807,
-      "grad_norm": 0.56640625,
-      "learning_rate": 1.482493271656482e-05,
-      "loss": 0.9426,
       "step": 235
     },
     {
-      "epoch": 1.2435233160621761,
-      "grad_norm": 0.60546875,
-      "learning_rate": 1.4558699341984928e-05,
-      "loss": 0.9639,
       "step": 240
     },
     {
-      "epoch": 1.2694300518134716,
-      "grad_norm": 0.59765625,
-      "learning_rate": 1.4288322419543576e-05,
-      "loss": 0.946,
       "step": 245
     },
     {
-      "epoch": 1.2953367875647668,
-      "grad_norm": 0.65234375,
-      "learning_rate": 1.40140477034866e-05,
-      "loss": 0.9438,
       "step": 250
     },
     {
-      "epoch": 1.3212435233160622,
-      "grad_norm": 0.6015625,
-      "learning_rate": 1.373612449088931e-05,
-      "loss": 0.9652,
       "step": 255
     },
     {
-      "epoch": 1.3471502590673574,
-      "grad_norm": 0.59765625,
-      "learning_rate": 1.3454805395062386e-05,
-      "loss": 0.9245,
       "step": 260
     },
     {
-      "epoch": 1.3730569948186528,
-      "grad_norm": 0.66015625,
-      "learning_rate": 1.3170346115943575e-05,
-      "loss": 0.9195,
       "step": 265
     },
     {
-      "epoch": 1.3989637305699483,
-      "grad_norm": 0.6015625,
-      "learning_rate": 1.2883005207683784e-05,
-      "loss": 0.9341,
       "step": 270
     },
     {
-      "epoch": 1.4248704663212435,
-      "grad_norm": 0.6015625,
-      "learning_rate": 1.2593043843638978e-05,
-      "loss": 0.9238,
       "step": 275
     },
     {
-      "epoch": 1.450777202072539,
-      "grad_norm": 0.62109375,
-      "learning_rate": 1.2300725578981308e-05,
-      "loss": 0.9271,
       "step": 280
     },
     {
-      "epoch": 1.4766839378238341,
-      "grad_norm": 0.58984375,
-      "learning_rate": 1.2006316111145401e-05,
-      "loss": 0.9348,
       "step": 285
     },
     {
-      "epoch": 1.5025906735751295,
-      "grad_norm": 0.70703125,
-      "learning_rate": 1.1710083038327436e-05,
-      "loss": 0.9341,
       "step": 290
     },
     {
-      "epoch": 1.528497409326425,
-      "grad_norm": 0.70703125,
-      "learning_rate": 1.1412295616256577e-05,
-      "loss": 0.946,
       "step": 295
     },
     {
-      "epoch": 1.5544041450777202,
-      "grad_norm": 0.59375,
-      "learning_rate": 1.1113224513459819e-05,
-      "loss": 0.9134,
       "step": 300
     },
     {
-      "epoch": 1.5803108808290154,
-      "grad_norm": 0.61328125,
-      "learning_rate": 1.0813141565242682e-05,
-      "loss": 0.9177,
       "step": 305
     },
     {
-      "epoch": 1.6062176165803108,
-      "grad_norm": 0.65625,
-      "learning_rate": 1.0512319526609405e-05,
-      "loss": 0.937,
       "step": 310
     },
     {
-      "epoch": 1.6321243523316062,
-      "grad_norm": 0.6015625,
-      "learning_rate": 1.021103182434718e-05,
-      "loss": 0.9398,
       "step": 315
     },
     {
-      "epoch": 1.6580310880829017,
-      "grad_norm": 0.5625,
-      "learning_rate": 9.909552308499792e-06,
-      "loss": 0.9289,
       "step": 320
     },
     {
-      "epoch": 1.6839378238341969,
-      "grad_norm": 0.71484375,
-      "learning_rate": 9.608155003456529e-06,
-      "loss": 0.9075,
       "step": 325
     },
     {
-      "epoch": 1.709844559585492,
-      "grad_norm": 0.6328125,
-      "learning_rate": 9.307113858882664e-06,
-      "loss": 0.9154,
       "step": 330
     },
     {
-      "epoch": 1.7357512953367875,
-      "grad_norm": 0.55859375,
-      "learning_rate": 9.006702500717786e-06,
-      "loss": 0.9036,
       "step": 335
     },
     {
-      "epoch": 1.761658031088083,
-      "grad_norm": 0.69921875,
-      "learning_rate": 8.707193982468456e-06,
-      "loss": 0.9421,
       "step": 340
     },
     {
-      "epoch": 1.7875647668393784,
-      "grad_norm": 0.64453125,
-      "learning_rate": 8.408860537021127e-06,
-      "loss": 0.9359,
       "step": 345
     },
     {
-      "epoch": 1.8134715025906736,
-      "grad_norm": 0.58984375,
-      "learning_rate": 8.111973329200909e-06,
-      "loss": 0.9473,
       "step": 350
     },
     {
-      "epoch": 1.8393782383419688,
-      "grad_norm": 0.6328125,
-      "learning_rate": 7.816802209301241e-06,
-      "loss": 0.9257,
       "step": 355
     },
     {
-      "epoch": 1.8652849740932642,
-      "grad_norm": 0.609375,
-      "learning_rate": 7.523615467808249e-06,
-      "loss": 0.9486,
       "step": 360
     },
     {
-      "epoch": 1.8911917098445596,
-      "grad_norm": 0.640625,
-      "learning_rate": 7.232679591542979e-06,
-      "loss": 0.9366,
       "step": 365
     },
     {
-      "epoch": 1.917098445595855,
-      "grad_norm": 0.60546875,
-      "learning_rate": 6.944259021442967e-06,
-      "loss": 0.9292,
       "step": 370
     },
     {
-      "epoch": 1.9430051813471503,
-      "grad_norm": 0.62109375,
-      "learning_rate": 6.6586159122033914e-06,
-      "loss": 0.9025,
       "step": 375
     },
     {
-      "epoch": 1.9689119170984455,
-      "grad_norm": 0.609375,
-      "learning_rate": 6.3760098939962935e-06,
-      "loss": 0.9151,
       "step": 380
     },
     {
-      "epoch": 1.994818652849741,
-      "grad_norm": 0.73046875,
-      "learning_rate": 6.096697836484382e-06,
-      "loss": 0.9283,
       "step": 385
     },
     {
-      "epoch": 2.0207253886010363,
-      "grad_norm": 0.640625,
-      "learning_rate": 5.820933615343975e-06,
-      "loss": 0.9249,
       "step": 390
     },
     {
-      "epoch": 2.0466321243523318,
-      "grad_norm": 0.5390625,
-      "learning_rate": 5.548967881509275e-06,
-      "loss": 0.9119,
       "step": 395
     },
     {
-      "epoch": 2.0725388601036268,
-      "grad_norm": 0.60546875,
-      "learning_rate": 5.281047833347676e-06,
-      "loss": 0.9236,
       "step": 400
     },
     {
-      "epoch": 2.098445595854922,
-      "grad_norm": 0.609375,
-      "learning_rate": 5.017416991973281e-06,
-      "loss": 0.9169,
       "step": 405
     },
     {
-      "epoch": 2.1243523316062176,
-      "grad_norm": 0.62109375,
-      "learning_rate": 4.758314979902734e-06,
-      "loss": 0.9315,
       "step": 410
     },
     {
-      "epoch": 2.150259067357513,
-      "grad_norm": 0.5859375,
-      "learning_rate": 4.503977303254673e-06,
-      "loss": 0.9111,
       "step": 415
     },
     {
-      "epoch": 2.1761658031088085,
-      "grad_norm": 0.62109375,
-      "learning_rate": 4.25463513769064e-06,
-      "loss": 0.9495,
       "step": 420
     },
     {
-      "epoch": 2.2020725388601035,
-      "grad_norm": 0.59375,
-      "learning_rate": 4.010515118292127e-06,
-      "loss": 0.9338,
       "step": 425
     },
     {
-      "epoch": 2.227979274611399,
-      "grad_norm": 0.5625,
-      "learning_rate": 3.771839133564704e-06,
-      "loss": 0.9028,
       "step": 430
     },
     {
-      "epoch": 2.2538860103626943,
-      "grad_norm": 0.625,
-      "learning_rate": 3.5388241237564337e-06,
-      "loss": 0.9227,
       "step": 435
     },
     {
-      "epoch": 2.2797927461139897,
-      "grad_norm": 0.64453125,
-      "learning_rate": 3.311681883673937e-06,
-      "loss": 0.9462,
       "step": 440
     },
     {
-      "epoch": 2.305699481865285,
-      "grad_norm": 0.6015625,
-      "learning_rate": 3.0906188701753127e-06,
-      "loss": 0.929,
       "step": 445
     },
     {
-      "epoch": 2.33160621761658,
-      "grad_norm": 0.609375,
-      "learning_rate": 2.875836014514867e-06,
-      "loss": 0.9178,
       "step": 450
     },
     {
-      "epoch": 2.3575129533678756,
-      "grad_norm": 0.58203125,
-      "learning_rate": 2.6675285397102856e-06,
-      "loss": 0.9052,
       "step": 455
     },
     {
-      "epoch": 2.383419689119171,
-      "grad_norm": 0.59375,
-      "learning_rate": 2.465885783098166e-06,
-      "loss": 0.9019,
       "step": 460
     },
     {
-      "epoch": 2.4093264248704664,
-      "grad_norm": 0.6015625,
-      "learning_rate": 2.2710910242392468e-06,
-      "loss": 0.9393,
       "step": 465
     },
     {
-      "epoch": 2.4352331606217614,
-      "grad_norm": 0.6328125,
-      "learning_rate": 2.0833213183297475e-06,
-      "loss": 0.9261,
       "step": 470
     },
     {
-      "epoch": 2.461139896373057,
-      "grad_norm": 0.72265625,
-      "learning_rate": 1.9027473352702208e-06,
-      "loss": 0.9021,
       "step": 475
     },
     {
-      "epoch": 2.4870466321243523,
-      "grad_norm": 0.5625,
-      "learning_rate": 1.729533204538224e-06,
-      "loss": 0.9289,
       "step": 480
     },
     {
-      "epoch": 2.5129533678756477,
-      "grad_norm": 0.625,
-      "learning_rate": 1.563836366005782e-06,
-      "loss": 0.9077,
       "step": 485
     },
     {
-      "epoch": 2.538860103626943,
-      "grad_norm": 0.625,
-      "learning_rate": 1.4058074268372224e-06,
-      "loss": 0.9668,
       "step": 490
     },
     {
-      "epoch": 2.5647668393782386,
-      "grad_norm": 0.59765625,
-      "learning_rate": 1.2555900245975262e-06,
-      "loss": 0.9356,
       "step": 495
     },
     {
-      "epoch": 2.5906735751295336,
-      "grad_norm": 0.5390625,
-      "learning_rate": 1.1133206966955213e-06,
-      "loss": 0.9311,
       "step": 500
     },
     {
-      "epoch": 2.616580310880829,
-      "grad_norm": 0.5703125,
-      "learning_rate": 9.79128756280675e-07,
-      "loss": 0.9411,
       "step": 505
     },
     {
-      "epoch": 2.6424870466321244,
-      "grad_norm": 0.69140625,
-      "learning_rate": 8.531361747062272e-07,
-      "loss": 0.9778,
       "step": 510
     },
     {
-      "epoch": 2.66839378238342,
-      "grad_norm": 0.57421875,
-      "learning_rate": 7.354574706655038e-07,
-      "loss": 0.9099,
       "step": 515
     },
     {
-      "epoch": 2.694300518134715,
-      "grad_norm": 0.578125,
-      "learning_rate": 6.261996061022335e-07,
-      "loss": 0.9045,
       "step": 520
     },
     {
-      "epoch": 2.7202072538860103,
-      "grad_norm": 0.58203125,
-      "learning_rate": 5.254618889893858e-07,
-      "loss": 0.8916,
       "step": 525
     },
     {
-      "epoch": 2.7461139896373057,
-      "grad_norm": 0.61328125,
-      "learning_rate": 4.3333588306499584e-07,
-      "loss": 0.9151,
       "step": 530
     },
     {
-      "epoch": 2.772020725388601,
-      "grad_norm": 0.5703125,
-      "learning_rate": 3.499053246069362e-07,
-      "loss": 0.9418,
       "step": 535
     },
     {
-      "epoch": 2.7979274611398965,
-      "grad_norm": 0.59765625,
-      "learning_rate": 2.7524604632233054e-07,
-      "loss": 0.9384,
       "step": 540
     },
-    {
-      "epoch": 2.823834196891192,
-      "grad_norm": 0.59375,
-      "learning_rate": 2.0942590842078503e-07,
-      "loss": 0.9117,
-      "step": 545
-    },
-    {
-      "epoch": 2.849740932642487,
-      "grad_norm": 0.6328125,
-      "learning_rate": 1.5250473693406486e-07,
-      "loss": 0.9154,
-      "step": 550
-    },
-    {
-      "epoch": 2.8756476683937824,
-      "grad_norm": 0.6328125,
-      "learning_rate": 1.0453426933830002e-07,
-      "loss": 0.942,
-      "step": 555
-    },
-    {
-      "epoch": 2.901554404145078,
-      "grad_norm": 0.609375,
-      "learning_rate": 6.555810752813308e-08,
-      "loss": 0.9441,
-      "step": 560
-    },
-    {
-      "epoch": 2.927461139896373,
-      "grad_norm": 0.59765625,
-      "learning_rate": 3.5611678185563106e-08,
-      "loss": 0.9353,
-      "step": 565
-    },
-    {
-      "epoch": 2.9533678756476682,
-      "grad_norm": 0.5859375,
-      "learning_rate": 1.4722200579497803e-08,
-      "loss": 0.9235,
-      "step": 570
-    },
-    {
-      "epoch": 2.9792746113989637,
-      "grad_norm": 0.59375,
-      "learning_rate": 2.9086618252893717e-09,
-      "loss": 0.9246,
-      "step": 575
-    },
     {
       "epoch": 3.0,
-      "step": 579,
-      "total_flos": 2.7685804472008704e+16,
-      "train_loss": 1.0384218392182714,
-      "train_runtime": 242.5709,
-      "train_samples_per_second": 38.179,
-      "train_steps_per_second": 2.387
     }
   ],
   "logging_steps": 5,
-  "max_steps": 579,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 500,
@@ -847,7 +798,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.7685804472008704e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 3.0,
   "eval_steps": 500,
+  "global_step": 543,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0055248618784530384,
+      "grad_norm": 7.4375,
+      "learning_rate": 3.6363636363636366e-07,
+      "loss": 2.4042,
       "step": 1
     },
     {
+      "epoch": 0.027624309392265192,
+      "grad_norm": 7.59375,
+      "learning_rate": 1.8181818181818183e-06,
+      "loss": 2.4209,
       "step": 5
     },
     {
+      "epoch": 0.055248618784530384,
+      "grad_norm": 7.1875,
+      "learning_rate": 3.6363636363636366e-06,
+      "loss": 2.4119,
       "step": 10
     },
     {
+      "epoch": 0.08287292817679558,
+      "grad_norm": 7.03125,
+      "learning_rate": 5.4545454545454545e-06,
+      "loss": 2.3657,
       "step": 15
     },
     {
+      "epoch": 0.11049723756906077,
+      "grad_norm": 7.28125,
+      "learning_rate": 7.272727272727273e-06,
+      "loss": 2.3123,
       "step": 20
     },
     {
+      "epoch": 0.13812154696132597,
+      "grad_norm": 5.65625,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 2.1773,
       "step": 25
     },
     {
+      "epoch": 0.16574585635359115,
+      "grad_norm": 3.84375,
+      "learning_rate": 1.0909090909090909e-05,
+      "loss": 2.0066,
       "step": 30
     },
     {
+      "epoch": 0.19337016574585636,
+      "grad_norm": 3.0,
+      "learning_rate": 1.2727272727272728e-05,
+      "loss": 1.9028,
       "step": 35
     },
     {
+      "epoch": 0.22099447513812154,
+      "grad_norm": 2.421875,
+      "learning_rate": 1.4545454545454546e-05,
+      "loss": 1.7204,
       "step": 40
     },
     {
+      "epoch": 0.24861878453038674,
+      "grad_norm": 2.109375,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.5881,
       "step": 45
     },
     {
+      "epoch": 0.27624309392265195,
+      "grad_norm": 1.8125,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.4278,
       "step": 50
     },
     {
+      "epoch": 0.30386740331491713,
+      "grad_norm": 1.71875,
+      "learning_rate": 2e-05,
+      "loss": 1.2964,
       "step": 55
     },
     {
+      "epoch": 0.3314917127071823,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.9994819965926346e-05,
+      "loss": 1.1979,
       "step": 60
     },
     {
+      "epoch": 0.35911602209944754,
       "grad_norm": 1.0078125,
+      "learning_rate": 1.997928523025598e-05,
+      "loss": 1.1192,
       "step": 65
     },
     {
+      "epoch": 0.3867403314917127,
+      "grad_norm": 0.76171875,
+      "learning_rate": 1.9953411887080917e-05,
+      "loss": 1.0823,
       "step": 70
     },
     {
+      "epoch": 0.4143646408839779,
+      "grad_norm": 0.70703125,
+      "learning_rate": 1.9917226741361014e-05,
+      "loss": 1.0383,
       "step": 75
     },
     {
+      "epoch": 0.4419889502762431,
+      "grad_norm": 0.69921875,
+      "learning_rate": 1.987076728115383e-05,
+      "loss": 1.0168,
       "step": 80
     },
     {
+      "epoch": 0.4696132596685083,
+      "grad_norm": 0.68359375,
+      "learning_rate": 1.9814081638776743e-05,
+      "loss": 1.004,
       "step": 85
     },
     {
+      "epoch": 0.4972375690607735,
+      "grad_norm": 0.7109375,
+      "learning_rate": 1.9747228540941555e-05,
+      "loss": 0.993,
       "step": 90
     },
     {
+      "epoch": 0.5248618784530387,
+      "grad_norm": 0.765625,
+      "learning_rate": 1.9670277247913205e-05,
+      "loss": 0.9644,
       "step": 95
     },
     {
+      "epoch": 0.5524861878453039,
+      "grad_norm": 0.76953125,
+      "learning_rate": 1.958330748175568e-05,
+      "loss": 0.996,
       "step": 100
     },
     {
+      "epoch": 0.580110497237569,
+      "grad_norm": 0.734375,
+      "learning_rate": 1.948640934373939e-05,
+      "loss": 0.9704,
       "step": 105
     },
     {
+      "epoch": 0.6077348066298343,
+      "grad_norm": 0.76953125,
+      "learning_rate": 1.9379683220995657e-05,
+      "loss": 0.9572,
       "step": 110
     },
     {
+      "epoch": 0.6353591160220995,
+      "grad_norm": 0.86328125,
+      "learning_rate": 1.9263239682514953e-05,
+      "loss": 0.9553,
       "step": 115
     },
     {
+      "epoch": 0.6629834254143646,
+      "grad_norm": 0.75,
+      "learning_rate": 1.9137199364596673e-05,
+      "loss": 0.9454,
       "step": 120
     },
     {
+      "epoch": 0.6906077348066298,
+      "grad_norm": 0.75390625,
+      "learning_rate": 1.9001692845869113e-05,
+      "loss": 0.939,
       "step": 125
     },
     {
+      "epoch": 0.7182320441988951,
+      "grad_norm": 0.77734375,
+      "learning_rate": 1.8856860512009115e-05,
+      "loss": 0.9433,
       "step": 130
     },
     {
+      "epoch": 0.7458563535911602,
+      "grad_norm": 0.76953125,
+      "learning_rate": 1.8702852410301556e-05,
+      "loss": 0.9329,
       "step": 135
     },
     {
+      "epoch": 0.7734806629834254,
+      "grad_norm": 0.71484375,
+      "learning_rate": 1.853982809418932e-05,
+      "loss": 0.9416,
       "step": 140
     },
     {
+      "epoch": 0.8011049723756906,
+      "grad_norm": 0.7265625,
+      "learning_rate": 1.8367956457974872e-05,
+      "loss": 0.914,
       "step": 145
     },
     {
+      "epoch": 0.8287292817679558,
+      "grad_norm": 0.87890625,
+      "learning_rate": 1.8187415561844586e-05,
+      "loss": 0.9229,
       "step": 150
     },
     {
+      "epoch": 0.856353591160221,
+      "grad_norm": 0.828125,
+      "learning_rate": 1.7998392447397197e-05,
+      "loss": 0.9259,
       "step": 155
     },
     {
+      "epoch": 0.8839779005524862,
+      "grad_norm": 0.85546875,
+      "learning_rate": 1.7801082943867406e-05,
+      "loss": 0.9421,
       "step": 160
     },
     {
+      "epoch": 0.9116022099447514,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.7595691465245484e-05,
+      "loss": 0.9225,
       "step": 165
     },
     {
+      "epoch": 0.9392265193370166,
+      "grad_norm": 0.7109375,
+      "learning_rate": 1.7382430798502977e-05,
+      "loss": 0.9066,
       "step": 170
     },
     {
+      "epoch": 0.9668508287292817,
+      "grad_norm": 0.83984375,
+      "learning_rate": 1.7161521883143936e-05,
+      "loss": 0.8903,
       "step": 175
     },
     {
+      "epoch": 0.994475138121547,
+      "grad_norm": 0.875,
+      "learning_rate": 1.693319358231011e-05,
+      "loss": 0.9252,
       "step": 180
     },
     {
+      "epoch": 1.022099447513812,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.6697682445677158e-05,
+      "loss": 0.9035,
       "step": 185
     },
     {
+      "epoch": 1.0497237569060773,
+      "grad_norm": 0.69140625,
+      "learning_rate": 1.6455232464387587e-05,
+      "loss": 0.9036,
       "step": 190
     },
     {
+      "epoch": 1.0773480662983426,
+      "grad_norm": 0.71875,
+      "learning_rate": 1.6206094818274228e-05,
+      "loss": 0.8932,
       "step": 195
     },
     {
+      "epoch": 1.1049723756906078,
+      "grad_norm": 0.91796875,
+      "learning_rate": 1.595052761563627e-05,
+      "loss": 0.9065,
       "step": 200
     },
     {
+      "epoch": 1.132596685082873,
+      "grad_norm": 0.859375,
+      "learning_rate": 1.5688795625837274e-05,
+      "loss": 0.8995,
       "step": 205
     },
     {
+      "epoch": 1.160220994475138,
+      "grad_norm": 0.78125,
+      "learning_rate": 1.542117000500229e-05,
+      "loss": 0.8844,
       "step": 210
     },
     {
+      "epoch": 1.1878453038674033,
+      "grad_norm": 0.765625,
+      "learning_rate": 1.5147928015098309e-05,
+      "loss": 0.8894,
       "step": 215
     },
     {
+      "epoch": 1.2154696132596685,
+      "grad_norm": 0.81640625,
+      "learning_rate": 1.4869352736688938e-05,
+      "loss": 0.894,
       "step": 220
     },
     {
+      "epoch": 1.2430939226519337,
+      "grad_norm": 0.76953125,
+      "learning_rate": 1.458573277566103e-05,
+      "loss": 0.9222,
       "step": 225
     },
     {
+      "epoch": 1.270718232044199,
+      "grad_norm": 0.75390625,
+      "learning_rate": 1.4297361964227004e-05,
+      "loss": 0.9014,
       "step": 230
     },
     {
+      "epoch": 1.298342541436464,
+      "grad_norm": 0.8125,
+      "learning_rate": 1.4004539056512667e-05,
+      "loss": 0.9052,
       "step": 235
     },
     {
+      "epoch": 1.3259668508287292,
+      "grad_norm": 0.87109375,
+      "learning_rate": 1.3707567419045926e-05,
+      "loss": 0.894,
       "step": 240
     },
     {
+      "epoch": 1.3535911602209945,
+      "grad_norm": 0.8125,
+      "learning_rate": 1.3406754716466978e-05,
+      "loss": 0.9045,
       "step": 245
     },
     {
+      "epoch": 1.3812154696132597,
+      "grad_norm": 0.78515625,
+      "learning_rate": 1.3102412592785654e-05,
+      "loss": 0.8737,
       "step": 250
     },
     {
+      "epoch": 1.408839779005525,
+      "grad_norm": 0.7734375,
+      "learning_rate": 1.2794856348516095e-05,
+      "loss": 0.9029,
       "step": 255
     },
     {
+      "epoch": 1.43646408839779,
+      "grad_norm": 0.83984375,
+      "learning_rate": 1.248440461402328e-05,
+      "loss": 0.8883,
       "step": 260
     },
     {
+      "epoch": 1.4640883977900552,
+      "grad_norm": 0.96875,
+      "learning_rate": 1.2171379019419786e-05,
+      "loss": 0.8932,
       "step": 265
     },
     {
+      "epoch": 1.4917127071823204,
+      "grad_norm": 0.890625,
+      "learning_rate": 1.1856103861354809e-05,
+      "loss": 0.8917,
       "step": 270
     },
     {
+      "epoch": 1.5193370165745856,
+      "grad_norm": 0.78515625,
+      "learning_rate": 1.153890576704062e-05,
+      "loss": 0.9033,
       "step": 275
     },
     {
+      "epoch": 1.5469613259668509,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.1220113355864549e-05,
+      "loss": 0.8839,
       "step": 280
     },
     {
+      "epoch": 1.5745856353591159,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.0900056898937055e-05,
+      "loss": 0.8887,
       "step": 285
     },
     {
+      "epoch": 1.6022099447513813,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.0579067976928614e-05,
+      "loss": 0.8951,
       "step": 290
     },
     {
+      "epoch": 1.6298342541436464,
+      "grad_norm": 0.78125,
+      "learning_rate": 1.0257479136549889e-05,
+      "loss": 0.8954,
       "step": 295
     },
     {
+      "epoch": 1.6574585635359116,
+      "grad_norm": 0.8359375,
+      "learning_rate": 9.935623546031043e-06,
+      "loss": 0.9004,
       "step": 300
     },
     {
+      "epoch": 1.6850828729281768,
+      "grad_norm": 0.85546875,
+      "learning_rate": 9.613834649957216e-06,
+      "loss": 0.9045,
       "step": 305
     },
     {
+      "epoch": 1.7127071823204418,
+      "grad_norm": 0.765625,
+      "learning_rate": 9.292445823817647e-06,
+      "loss": 0.8737,
       "step": 310
     },
     {
+      "epoch": 1.7403314917127073,
+      "grad_norm": 0.8203125,
+      "learning_rate": 8.971790028626395e-06,
+      "loss": 0.8722,
       "step": 315
     },
     {
+      "epoch": 1.7679558011049723,
+      "grad_norm": 0.921875,
+      "learning_rate": 8.652199465972462e-06,
+      "loss": 0.8995,
       "step": 320
     },
     {
+      "epoch": 1.7955801104972375,
+      "grad_norm": 0.8203125,
+      "learning_rate": 8.334005233856681e-06,
+      "loss": 0.9114,
       "step": 325
     },
     {
+      "epoch": 1.8232044198895028,
+      "grad_norm": 0.79296875,
+      "learning_rate": 8.017536983671929e-06,
+      "loss": 0.891,
       "step": 330
     },
     {
+      "epoch": 1.850828729281768,
+      "grad_norm": 0.75,
+      "learning_rate": 7.703122578682047e-06,
+      "loss": 0.8875,
       "step": 335
     },
     {
+      "epoch": 1.8784530386740332,
+      "grad_norm": 0.8125,
+      "learning_rate": 7.391087754353252e-06,
+      "loss": 0.8779,
       "step": 340
     },
     {
+      "epoch": 1.9060773480662982,
+      "grad_norm": 0.76953125,
+      "learning_rate": 7.081755780889978e-06,
+      "loss": 0.885,
       "step": 345
     },
     {
+      "epoch": 1.9337016574585635,
+      "grad_norm": 0.828125,
+      "learning_rate": 6.7754471283247594e-06,
+      "loss": 0.8875,
       "step": 350
     },
     {
+      "epoch": 1.9613259668508287,
+      "grad_norm": 0.9140625,
+      "learning_rate": 6.472479134509052e-06,
+      "loss": 0.9037,
       "step": 355
     },
     {
+      "epoch": 1.988950276243094,
+      "grad_norm": 0.83203125,
+      "learning_rate": 6.173165676349103e-06,
+      "loss": 0.8817,
       "step": 360
     },
     {
+      "epoch": 2.016574585635359,
+      "grad_norm": 0.81640625,
+      "learning_rate": 5.8778168446273045e-06,
+      "loss": 0.8876,
       "step": 365
     },
     {
+      "epoch": 2.044198895027624,
+      "grad_norm": 0.77734375,
+      "learning_rate": 5.586738622746042e-06,
+      "loss": 0.891,
       "step": 370
     },
     {
+      "epoch": 2.0718232044198897,
+      "grad_norm": 0.80078125,
+      "learning_rate": 5.300232569726805e-06,
+      "loss": 0.8843,
       "step": 375
     },
     {
+      "epoch": 2.0994475138121547,
+      "grad_norm": 0.9140625,
+      "learning_rate": 5.0185955077929774e-06,
+      "loss": 0.8696,
       "step": 380
     },
     {
+      "epoch": 2.12707182320442,
+      "grad_norm": 0.7578125,
+      "learning_rate": 4.742119214860009e-06,
+      "loss": 0.8775,
       "step": 385
     },
     {
+      "epoch": 2.154696132596685,
+      "grad_norm": 0.7578125,
+      "learning_rate": 4.471090122251496e-06,
+      "loss": 0.8797,
       "step": 390
     },
     {
+      "epoch": 2.18232044198895,
+      "grad_norm": 0.78125,
+      "learning_rate": 4.205789017954364e-06,
+      "loss": 0.8832,
       "step": 395
     },
     {
+      "epoch": 2.2099447513812156,
+      "grad_norm": 0.859375,
+      "learning_rate": 3.946490755720621e-06,
+      "loss": 0.884,
       "step": 400
     },
     {
+      "epoch": 2.2375690607734806,
+      "grad_norm": 0.9453125,
+      "learning_rate": 3.6934639703169905e-06,
+      "loss": 0.8737,
       "step": 405
     },
     {
+      "epoch": 2.265193370165746,
+      "grad_norm": 0.86328125,
+      "learning_rate": 3.4469707992174607e-06,
+      "loss": 0.8981,
       "step": 410
     },
     {
+      "epoch": 2.292817679558011,
+      "grad_norm": 0.80078125,
+      "learning_rate": 3.207266611027069e-06,
+      "loss": 0.8736,
       "step": 415
     },
     {
+      "epoch": 2.320441988950276,
+      "grad_norm": 0.78125,
+      "learning_rate": 2.97459974091831e-06,
+      "loss": 0.8757,
       "step": 420
     },
     {
+      "epoch": 2.3480662983425415,
+      "grad_norm": 0.8203125,
+      "learning_rate": 2.7492112333541744e-06,
+      "loss": 0.902,
       "step": 425
     },
     {
+      "epoch": 2.3756906077348066,
+      "grad_norm": 0.90625,
+      "learning_rate": 2.531334592364457e-06,
+      "loss": 0.8766,
       "step": 430
     },
     {
+      "epoch": 2.403314917127072,
+      "grad_norm": 0.7734375,
+      "learning_rate": 2.3211955396340003e-06,
+      "loss": 0.8982,
       "step": 435
     },
     {
+      "epoch": 2.430939226519337,
+      "grad_norm": 0.74609375,
+      "learning_rate": 2.1190117806534714e-06,
+      "loss": 0.8801,
       "step": 440
     },
     {
+      "epoch": 2.458563535911602,
+      "grad_norm": 0.828125,
+      "learning_rate": 1.924992779174999e-06,
+      "loss": 0.8707,
       "step": 445
     },
     {
+      "epoch": 2.4861878453038675,
+      "grad_norm": 0.859375,
+      "learning_rate": 1.7393395402063085e-06,
+      "loss": 0.8939,
       "step": 450
     },
     {
+      "epoch": 2.5138121546961325,
+      "grad_norm": 0.859375,
+      "learning_rate": 1.5622444017681438e-06,
+      "loss": 0.8707,
       "step": 455
     },
     {
+      "epoch": 2.541436464088398,
+      "grad_norm": 0.78515625,
+      "learning_rate": 1.3938908356307846e-06,
+      "loss": 0.8771,
       "step": 460
     },
     {
+      "epoch": 2.569060773480663,
+      "grad_norm": 0.76171875,
+      "learning_rate": 1.2344532572360325e-06,
+      "loss": 0.857,
       "step": 465
     },
     {
+      "epoch": 2.596685082872928,
+      "grad_norm": 0.83984375,
+      "learning_rate": 1.0840968450016276e-06,
+      "loss": 0.885,
       "step": 470
     },
     {
+      "epoch": 2.6243093922651934,
+      "grad_norm": 0.78125,
+      "learning_rate": 9.42977369195286e-07,
+      "loss": 0.9007,
       "step": 475
     },
     {
+      "epoch": 2.6519337016574585,
+      "grad_norm": 0.90234375,
+      "learning_rate": 8.112410305556307e-07,
+      "loss": 0.8988,
       "step": 480
     },
     {
+      "epoch": 2.679558011049724,
+      "grad_norm": 0.78125,
+      "learning_rate": 6.890243088272453e-07,
+      "loss": 0.8702,
       "step": 485
     },
     {
+      "epoch": 2.707182320441989,
+      "grad_norm": 0.875,
+      "learning_rate": 5.764538213667103e-07,
+      "loss": 0.8981,
       "step": 490
     },
     {
+      "epoch": 2.734806629834254,
+      "grad_norm": 0.89453125,
+      "learning_rate": 4.73646191966175e-07,
+      "loss": 0.8912,
       "step": 495
     },
     {
+      "epoch": 2.7624309392265194,
+      "grad_norm": 0.875,
+      "learning_rate": 3.8070793003030296e-07,
+      "loss": 0.8967,
       "step": 500
     },
     {
+      "epoch": 2.7900552486187844,
+      "grad_norm": 0.828125,
+      "learning_rate": 2.9773532023180897e-07,
+      "loss": 0.8933,
       "step": 505
     },
     {
+      "epoch": 2.81767955801105,
+      "grad_norm": 0.7578125,
+      "learning_rate": 2.248143227598809e-07,
+      "loss": 0.9039,
       "step": 510
     },
     {
+      "epoch": 2.845303867403315,
+      "grad_norm": 0.7578125,
+      "learning_rate": 1.6202048426483652e-07,
+      "loss": 0.8864,
       "step": 515
     },
     {
+      "epoch": 2.87292817679558,
+      "grad_norm": 0.8515625,
+      "learning_rate": 1.094188595912804e-07,
+      "loss": 0.8997,
       "step": 520
     },
     {
+      "epoch": 2.9005524861878453,
+      "grad_norm": 0.87109375,
+      "learning_rate": 6.706394438083962e-08,
+      "loss": 0.8805,
       "step": 525
     },
     {
+      "epoch": 2.9281767955801103,
+      "grad_norm": 0.7890625,
+      "learning_rate": 3.4999618614309784e-08,
+      "loss": 0.8959,
       "step": 530
     },
     {
+      "epoch": 2.955801104972376,
+      "grad_norm": 0.7265625,
+      "learning_rate": 1.325910115169471e-08,
+      "loss": 0.8552,
       "step": 535
     },
     {
+      "epoch": 2.983425414364641,
+      "grad_norm": 0.82421875,
+      "learning_rate": 1.8649153172423106e-09,
+      "loss": 0.8753,
       "step": 540
     },
     {
       "epoch": 3.0,
+      "step": 543,
+      "total_flos": 2.5856227500097536e+16,
+      "train_loss": 1.017678025019103,
+      "train_runtime": 224.2528,
+      "train_samples_per_second": 38.568,
+      "train_steps_per_second": 2.421
     }
   ],
   "logging_steps": 5,
+  "max_steps": 543,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 2.5856227500097536e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6fa81c0842694b8297e4996a6f1dcfc12daedf7a7564e339a703a7f1e1ac1a96
 size 5560

 version https://git-lfs.github.com/spec/v1
+oid sha256:96799ddaa0622d20b44b5fd1f13fd4ee51acc21d18c3376a5108d307ef6d1ed1
 size 5560