Training in progress, step 30000, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scaler.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +151 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:12badc8586f0f012102da81da5ff919fc8e5eec70764b2b90ad87de6f707d780
 size 357393656

 version https://git-lfs.github.com/spec/v1
+oid sha256:090d054104cd591e7202f6691e825291e2d3b9b1747fffbe12d1459c0f527f40
 size 357393656

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:46cf74c831f431d0c869290f772d7c001ab26053a3f6bac8cebde4d75f75c23b
 size 714965067

 version https://git-lfs.github.com/spec/v1
+oid sha256:015f58639fa75eede621c4156e3ff5c7b08e18681c738f507b3c5c091c46b93e
 size 714965067

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02aaacae1af734b009a7851c8e1b3261cdb750514583d065268aae220d491289
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:07308549b830cf83d1e9a8a807be8a2d69fd48b26dd3815547941ec18751b208
 size 14645

last-checkpoint/scaler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:829d5a0958b883245e1043bdcede967c316f6195bd1180d3f9310ce5a64e8080
 size 1383

 version https://git-lfs.github.com/spec/v1
+oid sha256:9779a733270277f15e820d84d3dfdfb3a66fd96b857f3f0109ac7f2b54244d67
 size 1383

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e8ab9b9762e16223fba549fb932b52a090591d5fc2fa74af57b6597b735f99e8
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:775f002a0d4d3ca254c2c3990bbecba9e195e6d88b76d01e998cfeb8f1de5721
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.6630633627141194,
   "eval_steps": 10000,
-  "global_step": 20000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -304,6 +304,154 @@
       "eval_samples_per_second": 137.069,
       "eval_steps_per_second": 4.283,
       "step": 20000
     }
   ],
   "logging_steps": 500,
@@ -323,7 +471,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.2419559227146957e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.494595044071179,
   "eval_steps": 10000,
+  "global_step": 30000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 137.069,
       "eval_steps_per_second": 4.283,
       "step": 20000
+    },
+    {
+      "epoch": 1.7046399467819724,
+      "grad_norm": 10.757081031799316,
+      "learning_rate": 0.0003352697446304752,
+      "loss": 8.9914,
+      "step": 20500
+    },
+    {
+      "epoch": 1.7462165308498254,
+      "grad_norm": 15.339715957641602,
+      "learning_rate": 0.0003310586842550313,
+      "loss": 8.8893,
+      "step": 21000
+    },
+    {
+      "epoch": 1.7877931149176782,
+      "grad_norm": 56.51789093017578,
+      "learning_rate": 0.0003268307119905293,
+      "loss": 8.9615,
+      "step": 21500
+    },
+    {
+      "epoch": 1.8293696989855315,
+      "grad_norm": 60.40474319458008,
+      "learning_rate": 0.0003226111956705564,
+      "loss": 8.9375,
+      "step": 22000
+    },
+    {
+      "epoch": 1.8709462830533843,
+      "grad_norm": 11.859347343444824,
+      "learning_rate": 0.00031838322340605444,
+      "loss": 9.0322,
+      "step": 22500
+    },
+    {
+      "epoch": 1.9125228671212373,
+      "grad_norm": 41.063846588134766,
+      "learning_rate": 0.0003141637070860815,
+      "loss": 9.0873,
+      "step": 23000
+    },
+    {
+      "epoch": 1.9540994511890903,
+      "grad_norm": 32.88983154296875,
+      "learning_rate": 0.00030993573482157957,
+      "loss": 9.2389,
+      "step": 23500
+    },
+    {
+      "epoch": 1.9956760352569431,
+      "grad_norm": 26.1168212890625,
+      "learning_rate": 0.0003057077625570776,
+      "loss": 8.945,
+      "step": 24000
+    },
+    {
+      "epoch": 2.0372526193247964,
+      "grad_norm": 29.020992279052734,
+      "learning_rate": 0.00030147979029257563,
+      "loss": 9.3528,
+      "step": 24500
+    },
+    {
+      "epoch": 2.078829203392649,
+      "grad_norm": 54.41719436645508,
+      "learning_rate": 0.0002972518180280737,
+      "loss": 9.4035,
+      "step": 25000
+    },
+    {
+      "epoch": 2.1204057874605025,
+      "grad_norm": 66.4457778930664,
+      "learning_rate": 0.00029302384576357176,
+      "loss": 9.2745,
+      "step": 25500
+    },
+    {
+      "epoch": 2.1619823715283553,
+      "grad_norm": 83.49308013916016,
+      "learning_rate": 0.00028879587349906984,
+      "loss": 9.292,
+      "step": 26000
+    },
+    {
+      "epoch": 2.203558955596208,
+      "grad_norm": 59.56986618041992,
+      "learning_rate": 0.0002845679012345679,
+      "loss": 9.1446,
+      "step": 26500
+    },
+    {
+      "epoch": 2.2451355396640613,
+      "grad_norm": 22.600574493408203,
+      "learning_rate": 0.0002803399289700659,
+      "loss": 9.1264,
+      "step": 27000
+    },
+    {
+      "epoch": 2.286712123731914,
+      "grad_norm": 19.706218719482422,
+      "learning_rate": 0.000276111956705564,
+      "loss": 9.1375,
+      "step": 27500
+    },
+    {
+      "epoch": 2.328288707799767,
+      "grad_norm": 23.606611251831055,
+      "learning_rate": 0.00027188398444106203,
+      "loss": 9.2861,
+      "step": 28000
+    },
+    {
+      "epoch": 2.36986529186762,
+      "grad_norm": 20.25541877746582,
+      "learning_rate": 0.0002676560121765601,
+      "loss": 9.0342,
+      "step": 28500
+    },
+    {
+      "epoch": 2.411441875935473,
+      "grad_norm": 40.61511993408203,
+      "learning_rate": 0.00026342803991205815,
+      "loss": 9.1698,
+      "step": 29000
+    },
+    {
+      "epoch": 2.4530184600033262,
+      "grad_norm": 11.889721870422363,
+      "learning_rate": 0.0002592000676475562,
+      "loss": 9.0235,
+      "step": 29500
+    },
+    {
+      "epoch": 2.494595044071179,
+      "grad_norm": NaN,
+      "learning_rate": 0.00025749196685269745,
+      "loss": 12.3327,
+      "step": 30000
+    },
+    {
+      "epoch": 2.494595044071179,
+      "eval_loss": NaN,
+      "eval_runtime": 2733.6805,
+      "eval_samples_per_second": 140.772,
+      "eval_steps_per_second": 4.399,
+      "step": 30000
     }
   ],
   "logging_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 3.3630037372820275e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null