Training in progress, step 500, checkpoint

Browse files

Files changed (10) hide show

last-checkpoint/config.json +4 -4
last-checkpoint/model.safetensors +2 -2
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state_0.pth +1 -1
last-checkpoint/rng_state_1.pth +1 -1
last-checkpoint/rng_state_2.pth +1 -1
last-checkpoint/rng_state_3.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +74 -18
last-checkpoint/training_args.bin +1 -1

last-checkpoint/config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "tattabio/gLM2_150M",
   "architectures": [
     "gLM2ForMaskedLM"
   ],
@@ -8,10 +8,10 @@
     "AutoModel": "modeling_glm2.gLM2Model",
     "AutoModelForMaskedLM": "modeling_glm2.gLM2ForMaskedLM"
   },
-  "depth": 30,
-  "dim": 640,
   "ffn_dim_multiplier": null,
-  "heads": 10,
   "model_type": "gLM2",
   "norm_eps": 1e-05,
   "swiglu_multiple_of": 256,

 {
+  "_name_or_path": "tattabio/gLM2_650M",
   "architectures": [
     "gLM2ForMaskedLM"
   ],
     "AutoModel": "modeling_glm2.gLM2Model",
     "AutoModelForMaskedLM": "modeling_glm2.gLM2ForMaskedLM"
   },
+  "depth": 33,
+  "dim": 1280,
   "ffn_dim_multiplier": null,
+  "heads": 20,
   "model_type": "gLM2",
   "norm_eps": 1e-05,
   "swiglu_multiple_of": 256,

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:516558ed7782de66fc542438abb1c93e159afd70a2aeb6571ce83cca423452b0
-size 609855088

 version https://git-lfs.github.com/spec/v1
+oid sha256:228e54ea153feeb0f49b0800638a29264ec8340106787699023f5720254dacea
+size 2682482800

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:812c91eacfd5aea68d8b5decb8b50302d3944860c0aa6ecd636549bd4f072a92
-size 1219840058

 version https://git-lfs.github.com/spec/v1
+oid sha256:a0006e3e3cc59298369c06ee6e4e8b3272c5752f670d9f969958dfd9e69616dc
+size 5365108834

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7f4312d4eb4a3834512b8e6a5f558f7335f936ed9768ab54b18216e62eb5a7d3
 size 15024

 version https://git-lfs.github.com/spec/v1
+oid sha256:848fdf35f13e1fde847fbd191021c99c0675e5e723a1b65fde4649f2fc9250db
 size 15024

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13ae4134e19f55d5a540bad8977ebfa7de23a5f70c51215224d0742bb2666b1a
 size 15024

 version https://git-lfs.github.com/spec/v1
+oid sha256:d9f9fc41c0627d630837221d5c7872d3197c08985ee35f058d5f5e36bfe0249b
 size 15024

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7e287e6f80aed910a1d4cb01fb428361df3b7e62045921fccfd519aab7f20c2e
 size 15024

 version https://git-lfs.github.com/spec/v1
+oid sha256:7ae1449d711371210b0f6284f921f3df183a3c5c6628d3fc2950f5c89910866d
 size 15024

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:006e670f373067b7e226643b8cade6148c320aff0b769e7d1532179c7f45b76a
 size 15024

 version https://git-lfs.github.com/spec/v1
+oid sha256:c3cc14e80a0475fa4dead8d6a3c6f0af9c5a92c40ad285584d68830834b3a6ea
 size 15024

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:59630a3df2ec5543c18897bf2cb0562e6bac8d472d75091b8f7ddabcb069715a
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:478ca537cf75a11344e25e46d3c46fdcf2db572bdb8cfff6f1ed3781e47a9787
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,33 +1,89 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0012275351477837237,
-  "eval_steps": 2,
-  "global_step": 2,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0006137675738918619,
-      "grad_norm": 158.1446075439453,
-      "learning_rate": 9.993861264579497e-05,
-      "loss": 100.2575,
-      "step": 1
     },
     {
-      "epoch": 0.0012275351477837237,
-      "grad_norm": 105.63041687011719,
-      "learning_rate": 9.987722529158994e-05,
-      "loss": 95.2722,
-      "step": 2
     }
   ],
-  "logging_steps": 1,
-  "max_steps": 1629,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 2,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -40,8 +96,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 919791151165440.0,
-  "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
 }

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.03836047336824137,
+  "eval_steps": 500,
+  "global_step": 500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0038360473368241363,
+      "grad_norm": 44.11674880981445,
+      "learning_rate": 9.961638790854688e-05,
+      "loss": 78.9796,
+      "step": 50
     },
     {
+      "epoch": 0.0076720946736482725,
+      "grad_norm": 31.060550689697266,
+      "learning_rate": 9.923277581709376e-05,
+      "loss": 75.5083,
+      "step": 100
+    },
+    {
+      "epoch": 0.011508142010472408,
+      "grad_norm": 53.42700958251953,
+      "learning_rate": 9.884916372564063e-05,
+      "loss": 74.4945,
+      "step": 150
+    },
+    {
+      "epoch": 0.015344189347296545,
+      "grad_norm": 44.670753479003906,
+      "learning_rate": 9.846555163418752e-05,
+      "loss": 73.8958,
+      "step": 200
+    },
+    {
+      "epoch": 0.019180236684120684,
+      "grad_norm": 44.99592971801758,
+      "learning_rate": 9.80819395427344e-05,
+      "loss": 73.1967,
+      "step": 250
+    },
+    {
+      "epoch": 0.023016284020944817,
+      "grad_norm": 47.95292663574219,
+      "learning_rate": 9.769832745128127e-05,
+      "loss": 72.7175,
+      "step": 300
+    },
+    {
+      "epoch": 0.026852331357768953,
+      "grad_norm": 15.567469596862793,
+      "learning_rate": 9.731471535982815e-05,
+      "loss": 72.0448,
+      "step": 350
+    },
+    {
+      "epoch": 0.03068837869459309,
+      "grad_norm": 37.817440032958984,
+      "learning_rate": 9.693110326837502e-05,
+      "loss": 71.9744,
+      "step": 400
+    },
+    {
+      "epoch": 0.03452442603141723,
+      "grad_norm": 32.989627838134766,
+      "learning_rate": 9.65474911769219e-05,
+      "loss": 71.4153,
+      "step": 450
+    },
+    {
+      "epoch": 0.03836047336824137,
+      "grad_norm": 44.315311431884766,
+      "learning_rate": 9.616387908546877e-05,
+      "loss": 71.042,
+      "step": 500
     }
   ],
+  "logging_steps": 50,
+  "max_steps": 13034,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
+  "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
       "attributes": {}
     }
   },
+  "total_flos": 8.568157303923016e+17,
+  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:064b240ea07b11fb2a55256aa70c4f515e16a1e7de5972e80b77b98e19219a68
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b0a8ed667ee8b35f22ca4883f52af3ea1273c54ad954652c4052132affac051
 size 5240