Add fine-tuned model

Files changed (8) hide show

config.json +33 -0
generation_config.json +7 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +250 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "langboat/bloom-389m-zh",
+  "apply_residual_connection_post_layernorm": false,
+  "architectures": [
+    "BloomForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attention_softmax_in_fp32": true,
+  "bias_dropout_fusion": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_dropout": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "masked_softmax_fusion": true,
+  "model_type": "bloom",
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 24,
+  "offset_alibi": 100,
+  "pad_token_id": 3,
+  "pretraining_tp": 1,
+  "seq_length": 2048,
+  "skip_bias_add": true,
+  "skip_bias_add_qkv": false,
+  "slow_but_exact": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.2",
+  "unk_token_id": 0,
+  "use_cache": true,
+  "vocab_size": 42437
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 3,
+  "transformers_version": "4.44.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d266f190fcc32b71f8ebf7a6c8dd49130d0eeafa993998b44dca1cc73767534f
+size 1383109776

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4db2ef60836bd106633bbaa9c20bc6d11ca101088dbd7bb59c3d8045e446aad6
+size 2766396410

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf649bac0f2cfebbcf7fed3fb87fad328f4aea97a5892dffe764f4191f92f88c
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,250 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.032,
+      "grad_norm": 35.32009506225586,
+      "learning_rate": 4.83974358974359e-05,
+      "loss": 3.9438,
+      "step": 10
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 81.18579864501953,
+      "learning_rate": 4.67948717948718e-05,
+      "loss": 3.9537,
+      "step": 20
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 32.88789749145508,
+      "learning_rate": 4.519230769230769e-05,
+      "loss": 4.0702,
+      "step": 30
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 16.556777954101562,
+      "learning_rate": 4.358974358974359e-05,
+      "loss": 3.8532,
+      "step": 40
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 10.234758377075195,
+      "learning_rate": 4.198717948717949e-05,
+      "loss": 3.7683,
+      "step": 50
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1121.8642578125,
+      "learning_rate": 4.038461538461539e-05,
+      "loss": 3.8498,
+      "step": 60
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 422.935546875,
+      "learning_rate": 3.878205128205129e-05,
+      "loss": 4.8448,
+      "step": 70
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 885.9011840820312,
+      "learning_rate": 3.717948717948718e-05,
+      "loss": 5.3873,
+      "step": 80
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 9.224353790283203,
+      "learning_rate": 3.557692307692308e-05,
+      "loss": 4.8793,
+      "step": 90
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 7.631784915924072,
+      "learning_rate": 3.397435897435898e-05,
+      "loss": 3.8897,
+      "step": 100
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 6.077576637268066,
+      "learning_rate": 3.2371794871794876e-05,
+      "loss": 3.7135,
+      "step": 110
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 5.676390647888184,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 3.6988,
+      "step": 120
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 5.851507663726807,
+      "learning_rate": 2.916666666666667e-05,
+      "loss": 3.6701,
+      "step": 130
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 7.7876362800598145,
+      "learning_rate": 2.756410256410257e-05,
+      "loss": 3.668,
+      "step": 140
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 5.508922100067139,
+      "learning_rate": 2.5961538461538464e-05,
+      "loss": 3.6795,
+      "step": 150
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 5.01690673828125,
+      "learning_rate": 2.435897435897436e-05,
+      "loss": 3.5582,
+      "step": 160
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 10.510420799255371,
+      "learning_rate": 2.2756410256410258e-05,
+      "loss": 3.592,
+      "step": 170
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 50.05154800415039,
+      "learning_rate": 2.1153846153846154e-05,
+      "loss": 3.5765,
+      "step": 180
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 266.8519287109375,
+      "learning_rate": 1.9551282051282052e-05,
+      "loss": 3.7085,
+      "step": 190
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 16487.71875,
+      "learning_rate": 1.794871794871795e-05,
+      "loss": 4.2035,
+      "step": 200
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 132328.34375,
+      "learning_rate": 1.6346153846153847e-05,
+      "loss": 4.9833,
+      "step": 210
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 116585.5546875,
+      "learning_rate": 1.4743589743589745e-05,
+      "loss": 5.4398,
+      "step": 220
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 49.177772521972656,
+      "learning_rate": 1.3141025641025642e-05,
+      "loss": 5.3455,
+      "step": 230
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 287.9832458496094,
+      "learning_rate": 1.153846153846154e-05,
+      "loss": 4.1649,
+      "step": 240
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 17.73406982421875,
+      "learning_rate": 9.935897435897435e-06,
+      "loss": 3.8618,
+      "step": 250
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 21.187644958496094,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 3.6718,
+      "step": 260
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 119.2005844116211,
+      "learning_rate": 6.730769230769231e-06,
+      "loss": 3.7159,
+      "step": 270
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 40.40868377685547,
+      "learning_rate": 5.128205128205128e-06,
+      "loss": 3.7839,
+      "step": 280
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 143.7291259765625,
+      "learning_rate": 3.525641025641026e-06,
+      "loss": 3.6802,
+      "step": 290
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 13.765769004821777,
+      "learning_rate": 1.9230769230769234e-06,
+      "loss": 3.7338,
+      "step": 300
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 12.183256149291992,
+      "learning_rate": 3.205128205128205e-07,
+      "loss": 3.7386,
+      "step": 310
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6702608917266432.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30fb119c0440412ce2b9fd4ca82dbc801b9a0efd84d57add746da33f60b5c1e0
+size 5176