diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..9546427ecfa969c490c4b431d743434f82ff0a0a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+trainer_state.json filter=lfs diff=lfs merge=lfs -text
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..42225441620d264fc468e2fc663d4006485e22ce
--- /dev/null
+++ b/config.json
@@ -0,0 +1,65 @@
+{
+ "_name_or_path": "/fs/archive/share/yulan/data/aa_mini/output/miniyulan-2B-final-stage20-remake/checkpoint-194526-rms_norm",
+ "architectures": [
+ "MiniYuLanModelForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "dim_model_base": 1920,
+ "dim_model_base_attn": 64,
+ "dim_model_base_init": null,
+ "dim_model_base_lmh": 1,
+ "dim_model_base_logits": 1920.0,
+ "dim_model_base_lr": 256.0,
+ "down_proj_alpha": 0.03450327796711771,
+ "embed_tokens_alpha": 1,
+ "embedding_ln": false,
+ "embedding_rmsln": false,
+ "eos_token_id": 2,
+ "gate_up_proj_alpha": 0.3651483716701107,
+ "gradient_checkpointing_step": 11,
+ "hidden_act": "silu",
+ "hidden_size": 1920,
+ "hidden_states_shrink": 0.18708286933869706,
+ "init_scale_o": 1,
+ "initializer_range": 5e-05,
+ "input_layernorm_alpha": 1.0,
+ "intermediate_size": 4800,
+ "k_proj_alpha": 0.3651483716701107,
+ "layer_norm_eps": 1e-06,
+ "lm_head_alpha": 1.0,
+ "ln_scale": 1,
+ "max_position_embeddings": 4096,
+ "model_reproduce": "transformer",
+ "model_type": "miniyulan",
+ "norm_alpha": 1.0,
+ "num_attention_heads": 30,
+ "num_epochs_trained_before_this_epoch": 20,
+ "num_hidden_layers": 56,
+ "num_key_value_heads": 6,
+ "num_steps_trained_before_this_epoch": 194526,
+ "o_proj_alpha": 0.03450327796711771,
+ "post_attention_layernorm_alpha": 1.0,
+ "q_proj_alpha": 0.3651483716701107,
+ "qk_layernorm": false,
+ "rms_norm_eps": 1e-06,
+ "rms_type": "llama",
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "scale_emb": 10.0,
+ "shrink_alpha": 1,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.44.0",
+ "use_cache": false,
+ "use_emb_alpha": true,
+ "use_liger": true,
+ "use_norm_alpha": true,
+ "use_sliding_window": false,
+ "v_proj_alpha": 0.3651483716701107,
+ "vocab_size": 99000,
+ "wesar_weights": true,
+ "z_loss": 0.0001
+}
diff --git a/global_step204262/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f3f67058f63e0d33d46b5c561f9a7c82f762c0ca
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09a8f48d541860b1dca434e8806a1f332cdf6b5724ffdbdcf845fabe058d99d8
+size 558554482
diff --git a/global_step204262/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..316d1263ba87c1e7af7aab1766e9a2ba68b393f1
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:389d97104a3500bdf38a6fc11a7282cd9d4724f5806fdf61ec661c29395a88b2
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..be1cb49fc2e6a13204f56cfb069be16477491791
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ee512cf0219e0350014ff2bf8e23d3a1fb8c7e80d6706edd2bb4a9bd6d1547d
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4077131bb7546f5c0f1b42ea28e7fd5c53bb7646
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1173908e32d0aa58d3c25c1037f39125a33a0863ac0ef365be1e13358c78303c
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e7d68b1e914990ecaf16139a3744cc2f7c1f45c5
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d57fd1fe682c315af1c48fb802d16b71f4f8aab3890bff6beec7c8d857e9083
+size 558554434
diff --git a/global_step204262/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..892a839aab93a7159184c69cdabfa57faeace9db
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a91d9f0c7d167544b61609ae6c8faf4cac4e20d09cbaad479c1e635f471a592e
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..df2d6e07c03e334b0a5cfc75b58349523b7296ef
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6340a1306ccc45801ff9d8a18737a2e749cfcf501d8a7a660d22736597de2a17
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4f926c68b77b527a3c4f5836ba900ad629d9cbab
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84da5ed4bf86ec26f616d7d1d001ba32ba5ee23b14ea225c363d1fabe21c8fd7
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..86e584470309051d7958b8ed024ffca5137a8490
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54302e17b523718d0ebd79d60bab2b46f04039cd0d2e4b0ea7cff23203ef66c0
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19cbcc1b54be3d29adf4a70030476ef1c7551d9d
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4eaa15d86759e8f279891e55b99e64cbdfd2216f195d76ebd3f9886945393f2d
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..996fecb11f88b55b8f7acd1eb2632298531121de
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81dde8e12ecec54aa62fe76776d4205dc8e12b23a179c2cb7aaefa159b5705f8
+size 558554434
diff --git a/global_step204262/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..745e083c498d257e00713c729a69c697d926df2d
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac77d24be425dba2350677a16dcc528ef99cafd0f596f42defd1dea14cbbad74
+size 558554418
diff --git a/global_step204262/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b2d027931828a03493b900336c70e9b19ad5d13a
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:535a78424b3a829fa227cf865bda52fd29157597994cf1f5e26f0cff667eac05
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..086f9a1c86a929bae27eba1f0def599e7b36d6df
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6a72d54766917e00e9a16524f23b20d7ca5037d62dbd93a414d198f665f0094
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..24c989d782304ec1ab488378adbfa72def8f5987
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb51778cb0e727757ded3376a9dd8e15c28b03add4b1581e8335035b10d5b119
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8ad123828ec0996af99593fd1b3d62d5d6520e17
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7905f0d16f952e4dad0ac94b8c89e43a05f5f3c56b6604c730ed5c8de69ab067
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f4f5669d28a5120bab425eca60f2b12adbc19468
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:614db2f1d2cf8ab070fc63314cbeeb617389865fa5eacc11d39df8b78584409e
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a0c40e8136b0cd667733e197ddad3d556e896939
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38c0b74e77dd640f94569e4569b2b43ce52f9e25bc11bc12584500787422b18f
+size 558554434
diff --git a/global_step204262/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..41341302b64f24be69311838cb1877e9840c8467
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfca4d12a0fd65dfad9479edbdbaaa3cb22e4f576fe45df9b82d4a113d372eb0
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..08d80ce950c0ffede7ffea8cbbd3a0b8051815ce
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90976ef314e712fa58d08acbd6912c8aaac9d05379a24044aff8805fe893e64a
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1cf7723802769874309438fe9989ebac3b3f4b8b
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1dd7271d5279e9610c6578def2df8ccc972346c20b3969ec8c9e5fc3871ee8b
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c55e77dbe6f264a97f54f0f92c4959e158e84c00
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7352551707b58cbb0e0a1cce721bd610e909bf12a2898cbdf7209f899840c299
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a1f0b19bb3ba9be0fa4ab62e474917767063e8cd
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efbbf851d84c588d08c2e6898bb6f4b884211cbf4672498e4084f1bf22d0bcb6
+size 558554290
diff --git a/global_step204262/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fb66b2aef3d152f958bbb945a6e46da6c607c8d3
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6552ef72b9b856bebd5a4e64886dfd1315c0725b7f22a72e5c32ae6c2b9a575
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a98bb4917dc730d807cecaf10bd8ac9a822e0104
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc8cd5d45fddd808c350247d545b2a01e6900a478b184c4a964c699b68f5b223
+size 558554434
diff --git a/global_step204262/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7f2fba13c0c0500ab455d34ed4438ba3075a8472
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4521aa7d326d4c708087b9dd0b03fe83991075a964d353cd69a9778ba116fbd
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7945265d7e614ee5345d6e964472c0d4e01ca76a
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b39098c71a8a180cece643cb1a8bb47ea9f704cc0a0537083d8e0d27a16cbc60
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bd8f874e2375993bd53e51fad8c4d72eea0d874f
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0833bbdc73a62c5bda6ffb0a117d3b0cfebf37125d3a10ae5f6f325b28aa3a6
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..700cd77379ba3cfc7080e9a25f11148eb8db7718
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66b4996b8bc3b65c08cc589f2ae8dcd634ed5dc88da5d9a81845eb9ece2270ef
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..09734d44438125c0398c36abe832ecfb19ef726c
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:439bec3ec6373d89aeae00e75fa2719035fb99582f21300844943c160d0b6ad3
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2e8f77afda26eda8a6f03a7e5d7c03918f3da9c6
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96f48f163a4f960ba4bb39193c60a8f5ab70d17d5baa04f83e256aacf202653a
+size 558554434
diff --git a/global_step204262/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ae56d70a6914f45be9ee1fc402c2747672ad719a
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fcafa7f85eb5a131fbf9a4c1e92fcaca42a3b21002b6fe2e7d6f5c00732b41b
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f88310e6e43d34d428867d96d226436d15464bd8
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a848a1bbe149eae1382122133807404425ec2bc210f79461fa82ba587e3064a
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f523db59a9ba9fbc46e1d68b76adbb57204f026c
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01c69f57a6ba5dcedb3d1df70115d26baaa415677e8befcf2f71b81c6e3166ea
+size 558554354
diff --git a/global_step204262/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9b88624dc73deb4bf591070ea88f71679e6a6baf
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:344edd93cbf1eb32a745d6050c5550c9d77e92242ea3349633faed17638287e1
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d0c3a776302d99d25762fad24b0bd59efffbabca
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94d1b8c63e4c13963b59c5d3aa0527878a1306e9886d2a17bb6f1a3337c869cb
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dbafb1ee34ccd6407d8a854b36567bc045ccfd77
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a26e7a1383ca1bc4e4fa18ba12b74f9714c228c8704e5fef1397d5ed735f718
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5653c27d24bd0e7edd18dbf5fe788e68fd198348
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db07665ca3d1c25aa6e3951af947003730d0fb494e5aebf7d1bb15045b4bd600
+size 558554434
diff --git a/global_step204262/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..191b71e20e6f48b23466d0bc481bc6a23122b008
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2c8560491189986b851c394d42a36ca46c1e3a9293724ea77cc48de2c606f2c
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dbcd1ca4dd0d4e2cab308ff022653d35926ba459
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31d43d545a3838fe05d19598a251606a4ef5ecc5162c34dd75101268cff374f5
+size 558554370
diff --git a/global_step204262/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d1d582369c468381da13655836613886ea6eb4e4
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db1c0f4b52ca5bf81a832118853231b3143f27f9c96ce8c036d13a220ed74155
+size 558554306
diff --git a/global_step204262/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..92fcb29042700105849b6ab85ce1842590a75a4c
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0867fddff4059facff575a7756da7e3a40f9a98e9d456ec47308a385d99151e
+size 558610626
diff --git a/global_step204262/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b8aa52709ee51d2155a8e8f49c046cd017f7ec5d
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfe94d34de1aff447512d88c9d5ea7b065a33cd758ac348cc0ee113b521712b1
+size 558554290
diff --git a/global_step204262/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b63fa5b179086b56930a624a72713e73316b4197
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0e1953fed26e391b46b11297652da9361653b1845370120e3736f8dbaf7c42
+size 558554290
diff --git a/global_step204262/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6dee3efae356c569b06968911f9971c47e71fe4d
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19905a66ffd58ab87ec638319b43e7ef115ba99de571a9298c8c21d49ed994f7
+size 558554354
diff --git a/global_step204262/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4ee659f68f566d238426b1869eae0d7553dcf12a
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b65162cfb3eab2ea3cd75490d8bc9914e20c10f8d0a158090e7397f47f4a2d9
+size 558554418
diff --git a/global_step204262/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..04c309962450c4eaa150e129502893bb3c67786e
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c563fcab941f5e07e128c34fe2fded15b2165e1c87c67187159e55d19450ec9
+size 558554290
diff --git a/global_step204262/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/global_step204262/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2e27d542402a99a1a55bc869388201d7f96742e0
--- /dev/null
+++ b/global_step204262/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6af6b0922fa72f8739fd2b5781f00cf3d9d0583052745a1a8f8c0a6229599583
+size 558554354
diff --git a/global_step204262/mp_rank_00_model_states.pt b/global_step204262/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6ec4f1aefc4fd7bf7b6947bf13687811d52f4f8e
--- /dev/null
+++ b/global_step204262/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a01f738634ac998585818fed1378c087b7c8175667224627f23d55d4dd61cf30
+size 4468641136
diff --git a/latest b/latest
new file mode 100644
index 0000000000000000000000000000000000000000..9ef3421deb813d72dcae48f49bad5ee8e67ab893
--- /dev/null
+++ b/latest
@@ -0,0 +1 @@
+global_step204262
\ No newline at end of file
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e59481901ee04d4a2ab870b4081d9e3737bc2760
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:499731d44d54c8ffd2846953164a028f09afea7fde49eb391a2d4d1fbd5fa21b
+size 4848661852
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..a782b2f1cdab4d0bacb2dc0f85d02c4b1e31f0bd
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..efc4b7ce3fdffa249828bf0f3fe2256cae304a3f
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,226306 @@
+{
+ "version": "1.0",
+ "truncation": null,
+ "padding": null,
+ "added_tokens": [
+ {
+ "id": 0,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 1,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 2,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 102,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 103,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 104,
+ "content": "<|start_header_id|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 105,
+ "content": "<|end_header_id|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 106,
+ "content": "<|eot_id|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ }
+ ],
+ "normalizer": {
+ "type": "Sequence",
+ "normalizers": [
+ {
+ "type": "Prepend",
+ "prepend": "▁"
+ },
+ {
+ "type": "Replace",
+ "pattern": {
+ "String": " "
+ },
+ "content": "▁"
+ },
+ {
+ "type": "Replace",
+ "pattern": {
+ "String": " "
+ },
+ "content": "▁"
+ }
+ ]
+ },
+ "pre_tokenizer": null,
+ "post_processor": {
+ "type": "TemplateProcessing",
+ "single": [
+ {
+ "SpecialToken": {
+ "id": "",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ }
+ ],
+ "pair": [
+ {
+ "SpecialToken": {
+ "id": "",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "",
+ "type_id": 1
+ }
+ },
+ {
+ "Sequence": {
+ "id": "B",
+ "type_id": 1
+ }
+ }
+ ],
+ "special_tokens": {
+ "": {
+ "id": "",
+ "ids": [
+ 1
+ ],
+ "tokens": [
+ ""
+ ]
+ }
+ }
+ },
+ "decoder": {
+ "type": "Sequence",
+ "decoders": [
+ {
+ "type": "Replace",
+ "pattern": {
+ "String": "▁"
+ },
+ "content": " "
+ },
+ {
+ "type": "ByteFallback"
+ },
+ {
+ "type": "Fuse"
+ },
+ {
+ "type": "Strip",
+ "content": " ",
+ "start": 1,
+ "stop": 0
+ }
+ ]
+ },
+ "model": {
+ "type": "BPE",
+ "dropout": null,
+ "unk_token": "",
+ "continuing_subword_prefix": null,
+ "end_of_word_suffix": null,
+ "fuse_unk": true,
+ "byte_fallback": true,
+ "ignore_merges": false,
+ "vocab": {
+ "": 0,
+ "": 1,
+ "": 2,
+ "": 3,
+ "": 4,
+ "\n": 5,
+ "\t": 6,
+ "
": 7,
+ "
": 8,
+ "": 9,
+ "": 10,
+ "": 11,
+ "
": 12,
+ "": 13,
+ " | | ": 14,
+ "": 15,
+ "": 16,
+ "": 17,
+ "": 18,
+ "": 21,
+ "": 22,
+ "
": 23,
+ "": 24,
+ "": 25,
+ "": 26,
+ "": 27,
+ "": 28,
+ "": 29,
+ "": 30,
+ "": 31,
+ "": 32,
+ "
": 33,
+ "
": 34,
+ "
": 35,
+ "": 36,
+ "": 37,
+ "": 38,
+ "
": 39,
+ "": 40,
+ "": 41,
+ "
": 42,
+ "": 43,
+ "
": 44,
+ "
": 45,
+ "": 46,
+ "": 47,
+ "
": 48,
+ "": 49,
+ "": 50,
+ "": 51,
+ "0": 52,
+ "1": 53,
+ "2": 54,
+ "3": 55,
+ "4": 56,
+ "5": 57,
+ "6": 58,
+ "7": 59,
+ "8": 60,
+ "9": 61,
+ "+": 62,
+ "-": 63,
+ "=": 64,
+ ",": 65,
+ "。": 66,
+ "!": 67,
+ "?": 68,
+ "、": 69,
+ ":": 70,
+ "¥": 71,
+ ".": 72,
+ "!": 73,
+ "?": 74,
+ "...": 75,
+ "。。。": 76,
+ "。。。。。。": 77,
+ "《": 78,
+ "》": 79,
+ "【": 80,
+ "】": 81,
+ "『": 82,
+ "』": 83,
+ "```": 84,
+ "": 86,
+ "---": 87,
+ "": 88,
+ ";": 89,
+ ".": 90,
+ "=": 91,
+ "<": 92,
+ ">": 93,
+ "-": 94,
+ "+": 95,
+ "%": 96,
+ "‼": 97,
+ "㊣": 98,
+ "/": 99,
+ "|": 100,
+ "": 101,
+ "": 102,
+ "": 103,
+ "<|start_header_id|>": 104,
+ "<|end_header_id|>": 105,
+ "<|eot_id|>": 106,
+ "": 107,
+ "": 108,
+ "": 109,
+ "": 110,
+ "": 111,
+ "": 112,
+ "": 113,
+ "": 114,
+ "": 115,
+ "": 116,
+ "": 117,
+ "": 118,
+ "": 119,
+ "": 120,
+ "": 121,
+ "": 122,
+ "": 123,
+ "": 124,
+ "": 125,
+ "": 126,
+ "": 127,
+ "": 128,
+ "": 129,
+ "": 130,
+ "": 131,
+ "": 132,
+ "": 133,
+ "": 134,
+ "": 135,
+ "": 136,
+ "": 137,
+ "": 138,
+ "": 139,
+ "": 140,
+ "": 141,
+ "": 142,
+ "": 143,
+ "": 144,
+ "": 145,
+ "": 146,
+ "": 147,
+ "": 148,
+ "": 149,
+ "": 150,
+ "": 151,
+ "": 152,
+ "": 153,
+ "": 154,
+ "": 155,
+ "": 156,
+ "": 157,
+ "": 158,
+ "": 159,
+ "": 160,
+ "": 161,
+ "": 162,
+ "": 163,
+ "": 164,
+ "": 165,
+ "": 166,
+ "": 167,
+ "": 168,
+ "": 169,
+ "": 170,
+ "": 171,
+ "": 172,
+ "": 173,
+ "": 174,
+ "": 175,
+ "": 176,
+ "": 177,
+ "": 178,
+ "": 179,
+ "": 180,
+ "": 181,
+ "": 182,
+ "": 183,
+ "": 184,
+ "": 185,
+ "": 186,
+ "": 187,
+ "": 188,
+ "": 189,
+ "": 190,
+ "": 191,
+ "": 192,
+ "": 193,
+ "": 194,
+ "": 195,
+ "": 196,
+ "": 197,
+ "": 198,
+ "": 199,
+ "": 200,
+ "": 201,
+ "": 202,
+ "": 203,
+ "": 204,
+ "": 205,
+ "": 206,
+ "": 207,
+ "": 208,
+ "": 209,
+ "": 210,
+ "": 211,
+ "": 212,
+ "": 213,
+ "": 214,
+ "": 215,
+ "": 216,
+ "": 217,
+ "": 218,
+ "": 219,
+ "": 220,
+ "": 221,
+ "": 222,
+ "": 223,
+ "": 224,
+ "": 225,
+ "": 226,
+ "": 227,
+ "": 228,
+ "": 229,
+ "": 230,
+ "": 231,
+ "": 232,
+ "": 233,
+ "": 234,
+ "": 235,
+ "": 236,
+ "": 237,
+ "": 238,
+ "": 239,
+ "": 240,
+ "": 241,
+ "": 242,
+ "": 243,
+ "": 244,
+ "": 245,
+ "": 246,
+ "": 247,
+ "": 248,
+ "": 249,
+ "": 250,
+ "": 251,
+ "": 252,
+ "": 253,
+ "": 254,
+ "": 255,
+ "": 256,
+ "": 257,
+ "": 258,
+ "": 259,
+ "": 260,
+ "": 261,
+ "": 262,
+ "": 263,
+ "": 264,
+ "": 265,
+ "": 266,
+ "": 267,
+ "": 268,
+ "": 269,
+ "": 270,
+ "": 271,
+ "": 272,
+ "": 273,
+ "": 274,
+ "": 275,
+ "": 276,
+ "": 277,
+ "": 278,
+ "": 279,
+ "": 280,
+ "": 281,
+ "": 282,
+ "": 283,
+ "": 284,
+ "": 285,
+ "": 286,
+ "": 287,
+ "": 288,
+ "": 289,
+ "": 290,
+ "": 291,
+ "": 292,
+ "": 293,
+ "": 294,
+ "": 295,
+ "": 296,
+ "": 297,
+ "": 298,
+ "": 299,
+ "": 300,
+ "": 301,
+ "": 302,
+ "": 303,
+ "": 304,
+ "": 305,
+ "": 306,
+ "": 307,
+ "": 308,
+ "": 309,
+ "": 310,
+ "": 311,
+ "": 312,
+ "": 313,
+ "": 314,
+ "": 315,
+ "": 316,
+ "": 317,
+ "": 318,
+ "": 319,
+ "": 320,
+ "": 321,
+ "": 322,
+ "": 323,
+ "": 324,
+ "": 325,
+ "": 326,
+ "": 327,
+ "": 328,
+ "": 329,
+ "": 330,
+ "": 331,
+ "": 332,
+ "": 333,
+ "": 334,
+ "": 335,
+ "": 336,
+ "": 337,
+ "": 338,
+ "": 339,
+ "": 340,
+ "": 341,
+ "": 342,
+ "": 343,
+ "": 344,
+ "": 345,
+ "": 346,
+ "": 347,
+ "": 348,
+ "": 349,
+ "": 350,
+ "": 351,
+ "": 352,
+ "": 353,
+ "": 354,
+ "": 355,
+ "": 356,
+ "