diff --git a/distill-qwen-7b_skywork/global_step_100/.DS_Store b/distill-qwen-7b_skywork/global_step_100/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_100/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_100/config.json b/distill-qwen-7b_skywork/global_step_100/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_100/generation_config.json b/distill-qwen-7b_skywork/global_step_100/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_100/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00001-of-00004.safetensors
deleted file mode 100644
index 215a01ad2f49a36bcf4c85648771ecb726af0d5e..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9dc6f289f92f0cf5876ce95f24d859949319a7911d598ad87d37b1402d4a834c
-size 4962027176
diff --git a/distill-qwen-7b_skywork/global_step_100/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00002-of-00004.safetensors
deleted file mode 100644
index 1172736396cca2d7e2d6224c8304d1c4e89f1a34..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:126f733aed57974fe690e965c481471ae138ea13cb789293d05472159ca85294
-size 4984129904
diff --git a/distill-qwen-7b_skywork/global_step_100/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00003-of-00004.safetensors
deleted file mode 100644
index 40bd630c6a43ed55d96565a48c3b2ad94e54d9cb..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:74a953b91caacafdd5c100d42dd8c230816a00a3aa84fac7a68893c0c12dbd8b
-size 4929112864
diff --git a/distill-qwen-7b_skywork/global_step_100/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00004-of-00004.safetensors
deleted file mode 100644
index 57ea8c08f126a5050a1221c7fabe470a4161c477..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7b9f92c6ca72e5aa27596e8cd575a7d4ae7e0f14fe6eaf89883082f545c268fa
-size 356001928
diff --git a/distill-qwen-7b_skywork/global_step_100/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_100/model.safetensors.index.json
deleted file mode 100644
index af3b6bcb876abad01cd17b8994977cfa314efb11..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00003-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_100/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_100/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_100/tokenizer.json b/distill-qwen-7b_skywork/global_step_100/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_100/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_100/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_100/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_1000/.DS_Store b/distill-qwen-7b_skywork/global_step_1000/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_1000/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_1000/config.json b/distill-qwen-7b_skywork/global_step_1000/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_1000/generation_config.json b/distill-qwen-7b_skywork/global_step_1000/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00001-of-00004.safetensors
deleted file mode 100644
index 65967d7e6d5aa7dc4451633b6d7de91b43ca4a04..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:971ae73173402bb48f92ef52c3668360b5c7523bac0afbe08ca06fcb779a6a59
-size 4184094576
diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00002-of-00004.safetensors
deleted file mode 100644
index 166f510a6f3b3e56adc8b2805515ef7e908d8827..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3ea1ffc97c0e366ebb6ae97e7bddcda1db54833d4324d488dc8e9e0cd2bd3236
-size 4991452152
diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00003-of-00004.safetensors
deleted file mode 100644
index a84618b2845c462cc6c9abe9934a84b86659429e..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe05f227cb9e540765dd51c1d413a7e7ac414a7362fa79200fd29189b8307250
-size 4991370656
diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00004-of-00004.safetensors
deleted file mode 100644
index 6f38144be6e5ee35db9330a8c470584acb0b5c22..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a6c6087c61af0964c573377081d9fc17514427bd60c6dc6ff6950550cef60c5c
-size 1064354488
diff --git a/distill-qwen-7b_skywork/global_step_1000/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1000/model.safetensors.index.json
deleted file mode 100644
index 96369923233b9a962ab1eb704b7460e040396214..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.norm.weight": "model-00003-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1000/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1000/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1000/tokenizer.json b/distill-qwen-7b_skywork/global_step_1000/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_1000/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1000/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1000/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_1100/.DS_Store b/distill-qwen-7b_skywork/global_step_1100/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_1100/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_1100/config.json b/distill-qwen-7b_skywork/global_step_1100/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_1100/generation_config.json b/distill-qwen-7b_skywork/global_step_1100/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00001-of-00004.safetensors
deleted file mode 100644
index 818d31f9d39c8a6b335ea56712cd3fe01bac01b3..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fc8e945b1a813496d7c7898fdd6830ea08acf55d77f9664609740a995232985c
-size 4283087968
diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00002-of-00004.safetensors
deleted file mode 100644
index ea06780fc72522e200ee88d875a46598bf0f5188..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:09dcad711b125f9919aba8a6a381b105c7718baa92d2b6933e5105ce485052d5
-size 4954685520
diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00003-of-00004.safetensors
deleted file mode 100644
index cc2ddb85a26978b5b09a826e171f8ad1e70aa948..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:448e2a6eaf2bc8f0e6f7ccac6af29f6c7478798738d19db18b7259ca1ab1c2d3
-size 4914443120
diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00004-of-00004.safetensors
deleted file mode 100644
index fee21e8c2e1d8ac1f4078c760db8d0df1b06dcbe..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2fe962b869f59148380ce3c203fdcf18a51819efa5bc6b44f4704fc26f423d1c
-size 1079055224
diff --git a/distill-qwen-7b_skywork/global_step_1100/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1100/model.safetensors.index.json
deleted file mode 100644
index c180e5258fb9dc2b9864b49c9fab2eb29f7dac61..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.norm.weight": "model-00002-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1100/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1100/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1100/tokenizer.json b/distill-qwen-7b_skywork/global_step_1100/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_1100/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1100/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1100/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_1200/.DS_Store b/distill-qwen-7b_skywork/global_step_1200/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_1200/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_1200/config.json b/distill-qwen-7b_skywork/global_step_1200/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_1200/generation_config.json b/distill-qwen-7b_skywork/global_step_1200/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00001-of-00004.safetensors
deleted file mode 100644
index c0a9fab1f448fdb8b133e500fa9b5796977939bc..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1e7a89a83e519f329468f89ccd082fb4e63858cbfc4455cab98d7782752b06fb
-size 4998801712
diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00002-of-00004.safetensors
deleted file mode 100644
index 5ad495c464fead72065927ea0e21d90104d19ef5..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:934f2356e826d0a9990b2f2defdc572b67f17cf236977a635396fac5b2178cb6
-size 4995129568
diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00003-of-00004.safetensors
deleted file mode 100644
index e7b9af08ddc567967fdee708029334ad232b6aac..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c786059b2f8659a0959012886ef44f5375c1bbc70f37ef47225bcddd74edfb13
-size 4936361296
diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00004-of-00004.safetensors
deleted file mode 100644
index ac9e8f1fe11798cc6b678688a8c226c72fa37d06..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3647c6edaff02019bb1e6068924653a7e8d9bf55189e429f0100c2ba308dbfb7
-size 300979328
diff --git a/distill-qwen-7b_skywork/global_step_1200/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1200/model.safetensors.index.json
deleted file mode 100644
index 217610f418679677d301bfce864cfc0c25f40dcf..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00001-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.norm.weight": "model-00002-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1200/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1200/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1200/tokenizer.json b/distill-qwen-7b_skywork/global_step_1200/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_1200/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1200/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1200/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_1300/.DS_Store b/distill-qwen-7b_skywork/global_step_1300/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_1300/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_1300/config.json b/distill-qwen-7b_skywork/global_step_1300/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_1300/generation_config.json b/distill-qwen-7b_skywork/global_step_1300/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00001-of-00004.safetensors
deleted file mode 100644
index a805947a1dd2d06d3c9d767e2115772d633aa387..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:47f10fe9bae60b03dfd7885cce6923331f95332375d9acce18ca1a27055c3506
-size 4951095456
diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00002-of-00004.safetensors
deleted file mode 100644
index 5b394bcede04b46b288298d6679ba7191c9b5e33..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a6afd8e8077831a61084f68c88e58a80a393990d174a1e58c464fcd7697faa22
-size 4995138368
diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00003-of-00004.safetensors
deleted file mode 100644
index a6d4fa7a5eda1a0234fc144a6bffd3127a1645d7..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7d99a8eb68d2b5aded9c4107febb970f6785de109844c2d2d6d90e2226c6f3d2
-size 4984092808
diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00004-of-00004.safetensors
deleted file mode 100644
index 9fb5f5e0477af377edf403ee186fd8b610c6a8fb..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:24090e6b8b8dc6b5f21464363354bb781b7b2e91b1448051d36be77b218a9602
-size 300945168
diff --git a/distill-qwen-7b_skywork/global_step_1300/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1300/model.safetensors.index.json
deleted file mode 100644
index 937e1036823ae591dd95c10da2d76fa263a9a67b..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00001-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1300/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1300/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1300/tokenizer.json b/distill-qwen-7b_skywork/global_step_1300/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_1300/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1300/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1300/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_1400/.DS_Store b/distill-qwen-7b_skywork/global_step_1400/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_1400/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_1400/config.json b/distill-qwen-7b_skywork/global_step_1400/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_1400/generation_config.json b/distill-qwen-7b_skywork/global_step_1400/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00001-of-00004.safetensors
deleted file mode 100644
index a1e519a23f6cbbd7d5c3d6d94630e26498729dbe..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:82b09e3d2b014653c1c248738bd3297562b299f7cd97d6e8ed74b5519c140fa7
-size 4947397128
diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00002-of-00004.safetensors
deleted file mode 100644
index 9ab3b84354fa77310c414ff2692ce0f5ffccbfef..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a360b4fd107a1a52b071bc547eceb1773658540a579467118a152d31a4dc72bb
-size 4187708088
diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00003-of-00004.safetensors
deleted file mode 100644
index 7a6132ff74f52f513f6803ecf4b1ffd570eadc43..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0c912cb1d46f15111c2bc252d88705f2a639b2855b189dc8363967d530983f9d
-size 4929029456
diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00004-of-00004.safetensors
deleted file mode 100644
index 96cc35943adbd885365c502ba0fcfc58363dfdfb..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fb505fd01d5a806cc61cd0b5b81fae160aaf59295a8af902ac9a580f1eb31015
-size 1167137192
diff --git a/distill-qwen-7b_skywork/global_step_1400/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1400/model.safetensors.index.json
deleted file mode 100644
index b9deaf3e9af95ff95cd9b577564dd2b05edcfef3..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00004-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1400/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1400/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1400/tokenizer.json b/distill-qwen-7b_skywork/global_step_1400/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_1400/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1400/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1400/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_1500/.DS_Store b/distill-qwen-7b_skywork/global_step_1500/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_1500/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_1500/config.json b/distill-qwen-7b_skywork/global_step_1500/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_1500/generation_config.json b/distill-qwen-7b_skywork/global_step_1500/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00001-of-00004.safetensors
deleted file mode 100644
index 7c79a71cbd7b6fec1bc2b12bb349e26fe99568b4..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:336ac46e60479c19658f3c0065a39ef10a358111bf439f2ecdc15e51ca02b51c
-size 4980433808
diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00002-of-00004.safetensors
deleted file mode 100644
index 1e049570ef4f9215630c4f867637a8de07d4e292..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0a5e0e3cb71b4694925705f2bc9cbe7f683ec770ac84ee82cc7e992526532533
-size 4914411904
diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00003-of-00004.safetensors
deleted file mode 100644
index 22998e5c2e5dc1f7dea3e5fc989706827ebe5e13..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1594bcffe7bc376316c8dd4cc2f04efaacdaef6f0576de70b4f7a59d50d80731
-size 4895993248
diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00004-of-00004.safetensors
deleted file mode 100644
index 322360a1378a959a3402cddc77996bd8eedf70b8..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:581112b9c0d111481a5c0bf06227b237329754eec6113e21af5e8ca4ca16332d
-size 440432904
diff --git a/distill-qwen-7b_skywork/global_step_1500/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1500/model.safetensors.index.json
deleted file mode 100644
index af716bc34b88ddfa57a57f77cdf79bff42101966..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00001-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1500/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1500/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_1500/tokenizer.json b/distill-qwen-7b_skywork/global_step_1500/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_1500/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1500/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_1500/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_200/.DS_Store b/distill-qwen-7b_skywork/global_step_200/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_200/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_200/config.json b/distill-qwen-7b_skywork/global_step_200/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_200/generation_config.json b/distill-qwen-7b_skywork/global_step_200/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_200/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00001-of-00004.safetensors
deleted file mode 100644
index e40cb6e9501942411089b78c861f9a4fd5ab9ca3..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:81d8b4fca9b10c19cb360a15e14717176fa0498b493a37270fa872fc7b23ab8c
-size 4947504304
diff --git a/distill-qwen-7b_skywork/global_step_200/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00002-of-00004.safetensors
deleted file mode 100644
index c321265ea94e0a7716f7d901317de701108c2782..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:64a974dbcb5aafc8caaf3c98a7465d656fb7ed8a599dc6881d6c332b23367c7f
-size 4084900936
diff --git a/distill-qwen-7b_skywork/global_step_200/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00003-of-00004.safetensors
deleted file mode 100644
index 5b257e730a334782f24268e8b6c410b303f08c42..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5d5409a381b498f85d5d915b4c9b1c66303e91fed1270b4d0bcdc53656ae312a
-size 4921678760
diff --git a/distill-qwen-7b_skywork/global_step_200/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00004-of-00004.safetensors
deleted file mode 100644
index 952197cf418a52e7fb41d0154cddb618b80387de..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bebf9030b36c5b7b3f963b91ee74dad7f822b6659ef248b072655bdce895d8eb
-size 1277187904
diff --git a/distill-qwen-7b_skywork/global_step_200/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_200/model.safetensors.index.json
deleted file mode 100644
index 9b382620d0653556962aee51816e93fbc0e33f17..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00004-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_200/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_200/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_200/tokenizer.json b/distill-qwen-7b_skywork/global_step_200/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_200/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_200/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_200/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_300/.DS_Store b/distill-qwen-7b_skywork/global_step_300/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_300/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_300/config.json b/distill-qwen-7b_skywork/global_step_300/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_300/generation_config.json b/distill-qwen-7b_skywork/global_step_300/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_300/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00001-of-00004.safetensors
deleted file mode 100644
index dfd7cdb35bcdd56d31860772acace106b3cac04b..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:331f37ea0a304e50b4ad6e912e148ee273290d57b14b6b0d0ff6767dfcbe9927
-size 4954832976
diff --git a/distill-qwen-7b_skywork/global_step_300/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00002-of-00004.safetensors
deleted file mode 100644
index 3c443e97fd5857702382da88a085e107d3a64621..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3849012076a7cc74415e145196c26cfad3bc9445fcb173cfd04f88f478cac752
-size 4106921944
diff --git a/distill-qwen-7b_skywork/global_step_300/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00003-of-00004.safetensors
deleted file mode 100644
index 4a1eacd533eb78f6e7564bf7baf4d05ffc792574..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4e4a912a681a1ddff32ae4e0e01e05811f22fd464d22b39008312adea3ca12b0
-size 4866528464
diff --git a/distill-qwen-7b_skywork/global_step_300/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00004-of-00004.safetensors
deleted file mode 100644
index 3b0930a8898fc66d7f6c487a48f04efe0915a5a5..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:744adb0189d5af47dae565fdefbe126034627d072c04f8f3df74f7d79b626822
-size 1302988440
diff --git a/distill-qwen-7b_skywork/global_step_300/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_300/model.safetensors.index.json
deleted file mode 100644
index a4529830414761188d82c035f2b5b7cd2668c6c7..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_300/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_300/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_300/tokenizer.json b/distill-qwen-7b_skywork/global_step_300/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_300/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_300/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_300/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_400/.DS_Store b/distill-qwen-7b_skywork/global_step_400/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_400/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_400/config.json b/distill-qwen-7b_skywork/global_step_400/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_400/generation_config.json b/distill-qwen-7b_skywork/global_step_400/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_400/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00001-of-00004.safetensors
deleted file mode 100644
index 279b0c4a58835ec992d4165f8db0f6b2467a0e6c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:814ef6a41e9e6d83ca1d2eeb768d0b3c986532f997fad886dc50998f09c5dd91
-size 4940067136
diff --git a/distill-qwen-7b_skywork/global_step_400/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00002-of-00004.safetensors
deleted file mode 100644
index a4e90068298bc30e4fc480e1b9df3bd9fac7a5a9..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:845c9aba0c3356c131165f66eb3edbd87427c7a3ade0045623e19a0a4d24f38a
-size 4914396672
diff --git a/distill-qwen-7b_skywork/global_step_400/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00003-of-00004.safetensors
deleted file mode 100644
index 246b18594a282ac80ae156342e023b776efc306e..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e648078312503667c04931bbf71fb9a2d20ab1559f6e70348d5c5326c223a137
-size 4943731160
diff --git a/distill-qwen-7b_skywork/global_step_400/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00004-of-00004.safetensors
deleted file mode 100644
index dc81787c288f2b1e74b787506126743c4cb2ed7b..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ae37ccb1b23e903e439a283912de61ff1d936fbe61e14b8dd25022a8911460eb
-size 433076936
diff --git a/distill-qwen-7b_skywork/global_step_400/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_400/model.safetensors.index.json
deleted file mode 100644
index 5a9bdebb2018665328da9c2817fb5161fc52ddd8..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00002-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_400/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_400/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_400/tokenizer.json b/distill-qwen-7b_skywork/global_step_400/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_400/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_400/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_400/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_500/.DS_Store b/distill-qwen-7b_skywork/global_step_500/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_500/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_500/config.json b/distill-qwen-7b_skywork/global_step_500/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_500/generation_config.json b/distill-qwen-7b_skywork/global_step_500/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_500/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00001-of-00004.safetensors
deleted file mode 100644
index ce23684f810c68b9ee6c9bfaea3aeb305cb01501..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2ac1371059d49b908334d99cc3b7484f782166af7a548de708a8736321274d4
-size 4268426296
diff --git a/distill-qwen-7b_skywork/global_step_500/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00002-of-00004.safetensors
deleted file mode 100644
index ccc1261432f98958570f98a53d5a096738059f56..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:debd52b643de552189902ccb697b874f7015ecabb6de7b26126e3d1a2a7ff702
-size 4899713232
diff --git a/distill-qwen-7b_skywork/global_step_500/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00003-of-00004.safetensors
deleted file mode 100644
index c0dfd3d1cd459c0ce13b72f54572e3640e9869c7..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5d43b3ead2373211dbc9585ad968b5f96cc50b0a2ea4762d03ceebad87d337ca
-size 4987750648
diff --git a/distill-qwen-7b_skywork/global_step_500/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00004-of-00004.safetensors
deleted file mode 100644
index b829c8bdf599be939028205c17a679f250c83b35..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:18cc612bfdb9f285acff2b2904416cf38376bb6f784bba9fff61814f8460bf39
-size 1075381664
diff --git a/distill-qwen-7b_skywork/global_step_500/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_500/model.safetensors.index.json
deleted file mode 100644
index bfa2b2b867acae3b6be5eb21dd9f5e66e7114318..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00004-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_500/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_500/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_500/tokenizer.json b/distill-qwen-7b_skywork/global_step_500/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_500/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_500/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_500/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_600/.DS_Store b/distill-qwen-7b_skywork/global_step_600/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_600/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_600/config.json b/distill-qwen-7b_skywork/global_step_600/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_600/generation_config.json b/distill-qwen-7b_skywork/global_step_600/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_600/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00001-of-00004.safetensors
deleted file mode 100644
index 348dd83a8b672f35e0afae903f6aa073900409a2..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bb7503bc994b9fedaf63eb2ba3deb62e1640a430dea57a5948e713766b0e5510
-size 4910668944
diff --git a/distill-qwen-7b_skywork/global_step_600/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00002-of-00004.safetensors
deleted file mode 100644
index c4a0f43e1e0adb94e48c056b7ce0d72abacf2ec8..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a90641bedd3d3964e03128c17b8b0fc800129f83420338220211361a8a2ae8c3
-size 4995099544
diff --git a/distill-qwen-7b_skywork/global_step_600/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00003-of-00004.safetensors
deleted file mode 100644
index c74686b8ff839e0ea5852a6c579ccdc2bd8f7c17..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c1c01e503bdcef9f310997b5f4f3926db1d7ac32759719a6c37803c61241e22
-size 4918114408
diff --git a/distill-qwen-7b_skywork/global_step_600/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00004-of-00004.safetensors
deleted file mode 100644
index 6b63a73269b18fb5c375468ed20afdf461229305..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8c2d7f5d8449c0cfdd76ef1f900a7075f0dd0c48e8a7de0cd443f060bc707535
-size 407388968
diff --git a/distill-qwen-7b_skywork/global_step_600/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_600/model.safetensors.index.json
deleted file mode 100644
index 244d7555f61409c89800b4747af8e1dfbd78c2d0..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00001-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00002-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_600/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_600/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_600/tokenizer.json b/distill-qwen-7b_skywork/global_step_600/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_600/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_600/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_600/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_700/.DS_Store b/distill-qwen-7b_skywork/global_step_700/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_700/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_700/config.json b/distill-qwen-7b_skywork/global_step_700/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_700/generation_config.json b/distill-qwen-7b_skywork/global_step_700/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_700/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00001-of-00004.safetensors
deleted file mode 100644
index bc2ae04815f1473419dc77655add3f5010be53c9..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c6aa4dc81ae9211b7c5776bd1663ae0e18a240a00779fb9c72bad3031fa6df9c
-size 4910738848
diff --git a/distill-qwen-7b_skywork/global_step_700/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00002-of-00004.safetensors
deleted file mode 100644
index 04f1018050b54840e7f96a26fff0b128236fe601..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ccf91c0d59345fa2ce5dde9c1ef74e355af33453d01c8b0f351acccd7eaecd66
-size 4984125784
diff --git a/distill-qwen-7b_skywork/global_step_700/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00003-of-00004.safetensors
deleted file mode 100644
index 064d2c911accb11a096b0c434830b80d78c7fd95..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b2c4fb6a567b2a1d18783f07a9da1253e620d2fa804f69379987c069b4205f3c
-size 4877637832
diff --git a/distill-qwen-7b_skywork/global_step_700/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00004-of-00004.safetensors
deleted file mode 100644
index a2a8318026b20080cce642090ae6f0125c764a7f..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:08df4f7e2e69350a262998c16d09fa8e11133e8db956c39a7868739601d9ccf6
-size 458769424
diff --git a/distill-qwen-7b_skywork/global_step_700/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_700/model.safetensors.index.json
deleted file mode 100644
index b37ccd2ca5e4a0cde013e1ee19cdbf011a1860e4..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00002-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_700/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_700/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_700/tokenizer.json b/distill-qwen-7b_skywork/global_step_700/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_700/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_700/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_700/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_800/.DS_Store b/distill-qwen-7b_skywork/global_step_800/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_800/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_800/config.json b/distill-qwen-7b_skywork/global_step_800/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_800/generation_config.json b/distill-qwen-7b_skywork/global_step_800/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_800/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00001-of-00004.safetensors
deleted file mode 100644
index fa7865fb14d0ab46999482b50ae2cad74cd97cff..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b483847e4bf48d689a1a3c245723c271e1facc0e90817c9818e9be4e59705524
-size 4991593472
diff --git a/distill-qwen-7b_skywork/global_step_800/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00002-of-00004.safetensors
deleted file mode 100644
index 9372ed927aa6cb0886b21a23a78a7cd9f95191b7..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fa655b6230697c8b94e8149bef3475bc0c94d89516006b4489a39bf15b01e5df
-size 4936351000
diff --git a/distill-qwen-7b_skywork/global_step_800/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00003-of-00004.safetensors
deleted file mode 100644
index 438d479a1867d019eeab91c29b4e6b256041c62b..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:940533dfb5627308fa09465e1910b4ad483448a6a8e040d0bf0ec493c0388366
-size 4980358232
diff --git a/distill-qwen-7b_skywork/global_step_800/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00004-of-00004.safetensors
deleted file mode 100644
index 95ce0a292b1e702102475d1f56dc52d7c3065f00..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:872676364d1e3e15993c94e7f7a83ad990e0b6b6f82170340fc70696fa1b2302
-size 322969184
diff --git a/distill-qwen-7b_skywork/global_step_800/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_800/model.safetensors.index.json
deleted file mode 100644
index 29103736d43f322d647cc4ea4b21ffd18c819461..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00003-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_800/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_800/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_800/tokenizer.json b/distill-qwen-7b_skywork/global_step_800/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_800/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_800/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_800/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}
diff --git a/distill-qwen-7b_skywork/global_step_900/.DS_Store b/distill-qwen-7b_skywork/global_step_900/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/distill-qwen-7b_skywork/global_step_900/.DS_Store and /dev/null differ
diff --git a/distill-qwen-7b_skywork/global_step_900/config.json b/distill-qwen-7b_skywork/global_step_900/config.json
deleted file mode 100644
index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "architectures": [
- "Qwen2ForCausalLM"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151646,
- "eos_token_id": 151643,
- "hidden_act": "silu",
- "hidden_size": 3584,
- "initializer_range": 0.02,
- "intermediate_size": 18944,
- "max_position_embeddings": 131072,
- "max_window_layers": 28,
- "model_type": "qwen2",
- "num_attention_heads": 28,
- "num_hidden_layers": 28,
- "num_key_value_heads": 4,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 10000,
- "sliding_window": 4096,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.51.3",
- "use_cache": true,
- "use_mrope": false,
- "use_sliding_window": false,
- "vocab_size": 152064
-}
diff --git a/distill-qwen-7b_skywork/global_step_900/generation_config.json b/distill-qwen-7b_skywork/global_step_900/generation_config.json
deleted file mode 100644
index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/generation_config.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
- "_from_model_config": true,
- "bos_token_id": 151646,
- "do_sample": true,
- "eos_token_id": 151643,
- "temperature": 0.6,
- "top_p": 0.95,
- "transformers_version": "4.51.3"
-}
diff --git a/distill-qwen-7b_skywork/global_step_900/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00001-of-00004.safetensors
deleted file mode 100644
index 18e4bd57b47d2fdc2173e908bf31cafbb7cfe86b..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/model-00001-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c7bcbe9e7d4510eb46128efac85da5d09dbbb8a4bf4b4c079305ffba428644dc
-size 4958409344
diff --git a/distill-qwen-7b_skywork/global_step_900/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00002-of-00004.safetensors
deleted file mode 100644
index fe2c913b9f22e83d50bc9affcdff4d5dc6af01da..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/model-00002-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1da9668b8a4bc1431cabb09655d953a24a7a93d4359768ee91cdfe8792a19d92
-size 4907092648
diff --git a/distill-qwen-7b_skywork/global_step_900/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00003-of-00004.safetensors
deleted file mode 100644
index b741932bc2d459b00eb31ec3e4ec67b09456b96d..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/model-00003-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:23d90fdb1a37df14cdf1ba2f54517e887b7898b025a9f3e35ccecb8a1415c47c
-size 4004163312
diff --git a/distill-qwen-7b_skywork/global_step_900/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00004-of-00004.safetensors
deleted file mode 100644
index bdb1920ddbad3561a71c77d6855a6d7af27942ac..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/model-00004-of-00004.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e4c4fbac8af0b12b2426f01a6a0d67aad1ee609507b22ce5460309622434c595
-size 1361606560
diff --git a/distill-qwen-7b_skywork/global_step_900/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_900/model.safetensors.index.json
deleted file mode 100644
index da93e31fddf4fd18384f45d8c1b640084d1b4617..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/model.safetensors.index.json
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "metadata": {
- "total_size": 15231233024
- },
- "weight_map": {
- "lm_head.weight": "model-00001-of-00004.safetensors",
- "model.embed_tokens.weight": "model-00004-of-00004.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00004-of-00004.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
- "model.norm.weight": "model-00001-of-00004.safetensors"
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_900/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_900/special_tokens_map.json
deleted file mode 100644
index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/special_tokens_map.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "bos_token": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/distill-qwen-7b_skywork/global_step_900/tokenizer.json b/distill-qwen-7b_skywork/global_step_900/tokenizer.json
deleted file mode 100644
index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
-size 11422778
diff --git a/distill-qwen-7b_skywork/global_step_900/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_900/tokenizer_config.json
deleted file mode 100644
index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000
--- a/distill-qwen-7b_skywork/global_step_900/tokenizer_config.json
+++ /dev/null
@@ -1,195 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "add_prefix_space": null,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|end▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|User|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151645": {
- "content": "<|Assistant|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151646": {
- "content": "<|begin▁of▁sentence|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|EOT|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151648": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151649": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|end▁of▁sentence|>",
- "extra_special_tokens": {},
- "legacy": true,
- "model_max_length": 16384,
- "pad_token": "<|end▁of▁sentence|>",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizerFast",
- "unk_token": null,
- "use_default_system_prompt": false
-}